Requirement:
Collect the log files of an nginx server into HDFS, then analyze them with a MapReduce job.
In my setup the log file lives at /var/log/nginx/access.log.
Refreshing a page served by nginx simulates user visits and appends new entries to the log.
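To generate entries without clicking around manually, you can also hit the server in a loop (a minimal sketch, assuming nginx is serving on http://localhost/):

# fire 20 requests at the local nginx; each one appends a line to access.log
for i in $(seq 1 20); do
    curl -s http://localhost/ > /dev/null
    sleep 1
done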
1. Collect the data into HDFS with Flume
# define agent
a2.sources = r2
a2.channels = c2
a2.sinks = k2
# define sources
a2.sources.r2.type = exec
# -F keeps following across log rotation; -n 1 starts from the last line
a2.sources.r2.command = tail -F -n 1 /var/log/nginx/access.log
a2.sources.r2.shell = /bin/bash -c
# define channels
a2.channels.c2.type = memory
a2.channels.c2.capacity = 1000
a2.channels.c2.transactionCapacity = 100
# define sinks
a2.sinks.k2.type = hdfs
a2.sinks.k2.hdfs.path = hdfs://master1:8020/user/master1/nginx_logs/
a2.sinks.k2.hdfs.fileType = DataStream
a2.sinks.k2.hdfs.writeFormat = Text
a2.sinks.k2.hdfs.batchSize = 10
# bind channels to sources and sinks
a2.sources.r2.channels = c2
a2.sinks.k2.channel = c2
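With the agent defined, start it with flume-ng (the config file path conf/nginx-hdfs.conf is an assumption; point it at wherever you saved the agent definition):

bin/flume-ng agent \
    --conf conf \
    --conf-file conf/nginx-hdfs.conf \
    --name a2 \
    -Dflume.root.logger=INFO,console

Note that --name must match the agent prefix used in the file, here a2.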
This collects data in the following format:
190.168.5.111 - - [19/Oct/2017:12:54:33 +0800] "GET / HTTP/1.1" 304 0 "-" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36" "-"
A MapReduce program then pulls out the first three items: the client IP, the access time, and the request line (method, URI, and HTTP version).
package com.bpf.hadoop;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Nginx_log {

    public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
        private Text log = new Text();
        private Text empty = new Text();

        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Split a combined-format log line on single spaces
            String[] split = value.toString().split(" ");
            String ip = split[0];                                       // client IP
            String time = split[3] + " " + split[4];                    // [19/Oct/2017:12:54:33 +0800]
            String method = split[5] + " " + split[6] + " " + split[7]; // "GET / HTTP/1.1"
            log.set("ip:" + ip + "\t" + "time:" + time + "\t" + "method:" + method);
            context.write(log, empty);
        }
    }
    public static class MyReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> value, Context context) throws IOException, InterruptedException {
            // Identical records are grouped by the shuffle, so each distinct line is written once
            context.write(key, new Text());
        }
    }
    public static void main(String[] args) throws Exception {
        System.setProperty("HADOOP_USER_NAME", "master1");
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(Nginx_log.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Delete a stale output directory, otherwise the job fails on startup
        Path outPath = new Path("hdfs://master1:8020/user/master1/output");
        FileSystem fs = FileSystem.get(new URI("hdfs://master1:8020"), conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileInputFormat.addInputPaths(job, "hdfs://master1:8020/user/master1/nginx_logs");
        FileOutputFormat.setOutputPath(job, outPath);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
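Splitting on single spaces works here because the three fields we want all come before the quoted referer and user-agent, but a regular expression is more robust when fields can contain spaces. A minimal alternative sketch (the pattern below is my assumption for nginx's default combined format, not part of the original program):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class LogLineParser {
    // Hypothetical pattern: ip - user [time] "request" ...
    private static final Pattern LOG =
            Pattern.compile("^(\\S+) \\S+ \\S+ \\[([^\\]]+)\\] \"([^\"]*)\"");

    // Returns {ip, time, request}, or null if the line does not match
    public static String[] parse(String line) {
        Matcher m = LOG.matcher(line);
        if (!m.find()) return null;
        return new String[] { m.group(1), m.group(2), m.group(3) };
    }
}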
Sample output:
ip:190.168.5.111 time:[19/Oct/2017:12:54:33 +0800] method:"GET / HTTP/1.1"
ip:190.168.5.111 time:[19/Oct/2017:12:54:35 +0800] method:"GET / HTTP/1.1"
ip:190.168.5.111 time:[19/Oct/2017:12:54:44 +0800] method:"GET / HTTP/1.1"
ip:190.168.5.111 time:[19/Oct/2017:12:54:47 +0800] method:"GET / HTTP/1.1"
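To run the job and inspect the result yourself (a sketch; the jar name nginx_log.jar is an assumption):

hadoop jar nginx_log.jar com.bpf.hadoop.Nginx_log
hdfs dfs -cat /user/master1/output/part-r-00000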