Requirement:
Collect the log files of an nginx server into HDFS, then analyze them with a MapReduce job.
In my setup the log file lives at /var/log/nginx/access.log.
Refreshing a page served by nginx simulates user visits and appends new entries to the log.
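To generate entries without clicking around manually, you can also hit the server in a loop (a minimal sketch, assuming nginx is serving on http://localhost/):

# fire 20 requests at the local nginx; each one appends a line to access.log
for i in $(seq 1 20); do
    curl -s http://localhost/ > /dev/null
    sleep 1
done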
1. Collect the data into HDFS with Flume
# define agent
a2.sources = r2
a2.channels = c2
a2.sinks = k2
# define sources
a2.sources.r2.type = exec
# -F keeps following across log rotation; -n 1 starts from the last line
a2.sources.r2.command = tail -F -n 1 /var/log/nginx/access.log
a2.sources.r2.shell = /bin/bash -c
# define channels
a2.channels.c2.type = memory
a2.channels.c2.capacity = 1000
a2.channels.c2.transactionCapacity = 100
# define sinks
a2.sinks.k2.type = hdfs
a2.sinks.k2.hdfs.path = hdfs://master1:8020/user/master1/nginx_logs/
a2.sinks.k2.hdfs.fileType = DataStream
a2.sinks.k2.hdfs.writeFormat = Text
a2.sinks.k2.hdfs.batchSize = 10
# bind channels to sources and sinks
a2.sources.r2.channels = c2
a2.sinks.k2.channel = c2
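With the agent defined, start it with flume-ng (the config file path conf/nginx-hdfs.conf is an assumption; point it at wherever you saved the agent definition):

bin/flume-ng agent \
    --conf conf \
    --conf-file conf/nginx-hdfs.conf \
    --name a2 \
    -Dflume.root.logger=INFO,console

Note that --name must match the agent prefix used in the file, here a2.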
This collects data in the following format:
190.168.5.111 - - [19/Oct/2017:12:54:33 +0800] "GET / HTTP/1.1" 304 0 "-" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36" "-"
A MapReduce program then pulls out the first three items: the client IP, the access time, and the request line (method, URI, and HTTP version).
package com.bpf.hadoop;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Nginx_log {

    public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
        private Text log = new Text();
        private Text empty = new Text();

        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Split a combined-format log line on single spaces
            String[] split = value.toString().split(" ");
            String ip = split[0];                                       // client IP
            String time = split[3] + " " + split[4];                    // [19/Oct/2017:12:54:33 +0800]
            String method = split[5] + " " + split[6] + " " + split[7]; // "GET / HTTP/1.1"
            log.set("ip:" + ip + "\t" + "time:" + time + "\t" + "method:" + method);
            context.write(log, empty);
        }
    }
    public static class MyReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> value, Context context) throws IOException, InterruptedException {
            // Identical records are grouped by the shuffle, so each distinct line is written once
            context.write(key, new Text());
        }
    }
    public static void main(String[] args) throws Exception {
        System.setProperty("HADOOP_USER_NAME", "master1");
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(Nginx_log.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Delete a stale output directory, otherwise the job fails on startup
        Path outPath = new Path("hdfs://master1:8020/user/master1/output");
        FileSystem fs = FileSystem.get(new URI("hdfs://master1:8020"), conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileInputFormat.addInputPaths(job, "hdfs://master1:8020/user/master1/nginx_logs");
        FileOutputFormat.setOutputPath(job, outPath);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
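Splitting on single spaces works here because the three fields we want all come before the quoted referer and user-agent, but a regular expression is more robust when fields can contain spaces. A minimal alternative sketch (the pattern below is my assumption for nginx's default combined format, not part of the original program):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class LogLineParser {
    // Hypothetical pattern: ip - user [time] "request" ...
    private static final Pattern LOG =
            Pattern.compile("^(\\S+) \\S+ \\S+ \\[([^\\]]+)\\] \"([^\"]*)\"");

    // Returns {ip, time, request}, or null if the line does not match
    public static String[] parse(String line) {
        Matcher m = LOG.matcher(line);
        if (!m.find()) return null;
        return new String[] { m.group(1), m.group(2), m.group(3) };
    }
}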
Sample output:
ip:190.168.5.111 time:[19/Oct/2017:12:54:33 +0800] method:"GET / HTTP/1.1"
ip:190.168.5.111 time:[19/Oct/2017:12:54:35 +0800] method:"GET / HTTP/1.1"
ip:190.168.5.111 time:[19/Oct/2017:12:54:44 +0800] method:"GET / HTTP/1.1"
ip:190.168.5.111 time:[19/Oct/2017:12:54:47 +0800] method:"GET / HTTP/1.1"
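To run the job and inspect the result yourself (a sketch; the jar name nginx_log.jar is an assumption):

hadoop jar nginx_log.jar com.bpf.hadoop.Nginx_log
hdfs dfs -cat /user/master1/output/part-r-00000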