MapReduce之日誌清洗與分析

MapReduce之日誌清洗與分析

  • 本文運用的日誌文件如下:需要可以留言
    在這裏插入圖片描述
  • 分別爲:訪問者主機名、訪問者IP、訪問時間、訪問資源、訪問狀態(HTTP狀態碼)、本次訪問流量等等
  • 根據要求,我們要將訪問資源號以及訪問流量統計出來,其他數據將清洗,觀察格式發現可以進行二次切割 —> 空格以及等號
1.依賴
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.3</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.7.3</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-common</artifactId>
<version>2.7.3</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>2.7.3</version>
</dependency>
2.添加log4j.properties文件在資源目錄下即resources,文件內容如下
log4j.rootLogger=FATAL, dest1
log4j.logger.dsaLogging=DEBUG, dsa
log4j.additivity.dsaLogging=false

log4j.appender.dest1=org.apache.log4j.ConsoleAppender
log4j.appender.dest1.layout=org.apache.log4j.PatternLayout
log4j.appender.dest1.layout.ConversionPattern=%-5p:%l: %m%n
log4j.appender.dest1.ImmediateFlush=true

log4j.appender.dsa=org.apache.log4j.RollingFileAppender
log4j.appender.dsa.File=./logs/dsa.log
log4j.appender.dsa.MaxFileSize=2000KB
# Previously MaxBackupIndex=2
log4j.appender.dsa.MaxBackupIndex=5
log4j.appender.dsa.layout=org.apache.log4j.PatternLayout
log4j.appender.dsa.layout.ConversionPattern=%l:%d: %m%n
3.編寫Mapper類
package com.mr.log;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class LogMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    Text key2 = new Text();
    IntWritable values2 = new IntWritable();
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //分詞:找到id
        String data = value.toString();
        String[] splits = value.toString().split(" ");
        String url = splits[6];
        String[] split = url.split("=");
        for(int i =0;i<split.length;i++){
            System.out.println("第二次切割後,數組中的元素" +i+" : "+split[i]);
        }
        String id=split[1];
        key2.set(id);
        values2.set(1);
        context.write(key2,values2);
    }
}
4.編寫Reducer類
package com.mr.log;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;


import java.io.IOException;

public class LogReducer extends Reducer<Text,IntWritable,Text,IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum=0;
        for (IntWritable v :values) {
            sum += v.get();
        }
        context.write(key,new IntWritable(sum));
    }
}

5.編寫Job類
package com.mr.log;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class LogJob {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(LogJob.class);

        job.setMapperClass(LogMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setReducerClass(LogReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job,new Path("D:\\HadoopInputText\\data\\log\\access.log"));
        FileOutputFormat.setOutputPath(job,new Path("D:\\HadoopOutputTest\\logout\\"));

        job.waitForCompletion(true);

    }
}
6.運行

在這裏插入圖片描述
在這裏插入圖片描述
在這裏插入圖片描述

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章