Analyzing Logs with MapReduce --- A Walkthrough of the Code

Original article: http://blog.fens.me/hadoop-mapreduce-log-kpi/

Data source: website access logs

The code first:

I made some changes to the original author's code so it runs on a newer version of Hadoop, and I recorded my own questions and the answers I found along the way, so they are quick to recall later.
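
For reference, the input is one request per line in the Nginx-style access log format, and the parser splits each line on spaces. The sample record used in the test main() of the Kpi class below looks like this:

222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] "GET /images/my.jpg HTTP/1.1" 200 19939 "http://www.angularjs.cn/A00n" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"

After line.split(" "), the fields picked out are index 0 (remote_add), 1 (remote_user), 3 (time_local, with the leading "[" stripped), 6 (the request path), 8 (status), 9 (body_bytes_sent), 10 (http_referer), and 11 onward (http_user_agent).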

package org.apache.hadoop.examples;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

public class Kpi {// Bean that wraps one parsed log record and converts it to a String

	private String remote_add;
	private String remote_user;
	private String time_local;
	private String request;
	private String statues;
	private String body_bytes_sent;
	private String http_referer;
	private String http_user_agent;
	private boolean valid = true;
	
	public String toString(){
		StringBuilder sb = new StringBuilder();
		sb.append("valid:"+this.valid);
		sb.append("\nremote:_addr:"+this.remote_add);
		sb.append("\nremote_user:"+this.remote_user);
		sb.append("\ntime_local:"+this.time_local);
		sb.append("\request:"+this.request);
		sb.append("\nstatues:"+this.statues);
		sb.append("\nbody_statues:"+this.body_bytes_sent);
		sb.append("\nhttp_referer:"+this.http_referer);
		sb.append("\nhttp_user_agent:"+this.http_user_agent);
		return sb.toString();
	}

	public String getRemote_add() {
		return remote_add;
	}

	public void setRemote_add(String remote_add) {
		this.remote_add = remote_add;
	}

	public String getRemote_user() {
		return remote_user;
	}

	public void setRemote_user(String remote_user) {
		this.remote_user = remote_user;
	}

	public String getTime_local() {
		return time_local;
	}

	public void setTime_local(String time_local) {
		this.time_local = time_local;
	}

	public String getRequest() {
		return request;
	}

	public void setRequest(String request) {
		this.request = request;
	}

	public String getStatues() {
		return statues;
	}

	public void setStatues(String statues) {
		this.statues = statues;
	}

	public String getBody_bytes_sent() {
		return body_bytes_sent;
	}

	public void setBody_bytes_sent(String body_bytes_sent) {
		this.body_bytes_sent = body_bytes_sent;
	}

	public String getHttp_referer() {
		if(http_referer.length()<8){
			return http_referer;
		}
		String str = this.http_referer.replace("\""," ").replace("http://", "").replace("https://", "");
		return str.indexOf("/")>0?str.substring(0,str.indexOf("/")):str;
	}

	public void setHttp_referer(String http_referer) {
		this.http_referer = http_referer;
	}

	public String getHttp_user_agent() {
		return http_user_agent;
	}

	public void setHttp_user_agent(String http_user_agent) {
		this.http_user_agent = http_user_agent;
	}

	public boolean isValid() {
		return valid;
	}

	public void setValid(boolean valid) {
		this.valid = valid;
	}

	public java.util.Date getTime_local_Date() throws ParseException{
		SimpleDateFormat sdf = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss",Locale.US);
		return sdf.parse(this.time_local);
		
	}
	public String getTime_local_Date_hour() throws ParseException{
		SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHH");
		return sdf.format(this.getTime_local_Date());
	}
	  public static void main(String args[]) {// Quick test: check that the line is split correctly
	        String line = "222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] \"GET /images/my.jpg HTTP/1.1\" 200 19939 \"http://www.angularjs.cn/A00n\" \"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36\"";
	        System.out.println(line);
	        Kpi kpi = new Kpi();
	        String[] arr = line.split(" ");

	        kpi.setRemote_add(arr[0]);
	        kpi.setRemote_user(arr[1]);
	        kpi.setTime_local(arr[3].substring(1));
	        kpi.setRequest(arr[6]);
	        kpi.setStatues(arr[8]);
	        kpi.setBody_bytes_sent(arr[9]);
	        kpi.setHttp_referer(arr[10]);
	        kpi.setHttp_user_agent(arr[11] + " " + arr[12]);
	        System.out.println(kpi);

	        try {
	            SimpleDateFormat df = new SimpleDateFormat("yyyy.MM.dd:HH:mm:ss", Locale.US);
	            System.out.println(df.format(kpi.getTime_local_Date()));
	            System.out.println(kpi.getTime_local_Date_hour());
	            System.out.println(kpi.getHttp_referer());
	        } catch (ParseException e) {
	            e.printStackTrace();
	        }
	    }

	    private static Kpi parser(String line) {
	        System.out.println(line);
	        Kpi kpi = new Kpi();
	        String[] arr = line.split(" ");
	        if (arr.length > 11) {
	            kpi.setRemote_add(arr[0]);
	            kpi.setRemote_user(arr[1]);
	            kpi.setTime_local(arr[3].substring(1));
	            kpi.setRequest(arr[6]);
	            kpi.setStatues(arr[8]);
	            kpi.setBody_bytes_sent(arr[9]);
	            kpi.setHttp_referer(arr[10]);
	            
	            if (arr.length > 12) {
	                kpi.setHttp_user_agent(arr[11] + " " + arr[12]);
	            } else {
	                kpi.setHttp_user_agent(arr[11]);
	            }

	            if (Integer.parseInt(kpi.getStatues()) >= 400) {// status codes of 400 and above are HTTP errors
	                kpi.setValid(false);
	            }
	        } else {
	            kpi.setValid(false);
	        }
	        return kpi;
	    
	  }
	    /**
	     * Count PV (page views) by page; optionally filter to a configured set of pages.
	     * The page whitelist below is left commented out, so every parsed request passes through.
	     */
	    public static Kpi filterPVs(String line) {
	        Kpi kpi = parser(line);
	       /* Set pages = new HashSet();
	        pages.add("/about");
	        pages.add("/black-ip-list/");
	        pages.add("/cassandra-clustor/");
	        pages.add("/finance-rhive-repurchase/");
	        pages.add("/hadoop-family-roadmap/");
	        pages.add("/hadoop-hive-intro/");
	        pages.add("/hadoop-zookeeper-intro/");
	        pages.add("/hadoop-mahout-roadmap/");

	      if (!pages.contains(kpi.getRequest())) {
	            kpi.setValid(false);
	        }*/
	        return kpi;
	    }
	
}
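
Before wiring the parser into MapReduce it is worth sanity-checking it on a single record. A minimal standalone check (my own sketch, not part of the original post; the class name KpiParseCheck is made up, and it assumes Kpi is compiled in the same package) could look like this:

package org.apache.hadoop.examples;

// Sketch: parse one sample access-log line and print the fields the MapReduce job below relies on.
public class KpiParseCheck {
	public static void main(String[] args) throws Exception {
		String line = "222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] \"GET /images/my.jpg HTTP/1.1\" 200 19939 \"http://www.angularjs.cn/A00n\" \"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36\"";
		Kpi kpi = Kpi.filterPVs(line);
		System.out.println(kpi.isValid());                 // true: enough fields and status 200 < 400
		System.out.println(kpi.getRequest());              // /images/my.jpg (this becomes the map output key)
		System.out.println(kpi.getTime_local_Date_hour()); // 2013091806
	}
}

The second file, KpiPv.java, wires the parser into a MapReduce job: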






package org.apache.hadoop.examples;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class KpiPv {// MapReduce job that counts PV (page views) per requested page

	/*
	 * A MapReduce program runs in a map phase and a reduce phase. Each phase takes key-value pairs as input
	 * and produces key-value pairs as output, with the types chosen by the programmer; what the programmer has
	 * to write are the map and reduce functions.
	 * The key handed to map is the byte offset of the current line from the start of the file. It is rarely
	 * needed, so it can usually be ignored. The map phase is a good place to filter the data.
	 * Hadoop defines its own set of basic types for network serialization, which makes RPC and similar machinery
	 * easier, so the built-in Java types are not used. Roughly, Text corresponds to Java's String and
	 * IntWritable to Integer.
	 * The input parameter of map is a Text-like object, not a File object.
	 * 1. How does the input file get passed to the job? On the client we call
	 *    FileInputFormat.addInputPath(job, new Path(otherArgs[0])). FileInputFormat implements the InputFormat
	 *    interface, which has a getSplits() method, so the splitting is done before map ever runs.
	 * 2. How are the computed splits handed to map, and how are the counts accumulated?
	 *    InputFormat has another method, createRecordReader(). A RecordReader is the abstraction that reads
	 *    key-value pairs one at a time from an input split; its central method is nextKeyValue(), which fetches
	 *    the next K-V pair from the split.
	 * Hadoop divides the input into fixed-size pieces called input splits and creates one map task per split,
	 * which runs the user-defined map function over every record in that split.
	 */
	public static class CountMapper extends
			Mapper<Object, Text, Text, IntWritable> {

		private final static IntWritable one = new IntWritable(1);// What do one and word do here? They are shared fields of the mapper, reused across map() calls instead of being re-created for every record
		private Text word = new Text();

		public void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {// The parameters of map are: the input key, the input value, and the Context object (which plays the role of the old API's OutputCollector and Reporter)
				// The input key is the offset of the line within the file, which we usually ignore; the input value is one line of text.
				// After the framework splits up the input file, map is called once for every line, so this is where the data can be filtered.
				// The job of map is to turn each input key-value pair into new key-value pairs that become the input of reduce.
				// How does map decide what counts as one line -- the newline character?
			Kpi kpi = Kpi.filterPVs(value.toString());
			if (kpi.isValid()) {//數據篩選
				word.set(kpi.getRequest());
				context.write(word, one);
			}
			/*
			 * while (itr.hasMoreTokens()) { word.set(itr.nextToken());
			 * context.write(word, one); }
			 */
		}
	}

	/*
	  public static class KPIPVReducer extends MapReduceBase implements Reducer {
        private IntWritable result = new IntWritable();

        @Override
        public void reduce(Text key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
		// The input of a reduce task is the collection of outputs from all map tasks, gathered to it over the network.
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            result.set(sum);
            output.collect(key, result);
        }
    }
	 */
	public static class KPiReducer extends
			Reducer<Text, IntWritable, Text, IntWritable> {
		private IntWritable result = new IntWritable();

		public void reduce(Text key, Iterable<IntWritable> values,
				Context context) throws IOException, InterruptedException {
			int sum = 0;
			for (IntWritable val : values) {
				sum += val.get();
			}
			result.set(sum);
			context.write(key, result);
		}
	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		// String[] otherArgs = new GenericOptionsParser(conf, args)
		// .getRemainingArgs();
		String[] otherArgs = new String[] { "input01", "output03" };
		if (otherArgs.length != 2) {
			System.err.println("Usage: wordcount <in> <out>");
			System.exit(2);
		}
		Job job = Job.getInstance(conf, "kpi pv");
		job.setJarByClass(KpiPv.class);
		job.setMapperClass(CountMapper.class);
		job.setCombinerClass(KPiReducer.class);
		job.setReducerClass(KPiReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));// Adds the input path to the job. The input can be a single file, a directory, or a file-pattern path,
		// and this method can be called more than once to feed the job from multiple paths.
		FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));// The directory the reduce output files are written to. It must not exist before the job runs, otherwise an error is raised; this prevents existing data from being overwritten.
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}
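
The input and output paths are hard-coded above as "input01" and "output03" for convenience. To take them from the command line instead, as the commented-out lines in main() hint at, the beginning of main() could read as follows (a sketch; it needs an additional import of org.apache.hadoop.util.GenericOptionsParser, and the rest of the driver stays unchanged):

		// Sketch: take <in> <out> from the command line instead of hard-coding them.
		// GenericOptionsParser also consumes generic Hadoop options such as -D key=value.
		Configuration conf = new Configuration();
		String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
		if (otherArgs.length != 2) {
			System.err.println("Usage: KpiPv <in> <out>");
			System.exit(2);
		}

Either way, the job writes its result into the output directory as part-r-00000, one line per request path with the path and its PV count separated by a tab.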


