// 原文鏈接 (original article): http://blog.fens.me/hadoop-mapreduce-log-kpi/
// 數據來源 (data source): 網站訪問記錄 (web access log)
// 先上代碼 (code first):
// 對原作者的代碼進行了部分更改以適應新版本的Hadoop,同時記錄下自己的問題和查找答案,方便以後快速回憶
package org.apache.hadoop.examples;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
public class Kpi {//bean,將數據封裝,轉化爲String
private String remote_add;
private String remote_user;
private String time_local;
private String request;
private String statues;
private String body_bytes_sent;
private String http_referer;
private String http_user_agent;
private boolean valid = true;
public String toString(){
StringBuilder sb = new StringBuilder();
sb.append("valid:"+this.valid);
sb.append("\nremote:_addr:"+this.remote_add);
sb.append("\nremote_user:"+this.remote_user);
sb.append("\ntime_local:"+this.time_local);
sb.append("\request:"+this.request);
sb.append("\nstatues:"+this.statues);
sb.append("\nbody_statues:"+this.body_bytes_sent);
sb.append("\nhttp_referer:"+this.http_referer);
sb.append("\nhttp_user_agent:"+this.http_user_agent);
return sb.toString();
}
public String getRemote_add() {
return remote_add;
}
public void setRemote_add(String remote_add) {
this.remote_add = remote_add;
}
public String getRemote_user() {
return remote_user;
}
public void setRemote_user(String remote_user) {
this.remote_user = remote_user;
}
public String getTime_local() {
return time_local;
}
public void setTime_local(String time_local) {
this.time_local = time_local;
}
public String getRequest() {
return request;
}
public void setRequest(String request) {
this.request = request;
}
public String getStatues() {
return statues;
}
public void setStatues(String statues) {
this.statues = statues;
}
public String getBody_bytes_sent() {
return body_bytes_sent;
}
public void setBody_bytes_sent(String body_bytes_sent) {
this.body_bytes_sent = body_bytes_sent;
}
public String getHttp_referer() {
if(http_referer.length()<8){
return http_referer;
}
String str = this.http_referer.replace("\""," ").replace("http://", "").replace("https://", "");
return str.indexOf("/")>0?str.substring(0,str.indexOf("/")):str;
}
public void setHttp_referer(String http_referer) {
this.http_referer = http_referer;
}
public String getHttp_user_agent() {
return http_user_agent;
}
public void setHttp_user_agent(String http_user_agent) {
this.http_user_agent = http_user_agent;
}
public boolean isValid() {
return valid;
}
public void setValid(boolean valid) {
this.valid = valid;
};
public java.util.Date getTime_local_Date() throws ParseException{
SimpleDateFormat sdf = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss",Locale.US);
return sdf.parse(this.time_local);
}
public String getTime_local_Date_hour() throws ParseException{
SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHH");
return sdf.format(this.getTime_local_Date());
}
public static void main(String args[]) {//測試是否正確分割
String line = "222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] \"GET /images/my.jpg HTTP/1.1\" 200 19939 \"http://www.angularjs.cn/A00n\" \"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36\"";
System.out.println(line);
Kpi kpi = new Kpi();
String[] arr = line.split(" ");
kpi.setRemote_add(arr[0]);
kpi.setRemote_user(arr[1]);
kpi.setTime_local(arr[3].substring(1));
kpi.setRequest(arr[6]);
kpi.setStatues(arr[8]);
kpi.setBody_bytes_sent(arr[9]);
kpi.setHttp_referer(arr[10]);
kpi.setHttp_user_agent(arr[11] + " " + arr[12]);
System.out.println(kpi);
try {
SimpleDateFormat df = new SimpleDateFormat("yyyy.MM.dd:HH:mm:ss", Locale.US);
System.out.println(df.format(kpi.getTime_local_Date()));
System.out.println(kpi.getTime_local_Date_hour());
System.out.println(kpi.getHttp_referer());
} catch (ParseException e) {
e.printStackTrace();
}
}
private static Kpi parser(String line) {
System.out.println(line);
Kpi kpi = new Kpi();
String[] arr = line.split(" ");
if (arr.length > 11) {
kpi.setRemote_add(arr[0]);
kpi.setRemote_user(arr[1]);
kpi.setTime_local(arr[3].substring(1));
kpi.setRequest(arr[6]);
kpi.setStatues(arr[8]);
kpi.setBody_bytes_sent(arr[9]);
kpi.setHttp_referer(arr[10]);
if (arr.length > 12) {
kpi.setHttp_user_agent(arr[11] + " " + arr[12]);
} else {
kpi.setHttp_user_agent(arr[11]);
}
if (Integer.parseInt(kpi.getStatues()) >= 400) {// 大於400,HTTP錯誤
kpi.setValid(false);
}
} else {
kpi.setValid(false);
}
return kpi;
}
/**
* 按page的pv分類, fliter the pages that setted
*
*/
public static Kpi filterPVs(String line) {
Kpi kpi = parser(line);
/* Set pages = new HashSet();
pages.add("/about");
pages.add("/black-ip-list/");
pages.add("/cassandra-clustor/");
pages.add("/finance-rhive-repurchase/");
pages.add("/hadoop-family-roadmap/");
pages.add("/hadoop-hive-intro/");
pages.add("/hadoop-zookeeper-intro/");
pages.add("/hadoop-mahout-roadmap/");
if (!pages.contains(kpi.getRequest())) {
kpi.setValid(false);
}*/
return kpi;
}
}
package org.apache.hadoop.examples;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.examples.WordCount.IntSumReducer;
import org.apache.hadoop.examples.WordCount.TokenizerMapper;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class KpiPv {//MapReduce分析指定pv(page view)數據
/*
* MapReduce工作過程分爲map階段和reduce階段。每個階段都有鍵值對作爲輸入和輸出,並且它們的類型是由程序員指定的。同時程序員需要做的工作是編寫map和reduce函數。
* map階段輸入的key是在文件開頭部分文本起始處的偏移量,但是一般沒有這方面的需要,所以可以忽略。可以在map階段進行數據的篩選
* Hadoop規定了自己的一套可用於網絡序列化的基本類型,便於RPC等功能的實現。所以沒有使用Java內置內型。可以簡單的認爲Text類型相當於java的String,IntWritable相當於Integer
* map的輸入參數是個 Text之類的 對象,並不是 file對象
* 1. 怎麼將 文件參數 傳遞 到 job中呢?在 client 我們調用了FileInputFormat.addInputPath(job, new Path(otherArgs[0]));FileInputFormat 實現了 InputFormat 接口
* 在 InputFormat 接口中 有getSplits方法,也就是說分片操作實際上實在 map之前 就已經做好了
* 2.計算出來的分片有時怎麼傳遞給 map呢 ?對於單詞數量如何累加?
* nputFormat中的另一個方法createRecordReader() 這個方法:RecordReader:RecordReader是用來從一個輸入分片中讀取一個一個的K -V 對的抽象類,
* 它最主要的方法就是nextKeyvalue()方法,由它獲取分片上的下一個K-V 對
* Hadoop把輸入數據劃分爲等長的小數據發送到MapReduce,稱爲分片(input split)每個分片都創建一個map任務,由它來運行用戶自定義的map函數來分析每個分片中的記錄
*/
public static class CountMapper extends
Mapper<Object, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);//這裏的one和word的作用是什麼?它們是全局變量
private Text word = new Text();
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {//map函數中的參數分別對應的是:輸入的鍵,輸入的值,上下文對象(充當舊版本的OutputCollector以及Reporter角色)
//輸入的鍵是文本的偏移量,通常我們不需要管,輸入的值是一行文本,所以對於用戶的文件,MapReduce框架進行切割處理之後,對於每一行文本都會調用map函數進行處理
//map函數的作用在於,對於輸入的key-value對進行處理,得到新的key-value輸出作爲reduce函數的輸入
//map函數確認每一行的依據是什麼?換行符?
Kpi kpi = Kpi.filterPVs(value.toString());
if (kpi.isValid()) {//數據篩選
word.set(kpi.getRequest());
context.write(word, one);
}
/*
* while (itr.hasMoreTokens()) { word.set(itr.nextToken());
* context.write(word, one); }
*/
}
}
/*
public static class KPIPVReducer extends MapReduceBase implements Reducer {
private IntWritable result = new IntWritable();
@Override
public void reduce(Text key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
//reduce任務的輸入是所有map任務的輸出所組成的集合,它們通過網絡進行傳輸彙集
int sum = 0;
while (values.hasNext()) {
sum += values.next().get();
}
result.set(sum);
output.collect(key, result);
}
}
*/
public static class KPiReducer extends
Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
// String[] otherArgs = new GenericOptionsParser(conf, args)
// .getRemainingArgs();
String[] otherArgs = new String[] { "input01", "output03" };
if (otherArgs.length != 2) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}
Job job = new Job(conf, "word count");
job.setJarByClass(KpiPv.class);
job.setMapperClass(CountMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));//將文件路徑加載到了conf中;定義輸入的路徑可以是單個文件,也可以是目錄,或者是文件模式的路徑,並且可以被調用多次
//從而實現使用多路徑輸入。
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));//指定了reduce函數輸出文件寫入的目錄,在運行job前該目錄不應該存在,否則會提示錯誤,這樣式爲了防止數據被覆蓋
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}