Extracting Web Log KPI Metrics with Hadoop

In a web log, each line usually represents one user visit. For example, here is an nginx log entry:
222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] "GET /images/my.jpg HTTP/1.1" 200 19939 "http://www.angularjs.cn/A00n" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"
The following eight fields can be extracted from such a record for analysis:
remote_addr: the client's IP address, e.g. 222.68.172.190
remote_user: the client's user name ("-" when absent)
time_local: the access time and time zone, e.g. [18/Sep/2013:06:49:57 +0000]
request: the requested URL and HTTP protocol, e.g. "GET /images/my.jpg HTTP/1.1"
status: the request status (200 on success), e.g. 200
body_bytes_sent: the size of the response body sent to the client, e.g. 19939
http_referer: the page the request was linked from, e.g. "http://www.angularjs.cn/A00n"
http_user_agent: the client's browser information, e.g. "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"
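The parser in the next section relies on these fields being space-delimited: it splits the line on single spaces and reads fixed token positions. A minimal sketch of that mapping (the class name SplitDemo is made up for illustration; note that splitting on spaces also breaks the quoted request and user-agent strings into several tokens):

public class SplitDemo {
    public static void main(String[] args) {
        String line = "222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] "
                + "\"GET /images/my.jpg HTTP/1.1\" 200 19939 "
                + "\"http://www.angularjs.cn/A00n\" \"Mozilla/5.0 (Windows NT 6.1) "
                + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36\"";
        String[] arr = line.split(" ");
        // arr[0] = remote_addr, arr[1] = remote_user, arr[3] = "[time_local",
        // arr[6] = request path, arr[8] = status, arr[9] = body_bytes_sent,
        // arr[10] = http_referer, arr[11..] = http_user_agent tokens
        for (int i = 0; i < arr.length; i++) {
            System.out.println(i + ": " + arr[i]);
        }
    }
}

This is exactly the index layout the KPI class below depends on.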
1. Define a KPI class that parses a log line and extracts the fields to be counted
KPI.java
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

public class KPI {
    private String remote_addr;     // client IP address
    private String remote_user;     // client user name
    private String time_local;      // access time and time zone
    private String request;         // requested URL
    private String status;          // HTTP status code
    private String body_bytes_sent; // size of the response body sent to the client
    private String http_referer;    // referring page
    private String http_user_agent; // client browser information
    private boolean valid = true;   // whether this record should be counted

    // Splits one log line on spaces and fills in the fields by position.
    private static KPI parser(String line) {
        System.out.println(line);
        KPI kpi = new KPI();
        String[] arr = line.split(" ");
        if (arr.length > 11) {
            kpi.setRemote_addr(arr[0]);
            kpi.setRemote_user(arr[1]);
            kpi.setTime_local(arr[3].substring(1)); // strip the leading '['
            kpi.setRequest(arr[6]);
            kpi.setStatus(arr[8]);
            kpi.setBody_bytes_sent(arr[9]);
            kpi.setHttp_referer(arr[10]);
            if (arr.length > 12) {
                kpi.setHttp_user_agent(arr[11] + " " + arr[12]);
            } else {
                kpi.setHttp_user_agent(arr[11]);
            }
            if (Integer.parseInt(kpi.getStatus()) >= 400) { // HTTP errors are not counted
                kpi.setValid(false);
            }
        } else {
            kpi.setValid(false);
        }
        return kpi;
    }

    // Keeps only requests for the pages whose PV we want to count.
    public static KPI filterPVs(String line) {
        KPI kpi = parser(line);
        Set<String> pages = new HashSet<String>();
        pages.add("/about");
        pages.add("/black-ip-list/");
        pages.add("/cassandra-clustor/");
        pages.add("/finance-rhive-repurchase/");
        pages.add("/hadoop-family-roadmap/");
        pages.add("/hadoop-hive-intro/");
        pages.add("/hadoop-zookeeper-intro/");
        pages.add("/hadoop-mahout-roadmap/");
        if (!pages.contains(kpi.getRequest())) {
            kpi.setValid(false);
        }
        return kpi;
    }

    // Applies the same page filter for the unique-IP statistics.
    public static KPI filterIPs(String line) {
        KPI kpi = parser(line);
        Set<String> pages = new HashSet<String>();
        pages.add("/about");
        pages.add("/black-ip-list/");
        pages.add("/cassandra-clustor/");
        pages.add("/finance-rhive-repurchase/");
        pages.add("/hadoop-family-roadmap/");
        pages.add("/hadoop-hive-intro/");
        pages.add("/hadoop-zookeeper-intro/");
        pages.add("/hadoop-mahout-roadmap/");
        if (!pages.contains(kpi.getRequest())) {
            kpi.setValid(false);
        }
        return kpi;
    }

    public static KPI filterBroswer(String line) {
        return parser(line);
    }

    public static KPI filterTime(String line) {
        return parser(line);
    }

    public static KPI filterDomain(String line) {
        return parser(line);
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("valid:" + this.valid);
        sb.append("\nremote_addr:" + this.remote_addr);
        sb.append("\nremote_user:" + this.remote_user);
        sb.append("\ntime_local:" + this.time_local);
        sb.append("\nrequest:" + this.request);
        sb.append("\nstatus:" + this.status);
        sb.append("\nbody_bytes_sent:" + this.body_bytes_sent);
        sb.append("\nhttp_referer:" + this.http_referer);
        sb.append("\nhttp_user_agent:" + this.http_user_agent);
        return sb.toString();
    }

    public String getRemote_addr() {
        return remote_addr;
    }

    public void setRemote_addr(String remote_addr) {
        this.remote_addr = remote_addr;
    }

    public String getRemote_user() {
        return remote_user;
    }

    public void setRemote_user(String remote_user) {
        this.remote_user = remote_user;
    }

    public String getTime_local() {
        return time_local;
    }

    public Date getTime_local_Date() throws ParseException {
        SimpleDateFormat df = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.US);
        return df.parse(this.time_local);
    }

    // Formats the access time as an hourly bucket, e.g. 2013091806.
    public String getTime_local_Date_hour() throws ParseException {
        SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHH");
        return df.format(this.getTime_local_Date());
    }

    public void setTime_local(String time_local) {
        this.time_local = time_local;
    }

    public String getRequest() {
        return request;
    }

    public void setRequest(String request) {
        this.request = request;
    }

    public String getStatus() {
        return status;
    }

    public void setStatus(String status) {
        this.status = status;
    }

    public String getBody_bytes_sent() {
        return body_bytes_sent;
    }

    public void setBody_bytes_sent(String body_bytes_sent) {
        this.body_bytes_sent = body_bytes_sent;
    }

    public String getHttp_referer() {
        return http_referer;
    }

    // Strips quotes and the scheme from the referer and keeps only the domain.
    public String getHttp_referer_domain() {
        if (http_referer.length() < 8) {
            return http_referer;
        }
        String str = this.http_referer.replace("\"", "").replace("http://", "").replace("https://", "");
        return str.indexOf("/") > 0 ? str.substring(0, str.indexOf("/")) : str;
    }

    public void setHttp_referer(String http_referer) {
        this.http_referer = http_referer;
    }

    public String getHttp_user_agent() {
        return http_user_agent;
    }

    public void setHttp_user_agent(String http_user_agent) {
        this.http_user_agent = http_user_agent;
    }

    public boolean isValid() {
        return valid;
    }

    public void setValid(boolean valid) {
        this.valid = valid;
    }

    public static void main(String args[]) {
        String line = "222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] \"GET /images/my.jpg HTTP/1.1\" 200 19939 \"http://www.angularjs.cn/A00n\" \"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36\"";
        System.out.println(line);
        KPI kpi = new KPI();
        String[] arr = line.split(" ");
        kpi.setRemote_addr(arr[0]);
        kpi.setRemote_user(arr[1]);
        kpi.setTime_local(arr[3].substring(1));
        kpi.setRequest(arr[6]);
        kpi.setStatus(arr[8]);
        kpi.setBody_bytes_sent(arr[9]);
        kpi.setHttp_referer(arr[10]);
        kpi.setHttp_user_agent(arr[11] + " " + arr[12]);
        System.out.println(kpi);

        try {
            SimpleDateFormat df = new SimpleDateFormat("yyyy.MM.dd:HH:mm:ss", Locale.US);
            System.out.println(df.format(kpi.getTime_local_Date()));
            System.out.println(kpi.getTime_local_Date_hour());
            System.out.println(kpi.getHttp_referer_domain());
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }
}
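One limitation of the split-on-spaces approach is visible above: only arr[11] and arr[12] are kept, so the stored user agent is truncated to something like "Mozilla/5.0 (Windows". A more robust alternative, not part of the original post, is to match the whole line with a regular expression whose capture groups correspond to the nginx combined log format. A minimal sketch (the class name LogRegexDemo is made up):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class LogRegexDemo {
    // One capture group per field of the nginx combined log format.
    private static final Pattern LOG_PATTERN = Pattern.compile(
            "^(\\S+) (\\S+) (\\S+) \\[([^\\]]+)\\] \"([^\"]*)\" (\\d{3}) (\\S+) \"([^\"]*)\" \"([^\"]*)\"$");

    public static void main(String[] args) {
        String line = "222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] "
                + "\"GET /images/my.jpg HTTP/1.1\" 200 19939 "
                + "\"http://www.angularjs.cn/A00n\" \"Mozilla/5.0 (Windows NT 6.1) "
                + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36\"";
        Matcher m = LOG_PATTERN.matcher(line);
        if (m.matches()) {
            System.out.println("remote_addr:     " + m.group(1));
            System.out.println("time_local:      " + m.group(4));
            System.out.println("request:         " + m.group(5)); // the full "GET /path HTTP/1.1"
            System.out.println("status:          " + m.group(6));
            System.out.println("body_bytes_sent: " + m.group(7));
            System.out.println("http_referer:    " + m.group(8));
            System.out.println("http_user_agent: " + m.group(9)); // the complete UA string
        }
    }
}

Each group is anchored by the surrounding delimiters (brackets and quotes), so spaces inside the request and user-agent fields no longer break the parse.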
2. Page view (PV) statistics per page
KPIPV.java
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class KPIPV {

    public static class KPIPVMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private IntWritable one = new IntWritable(1);
        private Text word = new Text();

        // Emits (request, 1) for every valid record that hits a tracked page.
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            KPI kpi = KPI.filterPVs(value.toString());
            if (kpi.isValid()) {
                word.set(kpi.getRequest());
                context.write(word, one);
            }
        }
    }

    public static class KPIPVReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        // Sums the counts for each page.
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        @SuppressWarnings("deprecation")
        Job job = new Job(conf, "KPIPV");
        job.setJarByClass(KPIPV.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setMapperClass(KPIPVMapper.class);
        job.setCombinerClass(KPIPVReducer.class); // summing is associative, so the reducer doubles as combiner
        job.setReducerClass(KPIPVReducer.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
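Before submitting the job to a cluster, the map/reduce logic can be sanity-checked in plain Java. A minimal sketch, assuming only the KPI class above is on the classpath (the class name KPIPVLocalTest and the two sample lines are fabricated for illustration):

import java.util.HashMap;
import java.util.Map;

// Simulates KPIPV's map and reduce logic in memory, without a cluster.
public class KPIPVLocalTest {
    public static void main(String[] args) {
        String[] lines = {
            "222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] \"GET /about HTTP/1.1\" 200 19939 \"-\" \"Mozilla/5.0 (Windows NT 6.1)\"",
            "222.68.172.190 - - [18/Sep/2013:06:50:08 +0000] \"GET /about HTTP/1.1\" 200 19939 \"-\" \"Mozilla/5.0 (Windows NT 6.1)\""
        };
        Map<String, Integer> pv = new HashMap<String, Integer>();
        for (String line : lines) {
            KPI kpi = KPI.filterPVs(line);   // the "map" step: parse and filter
            if (kpi.isValid()) {
                String page = kpi.getRequest();
                pv.put(page, pv.containsKey(page) ? pv.get(page) + 1 : 1); // the "reduce" step: sum per page
            }
        }
        System.out.println(pv); // expected: {/about=2}
    }
}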
(Run results omitted here; see the original post.)
3. Unique-IP statistics per page
KPIIP.java
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class KPIIP {

    public static class KPIIPMapper extends Mapper<LongWritable, Text, Text, Text> {
        private Text word = new Text();
        private Text ips = new Text();

        // Emits (request, remote_addr) so the reducer can count distinct IPs per page.
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            KPI kpi = KPI.filterIPs(value.toString());
            if (kpi.isValid()) {
                word.set(kpi.getRequest());
                ips.set(kpi.getRemote_addr());
                context.write(word, ips);
            }
        }
    }

    public static class KPIIPReducer extends Reducer<Text, Text, Text, Text> {
        private Text result = new Text();

        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // The set must be local to each reduce call; a shared field would
            // leak IPs from one page's count into the next.
            Set<String> count = new HashSet<String>();
            Iterator<Text> iterator = values.iterator();
            while (iterator.hasNext()) {
                count.add(iterator.next().toString());
            }
            result.set(String.valueOf(count.size()));
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        @SuppressWarnings("deprecation")
        Job job = new Job(conf, "KPIIP");
        job.setJarByClass(KPIIP.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setMapperClass(KPIIPMapper.class);
        // No combiner here: the reducer replaces IPs with a count string, so
        // running it as a combiner would make the reducer count count-strings
        // instead of distinct IP addresses.
        job.setReducerClass(KPIIPReducer.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
(Run results omitted here; see the original post.)
4. Hourly PV statistics
KPITime.java
import java.io.IOException;
import java.text.ParseException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class KPITime {

    public static class KPITimeMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private IntWritable one = new IntWritable(1);
        private Text word = new Text();

        // Emits (yyyyMMddHH, 1) for every valid record.
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            KPI kpi = KPI.filterTime(value.toString());
            if (kpi.isValid()) {
                try {
                    word.set(kpi.getTime_local_Date_hour());
                    context.write(word, one);
                } catch (ParseException e) {
                    e.printStackTrace(); // skip records whose timestamp cannot be parsed
                }
            }
        }
    }

    public static class KPITimeReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        // Sums the PV count for each hour.
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        @SuppressWarnings("deprecation")
        Job job = new Job(conf, "KPITime");
        job.setJarByClass(KPITime.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setMapperClass(KPITimeMapper.class);
        job.setCombinerClass(KPITimeReducer.class);
        job.setReducerClass(KPITimeReducer.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
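The hourly bucketing comes from getTime_local_Date_hour(), which parses the log timestamp with Locale.US (required for the English month abbreviation) and reformats it as yyyyMMddHH. A standalone round trip (the class name HourBucketDemo is made up):

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

public class HourBucketDemo {
    public static void main(String[] args) throws Exception {
        // Same patterns as KPI.getTime_local_Date() and getTime_local_Date_hour().
        SimpleDateFormat in = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.US);
        SimpleDateFormat out = new SimpleDateFormat("yyyyMMddHH");
        Date d = in.parse("18/Sep/2013:06:49:57");
        System.out.println(out.format(d)); // prints 2013091806, the hourly bucket key
    }
}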
(Run results omitted here; see the original post.)
5. Visitor device (browser) statistics
KPIBrowser.java
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class KPIBrowser {

    public static class KPIBrowserMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private IntWritable one = new IntWritable(1);
        private Text word = new Text();

        // Emits (user agent, 1) for every valid record.
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            KPI kpi = KPI.filterBroswer(value.toString());
            if (kpi.isValid()) {
                word.set(kpi.getHttp_user_agent());
                context.write(word, one);
            }
        }
    }

    public static class KPIBrowserReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        // Sums the count for each user agent.
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        @SuppressWarnings("deprecation")
        Job job = new Job(conf, "KPIBrowser");
        job.setJarByClass(KPIBrowser.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setMapperClass(KPIBrowserMapper.class);
        job.setCombinerClass(KPIBrowserReducer.class);
        job.setReducerClass(KPIBrowserReducer.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
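Because parser() keeps at most the first two tokens of the user agent, the keys counted here are coarse fragments such as "Mozilla/5.0 (Windows". To aggregate by browser family instead, the mapper key can be normalized first. A hypothetical sketch, not part of the original post (the class name and matching rules are illustrative only):

// Hypothetical helper: maps a raw user-agent string to a coarse browser family.
public class BrowserFamily {
    public static String of(String userAgent) {
        if (userAgent == null) return "unknown";
        if (userAgent.contains("Chrome")) return "Chrome";
        if (userAgent.contains("Firefox")) return "Firefox";
        // Checked after Chrome, because Chrome UAs also contain "Safari".
        if (userAgent.contains("Safari")) return "Safari";
        if (userAgent.contains("MSIE") || userAgent.contains("Trident")) return "Internet Explorer";
        return "other";
    }

    public static void main(String[] args) {
        System.out.println(of("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 "
                + "(KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36")); // prints Chrome
    }
}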
(Run results omitted here; see the original post.)
6. Referring-domain statistics
KPISource.java
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class KPISource {

    public static class KPISourceMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private IntWritable one = new IntWritable(1);
        private Text word = new Text();

        // Emits (referring domain, 1) for every valid record.
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            KPI kpi = KPI.filterDomain(value.toString());
            if (kpi.isValid()) {
                word.set(kpi.getHttp_referer_domain()); // group by the referer's domain, not the full URL
                context.write(word, one);
            }
        }
    }

    public static class KPISourceReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        // Sums the count for each referring domain.
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        @SuppressWarnings("deprecation")
        Job job = new Job(conf, "KPISource");
        job.setJarByClass(KPISource.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setMapperClass(KPISourceMapper.class);
        job.setCombinerClass(KPISourceReduce.class);
        job.setReducerClass(KPISourceReduce.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
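The grouping key here is getHttp_referer_domain(), which strips the surrounding quotes and the http:// or https:// prefix and cuts at the first slash. A quick standalone check against the sample referer (the class name RefererDomainDemo is made up):

public class RefererDomainDemo {
    public static void main(String[] args) {
        KPI kpi = new KPI();
        kpi.setHttp_referer("\"http://www.angularjs.cn/A00n\""); // as stored by parser(), quotes included
        System.out.println(kpi.getHttp_referer_domain());        // prints www.angularjs.cn
    }
}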
(Run results omitted here; see the original post.)
Original post: http://blog.fens.me/hadoop-mapreduce-log-kpi/
When the data being processed is unstructured text like this, the approach is to write a parser utility class and extract the fields to be counted directly in the map phase.
Learn it, and write it down.