上傳日誌文件到hadoop的dfs當中去
一、根據上述日誌文件,計算該天的獨立ip數,pv數(注意要篩選日誌,並非每條記錄都要統計),被傳輸頁面的總字節數
1、將日誌信息分爲8個字段,創建指標對象KPI
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
/*
* KPI Object
*/
public class KPI {
private String remote_addr;// 記錄客戶端的ip地址
private String remote_user;// 記錄客戶端用戶名稱,忽略屬性"-"
private String time_local;// 記錄訪問時間與時區
private String request;// 記錄請求的url與http協議
private String status;// 記錄請求狀態;成功是200
private String body_bytes_sent;// 記錄發送給客戶端文件主體內容大小
private String http_referer;// 用來記錄從那個頁面鏈接訪問過來的
private String http_user_agent;// 記錄客戶瀏覽器的相關信息
private boolean valid = true;// 判斷數據是否合法
private static KPI parser(String line) {
KPI kpi = new KPI();
String[] arr = line.split(" ");
if (arr.length > 11) {
kpi.setRemote_addr(arr[0]);
kpi.setRemote_user(arr[1]);
kpi.setTime_local(arr[3].substring(1));
kpi.setRequest(arr[6]);
kpi.setStatus(arr[8]);
kpi.setBody_bytes_sent(arr[9]);
kpi.setHttp_referer(arr[10]);
if (arr.length > 12) {
kpi.setHttp_user_agent(arr[11] + " " + arr[12]);
} else {
kpi.setHttp_user_agent(arr[11]);
}
try{
// 存在status沒有的情況,直接pass
if (Integer.parseInt(kpi.getStatus()) >= 400) {// 大於400,HTTP錯誤
kpi.setValid(false);
}
}catch(Exception e){
System.out.println(line);
kpi.setValid(false);
}
} else {
kpi.setValid(false);
}
return kpi;
}
/**
* 按page的pv分類
*/
public static KPI filterPVs(String line) {
/*KPI kpi = parser(line);
Set<String> pages = new HashSet<String>();
pages.add("/forum-46-1.html");
pages.add("/forum-58-1.html");
pages.add("/forum-61-1.html");
if (!pages.contains(kpi.getRequest())) {
kpi.setValid(false);
}
return kpi;*/
return parser(line);
}
/**
* 按page的獨立ip分類
*/
public static KPI filterIPs(String line) {
/*KPI kpi = parser(line);
Set<String> pages = new HashSet<String>();
pages.add("/forum-46-1.html");
pages.add("/forum-58-1.html");
pages.add("/forum-61-1.html");
if (!pages.contains(kpi.getRequest())) {
kpi.setValid(false);
}
return kpi;*/
return parser(line);
}
/**
* PV按瀏覽器分類
*/
public static KPI filterBroswer(String line) {
return parser(line);
}
/**
* PV按小時分類
*/
public static KPI filterTime(String line) {
return parser(line);
}
/**
* PV按訪問域名分類
*/
public static KPI filterDomain(String line) {
return parser(line);
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("valid:" + this.valid);
sb.append("\nremote_addr:" + this.remote_addr);
sb.append("\nremote_user:" + this.remote_user);
sb.append("\ntime_local:" + this.time_local);
sb.append("\nrequest:" + this.request);
sb.append("\nstatus:" + this.status);
sb.append("\nbody_bytes_sent:" + this.body_bytes_sent);
sb.append("\nhttp_referer:" + this.http_referer);
sb.append("\nhttp_user_agent:" + this.http_user_agent);
return sb.toString();
}
public String getRemote_addr() {
return remote_addr;
}
public void setRemote_addr(String remote_addr) {
this.remote_addr = remote_addr;
}
public String getRemote_user() {
return remote_user;
}
public void setRemote_user(String remote_user) {
this.remote_user = remote_user;
}
public String getTime_local() {
return time_local;
}
public Date getTime_local_Date() throws ParseException {
SimpleDateFormat df = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss",
Locale.US);
return df.parse(this.time_local);
}
public String getTime_local_Date_hour() throws ParseException {
SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHH");
return df.format(this.getTime_local_Date());
}
public void setTime_local(String time_local) {
this.time_local = time_local;
}
public String getRequest() {
return request;
}
public void setRequest(String request) {
this.request = request;
}
public String getStatus() {
return status;
}
public void setStatus(String status) {
this.status = status;
}
public String getBody_bytes_sent() {
return body_bytes_sent;
}
public void setBody_bytes_sent(String body_bytes_sent) {
this.body_bytes_sent = body_bytes_sent;
}
public String getHttp_referer() {
return http_referer;
}
public String getHttp_referer_domain() {
if (http_referer.length() < 8) {
return http_referer;
}
String str = this.http_referer.replace("\"", "").replace("http://", "")
.replace("https://", "");
return str.indexOf("/") > 0 ? str.substring(0, str.indexOf("/")) : str;
}
public void setHttp_referer(String http_referer) {
this.http_referer = http_referer;
}
public String getHttp_user_agent() {
return http_user_agent;
}
public void setHttp_user_agent(String http_user_agent) {
this.http_user_agent = http_user_agent;
}
public boolean isValid() {
return valid;
}
public void setValid(boolean valid) {
this.valid = valid;
}
public static void main(String args[]) {
String line = "112.97.24.243 - - [31/Jan/2012:00:14:48 +0800] \"GET /data/cache/style_2_common.css?AZH HTTP/1.1\" 200 57752 \"http://f.dataguru.cn/forum-58-1.html\" \"Mozilla/5.0 (iPhone; CPU iPhone OS 5_0_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Mobile/9A406\"";
System.out.println(line);
KPI kpi = new KPI();
String[] arr = line.split(" ");
kpi.setRemote_addr(arr[0]);
kpi.setRemote_user(arr[1]);
kpi.setTime_local(arr[3].substring(1));
kpi.setRequest(arr[6]);
kpi.setStatus(arr[8]);
kpi.setBody_bytes_sent(arr[9]);
kpi.setHttp_referer(arr[10]);
kpi.setHttp_user_agent(arr[11] + " " + arr[12]);
System.out.println(kpi);
try {
SimpleDateFormat df = new SimpleDateFormat("yyyy.MM.dd:HH:mm:ss z",Locale.US);
System.out.println(df.format(kpi.getTime_local_Date()));
System.out.println(kpi.getTime_local_Date_hour());
System.out.println(kpi.getHttp_referer_domain());
} catch (ParseException e) {
e.printStackTrace();
}
}
}
2、計算獨立ip、pv和總字節數代碼
package week5;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
public class KPIIPVBYTE {
public static class ParseLogMapper extends MapReduceBase implements Mapper<Object, Text, Text, Text> {
@Override
public void map(Object key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
KPI kpi = KPI.filterIPs(value.toString());
if (kpi.isValid()) {
output.collect(new Text("ip"), new Text(kpi.getRemote_addr()));
output.collect(new Text("pv"), new Text("1"));
output.collect(new Text("ps"), new Text("".equals(kpi.getBody_bytes_sent())?"0":kpi.getBody_bytes_sent()));
}
}
}
public static class parseLogReducer extends MapReduceBase implements Reducer<Text, Text, Text, Text> {
private Set<String> count = new HashSet<String>();
private int sumPv=0;
private long sumPs=0;
@Override
public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
String keys = key.toString();
List<Text> listv = new ArrayList<Text>();
if("ip".equals(keys.toLowerCase().trim())){
while (values.hasNext()) {
count.add(values.next().toString());
}
output.collect(new Text("IP總數:"), new Text(String.valueOf(count.size())));
}else if("pv".equals(keys.toLowerCase().trim())){
while (values.hasNext()) {
sumPv+= Long.parseLong(values.next().toString());
}
output.collect(new Text("PV數:"), new Text(String.valueOf(sumPv)));
}else if("ps".equals(keys.toLowerCase().trim())){
while (values.hasNext()) {
sumPs +=Integer.parseInt(values.next().toString());
}
sumPs = sumPs/1024/1024;
output.collect(new Text("總字節數:"), new Text(String.valueOf(sumPs)+"M"));
}
}
}
public static void main(String[] args) throws Exception{
String input = "hdfs://hadoop1:9000/week5/in/access.20120104.log";
String output = "hdfs://hadoop1:9000/week5/out/";
JobConf conf = new JobConf(KPIIPVBYTE.class);
conf.setJobName("IPPVPS");
conf.setMapOutputKeyClass(Text.class);
conf.setMapOutputValueClass(Text.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setMapperClass(ParseLogMapper.class);
conf.setReducerClass(parseLogReducer.class);
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(input));
FileOutputFormat.setOutputPath(conf, new Path(output));
JobClient.runJob(conf);
System.exit(0);
}
}
結果:
IP總數: 34413
總字節數: 67627M
PV數: 2910085
二、統計來源網站,列出域名及帶來的獨立ip數
1、來源統計代碼
package week5;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class SourceCount {
public static class MyMapper extends Mapper<Object, Text, Text, Text> {
private Text state = new Text();
private Text ip = new Text();
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
KPI kpi = KPI.filterIPs(value.toString());
if (kpi.isValid()) {
state.set(kpi.getHttp_referer());
ip.set(kpi.getRemote_addr());
context.write(state, ip);
}
}
}
public static class SumReducer extends Reducer<Text, Text, Text, Text> {
private Text result = new Text();
public void reduce(Text key, Iterable<Text> values, Context context)throws IOException, InterruptedException {
Set<String> ips = new HashSet<String>();
for (Text val : values) {
ips.add(val.toString());
}
result.set(String.valueOf(ips.size()));
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf);
job.setJarByClass(SourceCount.class);
job.setMapperClass(MyMapper.class);
job.setCombinerClass(SumReducer.class);
job.setReducerClass(SumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path("hdfs://hadoop1:9000/week5/in/access.20120104.log"));
FileOutputFormat.setOutputPath(job, new Path("hdfs://hadoop1:9000/week5/out3"));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
結果:
太多了,取部分爲證
三、統計用戶使用的瀏覽器種類,計算出各種瀏覽器佔的百分比
1、瀏覽器佔比代碼
package week5;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
public class KPIBroswer {
private static long total = 0;
public static class KPIBrowserMapper extends MapReduceBase implements
Mapper<Object, Text, Text, IntWritable> {
private Text browserInfo = new Text();
private IntWritable one = new IntWritable(1);
public void map(Object key, Text value,
OutputCollector<Text, IntWritable> output, Reporter reporter)
throws IOException {
KPI kpi = KPI.filterBroswer(value.toString());
if (kpi.isValid()) {
browserInfo.set(kpi.getHttp_user_agent());
total++;
output.collect(browserInfo, one);
}
}
}
public static class KPIBrowserReducer extends MapReduceBase implements
Reducer<Text, IntWritable, Text, Text> {
private Text result = new Text();
public void reduce(Text key, Iterator<IntWritable> values,
OutputCollector<Text, Text> output, Reporter reporter)
throws IOException {
long sum = 0;
while (values.hasNext()) {
sum += values.next().get();
}
System.out.println("answer is over there");
System.out.println(sum);
System.out.println(total);
System.out.println(String.valueOf(((double) sum / total * 100) + "%"));
result.set(String.valueOf(((double) sum / total * 100) + "%"));
output.collect(key, result);
}
}
public static void main(String[] args) throws Exception {
String input = "hdfs://hadoop1:9000/week5/in/access.20120104.log";
String output = "hdfs://hadoop1:9000/week5/out4";
JobConf conf = new JobConf(KPIBroswer.class);
conf.setJobName("KPIBrowser");
conf.setMapOutputKeyClass(Text.class);
conf.setMapOutputValueClass(IntWritable.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setMapperClass(KPIBrowserMapper.class);
// conf.setCombinerClass(KPIBrowserReducer.class);
conf.setReducerClass(KPIBrowserReducer.class);
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(input));
FileOutputFormat.setOutputPath(conf, new Path(output));
JobClient.runJob(conf);
System.exit(0);
}
}
2、結果:
2.0617954458374926E-4%
"" 2.0617954458374926E-4%
"(C)Nokia6700s/SymbianOS/9.1 Series60/3.0" 3.4363257430624875E-5%
"-" 1.5351785257131665%
"-" "-" 6.872651486124975E-5%
"-" "Mozilla/4.0" 1.0308977229187463E-4%
"Amoi-F90/Plat-F/WAP2.0/MIDP1.0/CLDC1.0 UP.Browser/6.2.2.6.f.1.100 6.872651486124975E-5%
"AmoiE70/6.1.08/WAP2.0 Profile/MIDP2.0 3.4363257430624875E-5%
"AndroidDownloadManager" 6.872651486124975E-5%
"Apache-HttpClient/4.1 (java 3.4363257430624875E-5%
"Apache-HttpClient/4.1.1 (java 3.4363257430624875E-5%
"AppEngine-Google; (+http://code.google.com/appengine; 2.0617954458374926E-4%
"Apple-PubSub/28" 0.001683799614100619%
"Apple-PubSub/65" 3.4363257430624875E-5%
"Apple-PubSub/65.20" 3.4363257430624875E-5%
"Apple-PubSub/65.23" 6.185386337512477E-4%
"Apple-PubSub/65.28 AppEngine-Google; 3.4363257430624875E-5%
"Apple-PubSub/65.28" 0.015807098418087445%
"BGSY bot/1.0" 0.0041235908916749855%
"BaiduMobile/1.3.1 CFNetwork/485.12.7 3.4363257430624875E-5%
"BaiduMobile/1.3.2 CFNetwork/548.0.4 3.4363257430624875E-5%
"BaiduMobile/1.3.4 CFNetwork/485.12.7 6.872651486124975E-5%
"BaiduMobile/1.3.4 CFNetwork/485.13.9 6.872651486124975E-5%
"BaiduMobile/1.3.5 CFNetwork/485.12.7 3.4363257430624875E-5%
"BaiduMobile/1.3.5 CFNetwork/485.13.9 2.4054280201437413E-4%
"BaiduMobile/1.3.5 CFNetwork/548.0.3 6.872651486124975E-5%
"BaiduMobile/1.3.5 CFNetwork/548.0.4 1.374530297224995E-4%
"Baiduspider" 1.374530297224995E-4%
"Baiduspider+(+http://www.baidu.com/search/spider.htm)" 0.01147732798182871%
"Baiduspider-news+(+http://www.baidu.com/search/spider.htm)" 0.009003173446823718%
"BlackBerry8820/2.7.0.105-4.5.0.174 Profile/MIDP-2.0 1.718162871531244E-4%
"BlackBerry8900/5.2.0.96 Profile/MIDP-2.0 6.872651486124975E-5%
"BlackBerry9000/5.2.0.89 Profile/MIDP-2.0 3.4363257430624875E-5%
"BlackBerry9700/5.0.0.862 Profile/MIDP-2.1 3.4363257430624875E-5%
"CoolPadF800/CMCC WindowsCEOS/6.0/(2009.10.30)10.01.F800/WAP2.0 1.374530297224995E-4%
"Dalvik/1.2.0 (Linux; 5.49812118889998E-4%
"Dalvik/1.4.0 (Linux; 3.4363257430624875E-5%
"DoCoMo/2.0 N905i(c100;TB;W24H16) 3.779958317368737E-4%
"DoCoMo/2.0 P900i(c100;TB;W24H11) 1.374530297224995E-4%
"Domnutch-Bot/Nutch-1.0 (Domnutch; 6.872651486124975E-5%
"Doubanbot/1.0 ([email protected] 9.278079506268717E-4%
"E63/SymbianOS/9.1 Series60/3.0" 2.4054280201437413E-4%
"E66/SymbianOS/9.1 Series60/3.0" 4.123590891674985E-4%
"E71/SymbianOS/9.1 Series60/3.0" 6.872651486124975E-5%
"FTRF: Friendly 6.185386337512477E-4%
"Feed43 Proxy/1.0 3.779958317368737E-4%
"FeedDemon/3.1 (http://www.feeddemon.com/; 6.872651486124975E-5%
"FeedDemon/4.0 (http://www.feeddemon.com/; 0.00402050111938311%
"FeedFetcher-Google-CoOp; (+http://www.google.com/coop/cse/cref)" 7.559916634737474E-4%
"Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)" 6.872651486124976E-4%
"Feedfetcher-Google; (+http://www.google.com/feedfetcher.html; 0.3223273546992614%
"Feedreader 3.14 0.0035394155153543627%
"GIONEE-L011/SW1.0.0/WAP2.0/MIDP2.1 Configuration/CLDC-1.1" 6.872651486124975E-5%
"GoodReaderIPad/3.12.0 CFNetwork/548.0.4 3.4363257430624875E-5%
"GoogleProducer" 0.0015119833269474948%
"Googlebot-Image/1.0" 1.0308977229187463E-4%
"Googlebot/2.1 (+http://www.google.com/bot.html)" 0.0017181628715312442%
"Googlebot/2.1 (+http://www.googlebot.com/bot.html)" 2.0617954458374926E-4%
"Googlebot/2.1 (http://www.googlebot.com/bot.html)" 1.0308977229187463E-4%
"GreatNews/1.0" 0.012576952219608707%
"HD_T8282 Mozilla/4.0 3.4363257430624875E-5%
"HTCT9188_TD/1.0 WindowsMobile/6.5 1.0308977229187463E-4%
"HTC_Touch_Diamond2_T5353 Mozilla/4.0 1.0308977229187463E-4%
"HTC_Touch_Pro_T7272 Mozilla/4.0 3.4363257430624875E-5%
"HTMLParser/1.6" 6.872651486124975E-5%
"HTTP Fetcher/HTTP/1.0" 3.4363257430624875E-5%
"HTTP_Request2/2.0.0 (http://pear.php.net/package/http_request2) 2.74906059444999E-4%
"Holleycomm-H8800/2.0 WAP2.0 3.436325743062488E-4%
"HuaweiSymantecSpider/[email protected]+(compatible; MSIE 0.04123590891674985%
"HuaweiT5211_TD/1.0 RTKE_OS/01.00 1.718162871531244E-4%
"HuaweiT8100_TD/1.0 Android/2.2 1.374530297224995E-4%
"HuaweiU7520/B000 Browser/NetFront/4.1 3.4363257430624875E-5%
"Huaweisymantecspider (compatible; 0.027490605944499907%
"IUC(U;iOS 3.1.3;Zh-cn;320*480;)" 3.4363257430624875E-5%
"IUC(U;iOS 4.1;Zh-cn;320*480;)/UCWEB8.1.0.104/41/997" 4.123590891674985E-4%
...