hadoop編程實戰——日誌分析

上傳日誌文件到hadoop的dfs當中去

一、根據上述日誌文件,計算該天的獨立ip數,pv數(注意要篩選日誌,並非每條記錄都要統計),被傳輸頁面的總字節數

1、將日誌信息分爲8個字段,創建指標對象KPI

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

/*
 * KPI Object
 */
public class KPI {
    private String remote_addr;// 記錄客戶端的ip地址
    private String remote_user;// 記錄客戶端用戶名稱,忽略屬性"-"
    private String time_local;// 記錄訪問時間與時區
    private String request;// 記錄請求的url與http協議
    private String status;// 記錄請求狀態;成功是200
    private String body_bytes_sent;// 記錄發送給客戶端文件主體內容大小
    private String http_referer;// 用來記錄從那個頁面鏈接訪問過來的
    private String http_user_agent;// 記錄客戶瀏覽器的相關信息
    private boolean valid = true;// 判斷數據是否合法

    private static KPI parser(String line) {
        KPI kpi = new KPI();
        String[] arr = line.split(" ");
        if (arr.length > 11) {
            kpi.setRemote_addr(arr[0]);
            kpi.setRemote_user(arr[1]);
            kpi.setTime_local(arr[3].substring(1));
            kpi.setRequest(arr[6]);
            kpi.setStatus(arr[8]);
            kpi.setBody_bytes_sent(arr[9]);
            kpi.setHttp_referer(arr[10]);
            if (arr.length > 12) {
                kpi.setHttp_user_agent(arr[11] + " " + arr[12]);
            } else {
                kpi.setHttp_user_agent(arr[11]);
            }
            try{
                // 存在status沒有的情況,直接pass
                if (Integer.parseInt(kpi.getStatus()) >= 400) {// 大於400,HTTP錯誤
                    kpi.setValid(false);
                }
            }catch(Exception e){
                System.out.println(line);
                kpi.setValid(false);
            }

        } else {
            kpi.setValid(false);
        }
        return kpi;
    }

    /**
     * 按page的pv分類
     */
    public static KPI filterPVs(String line) {
        /*KPI kpi = parser(line);
        Set<String> pages = new HashSet<String>();
        pages.add("/forum-46-1.html");
        pages.add("/forum-58-1.html");
        pages.add("/forum-61-1.html");
        if (!pages.contains(kpi.getRequest())) {
            kpi.setValid(false);
        }
        return kpi;*/
        return parser(line);
    }

    /**
     * 按page的獨立ip分類
     */
    public static KPI filterIPs(String line) {
        /*KPI kpi = parser(line);
        Set<String> pages = new HashSet<String>();
        pages.add("/forum-46-1.html");
        pages.add("/forum-58-1.html");
        pages.add("/forum-61-1.html");
        if (!pages.contains(kpi.getRequest())) {
            kpi.setValid(false);
        }
        return kpi;*/
        return parser(line);
    }

    /**
     * PV按瀏覽器分類
     */
    public static KPI filterBroswer(String line) {
        return parser(line);
    }

    /**
     * PV按小時分類
     */
    public static KPI filterTime(String line) {
        return parser(line);
    }

    /**
     * PV按訪問域名分類
     */
    public static KPI filterDomain(String line) {
        return parser(line);
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("valid:" + this.valid);
        sb.append("\nremote_addr:" + this.remote_addr);
        sb.append("\nremote_user:" + this.remote_user);
        sb.append("\ntime_local:" + this.time_local);
        sb.append("\nrequest:" + this.request);
        sb.append("\nstatus:" + this.status);
        sb.append("\nbody_bytes_sent:" + this.body_bytes_sent);
        sb.append("\nhttp_referer:" + this.http_referer);
        sb.append("\nhttp_user_agent:" + this.http_user_agent);
        return sb.toString();
    }

    public String getRemote_addr() {
        return remote_addr;
    }

    public void setRemote_addr(String remote_addr) {
        this.remote_addr = remote_addr;
    }

    public String getRemote_user() {
        return remote_user;
    }

    public void setRemote_user(String remote_user) {
        this.remote_user = remote_user;
    }

    public String getTime_local() {
        return time_local;
    }

    public Date getTime_local_Date() throws ParseException {
        SimpleDateFormat df = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss",
                Locale.US);
        return df.parse(this.time_local);
    }

    public String getTime_local_Date_hour() throws ParseException {
        SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHH");
        return df.format(this.getTime_local_Date());
    }

    public void setTime_local(String time_local) {
        this.time_local = time_local;
    }

    public String getRequest() {
        return request;
    }

    public void setRequest(String request) {
        this.request = request;
    }

    public String getStatus() {
        return status;
    }

    public void setStatus(String status) {
        this.status = status;
    }

    public String getBody_bytes_sent() {
        return body_bytes_sent;
    }

    public void setBody_bytes_sent(String body_bytes_sent) {
        this.body_bytes_sent = body_bytes_sent;
    }

    public String getHttp_referer() {
        return http_referer;
    }

    public String getHttp_referer_domain() {
        if (http_referer.length() < 8) {
            return http_referer;
        }
        String str = this.http_referer.replace("\"", "").replace("http://", "")
                .replace("https://", "");
        return str.indexOf("/") > 0 ? str.substring(0, str.indexOf("/")) : str;
    }

    public void setHttp_referer(String http_referer) {
        this.http_referer = http_referer;
    }

    public String getHttp_user_agent() {
        return http_user_agent;
    }

    public void setHttp_user_agent(String http_user_agent) {
        this.http_user_agent = http_user_agent;
    }

    public boolean isValid() {
        return valid;
    }

    public void setValid(boolean valid) {
        this.valid = valid;
    }

    public static void main(String args[]) {
        String line = "112.97.24.243 - - [31/Jan/2012:00:14:48 +0800] \"GET /data/cache/style_2_common.css?AZH HTTP/1.1\" 200 57752 \"http://f.dataguru.cn/forum-58-1.html\" \"Mozilla/5.0 (iPhone; CPU iPhone OS 5_0_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Mobile/9A406\"";
        System.out.println(line);
        KPI kpi = new KPI();
        String[] arr = line.split(" ");
        kpi.setRemote_addr(arr[0]);
        kpi.setRemote_user(arr[1]);
        kpi.setTime_local(arr[3].substring(1));
        kpi.setRequest(arr[6]);
        kpi.setStatus(arr[8]);
        kpi.setBody_bytes_sent(arr[9]);
        kpi.setHttp_referer(arr[10]);
        kpi.setHttp_user_agent(arr[11] + " " + arr[12]);
        System.out.println(kpi);
        try {
            SimpleDateFormat df = new SimpleDateFormat("yyyy.MM.dd:HH:mm:ss z",Locale.US);
            System.out.println(df.format(kpi.getTime_local_Date()));
            System.out.println(kpi.getTime_local_Date_hour());
            System.out.println(kpi.getHttp_referer_domain());
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }
}

2、計算獨立ip、pv和總字節數代碼


package week5;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class KPIIPVBYTE {

public static class ParseLogMapper extends MapReduceBase implements Mapper<Object, Text, Text, Text> {

  @Override
  public void map(Object key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
   KPI kpi = KPI.filterIPs(value.toString());
   if (kpi.isValid()) {

    output.collect(new Text("ip"), new Text(kpi.getRemote_addr()));

    output.collect(new Text("pv"), new Text("1"));
    output.collect(new Text("ps"), new Text("".equals(kpi.getBody_bytes_sent())?"0":kpi.getBody_bytes_sent()));
   }

  }
}

  public static class parseLogReducer extends MapReduceBase implements Reducer<Text, Text, Text, Text> {
    private  Set<String> count = new HashSet<String>();
       private int sumPv=0;
       private long sumPs=0;

         @Override
         public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
             String keys = key.toString();
             List<Text> listv = new ArrayList<Text>();



           if("ip".equals(keys.toLowerCase().trim())){
            while (values.hasNext()) {
                   count.add(values.next().toString());
               }

            output.collect(new Text("IP總數:"), new Text(String.valueOf(count.size())));
           }else if("pv".equals(keys.toLowerCase().trim())){
            while (values.hasNext()) {
             sumPv+= Long.parseLong(values.next().toString());
               }

            output.collect(new Text("PV數:"), new Text(String.valueOf(sumPv)));
           }else if("ps".equals(keys.toLowerCase().trim())){
           while (values.hasNext()) {
                sumPs +=Integer.parseInt(values.next().toString());
           }
           sumPs = sumPs/1024/1024;
           output.collect(new Text("總字節數:"), new Text(String.valueOf(sumPs)+"M"));
           }   
         }
    }

public static void main(String[] args) throws Exception{
   String input = "hdfs://hadoop1:9000/week5/in/access.20120104.log";
         String output = "hdfs://hadoop1:9000/week5/out/";
         JobConf conf = new JobConf(KPIIPVBYTE.class);
         conf.setJobName("IPPVPS");
         conf.setMapOutputKeyClass(Text.class);
         conf.setMapOutputValueClass(Text.class);

         conf.setOutputKeyClass(Text.class);
         conf.setOutputValueClass(Text.class);

         conf.setMapperClass(ParseLogMapper.class);

         conf.setReducerClass(parseLogReducer.class);
         conf.setInputFormat(TextInputFormat.class);
         conf.setOutputFormat(TextOutputFormat.class);
         FileInputFormat.setInputPaths(conf, new Path(input));
         FileOutputFormat.setOutputPath(conf, new Path(output));
         JobClient.runJob(conf);
         System.exit(0);
     }
}

結果:

IP總數:   34413
總字節數:   67627M
PV數:    2910085

二、統計來源網站,列出域名及帶來的獨立ip數

1、來源統計代碼

package week5;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


public class SourceCount {
        public static class MyMapper extends Mapper<Object, Text, Text, Text> {
                private Text state = new Text();
                private Text ip = new Text();

                public void map(Object key, Text value, Context context)
                                throws IOException, InterruptedException {

                    KPI kpi = KPI.filterIPs(value.toString());
                    if (kpi.isValid()) {
                            state.set(kpi.getHttp_referer());
                            ip.set(kpi.getRemote_addr());
                            context.write(state, ip);
                    }
                }
        }

        public static class SumReducer extends Reducer<Text, Text, Text, Text> {
                private Text result = new Text();

                public void reduce(Text key, Iterable<Text> values, Context context)throws IOException, InterruptedException {
                        Set<String> ips = new HashSet<String>();
                        for (Text val : values) {
                                ips.add(val.toString());
                        }
                        result.set(String.valueOf(ips.size()));
                        context.write(key, result);
                }
        }

        public static void main(String[] args) throws Exception {
                 Configuration conf = new Configuration();
                 Job job = new Job(conf);
                 job.setJarByClass(SourceCount.class);
                 job.setMapperClass(MyMapper.class);
                 job.setCombinerClass(SumReducer.class);
                 job.setReducerClass(SumReducer.class);
                 job.setOutputKeyClass(Text.class);
                 job.setOutputValueClass(Text.class);
                 FileInputFormat.addInputPath(job, new Path("hdfs://hadoop1:9000/week5/in/access.20120104.log"));
                 FileOutputFormat.setOutputPath(job, new  Path("hdfs://hadoop1:9000/week5/out3"));
                 System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
}

結果:

這裏寫代碼片
太多了,取部分爲證

三、統計用戶使用的瀏覽器種類,計算出各種瀏覽器佔的百分比

1、瀏覽器佔比代碼

package week5;

import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class KPIBroswer {
    private static long total = 0;

    public static class KPIBrowserMapper extends MapReduceBase implements
            Mapper<Object, Text, Text, IntWritable> {
        private Text browserInfo = new Text();
        private IntWritable one = new IntWritable(1);

        public void map(Object key, Text value,
                OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            KPI kpi = KPI.filterBroswer(value.toString());
            if (kpi.isValid()) {
                browserInfo.set(kpi.getHttp_user_agent());
                total++;
                output.collect(browserInfo, one);
            }
        }
    }

    public static class KPIBrowserReducer extends MapReduceBase implements
            Reducer<Text, IntWritable, Text, Text> {
        private Text result = new Text();

        public void reduce(Text key, Iterator<IntWritable> values,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            long sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            System.out.println("answer is over there");
            System.out.println(sum);
            System.out.println(total);
            System.out.println(String.valueOf(((double) sum / total * 100) + "%"));
            result.set(String.valueOf(((double) sum / total * 100) + "%"));

            output.collect(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        String input = "hdfs://hadoop1:9000/week5/in/access.20120104.log";
        String output = "hdfs://hadoop1:9000/week5/out4";
        JobConf conf = new JobConf(KPIBroswer.class);
        conf.setJobName("KPIBrowser");
        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(IntWritable.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);
        conf.setMapperClass(KPIBrowserMapper.class);
        // conf.setCombinerClass(KPIBrowserReducer.class);
        conf.setReducerClass(KPIBrowserReducer.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.setInputPaths(conf, new Path(input));
        FileOutputFormat.setOutputPath(conf, new Path(output));
        JobClient.runJob(conf);
        System.exit(0);
    }
}

2、結果:

    2.0617954458374926E-4%
""  2.0617954458374926E-4%
"(C)Nokia6700s/SymbianOS/9.1 Series60/3.0"  3.4363257430624875E-5%
"-" 1.5351785257131665%
"-" "-" 6.872651486124975E-5%
"-" "Mozilla/4.0"   1.0308977229187463E-4%
"Amoi-F90/Plat-F/WAP2.0/MIDP1.0/CLDC1.0 UP.Browser/6.2.2.6.f.1.100  6.872651486124975E-5%
"AmoiE70/6.1.08/WAP2.0 Profile/MIDP2.0  3.4363257430624875E-5%
"AndroidDownloadManager"    6.872651486124975E-5%
"Apache-HttpClient/4.1 (java    3.4363257430624875E-5%
"Apache-HttpClient/4.1.1 (java  3.4363257430624875E-5%
"AppEngine-Google; (+http://code.google.com/appengine;  2.0617954458374926E-4%
"Apple-PubSub/28"   0.001683799614100619%
"Apple-PubSub/65"   3.4363257430624875E-5%
"Apple-PubSub/65.20"    3.4363257430624875E-5%
"Apple-PubSub/65.23"    6.185386337512477E-4%
"Apple-PubSub/65.28 AppEngine-Google;   3.4363257430624875E-5%
"Apple-PubSub/65.28"    0.015807098418087445%
"BGSY bot/1.0"  0.0041235908916749855%
"BaiduMobile/1.3.1 CFNetwork/485.12.7   3.4363257430624875E-5%
"BaiduMobile/1.3.2 CFNetwork/548.0.4    3.4363257430624875E-5%
"BaiduMobile/1.3.4 CFNetwork/485.12.7   6.872651486124975E-5%
"BaiduMobile/1.3.4 CFNetwork/485.13.9   6.872651486124975E-5%
"BaiduMobile/1.3.5 CFNetwork/485.12.7   3.4363257430624875E-5%
"BaiduMobile/1.3.5 CFNetwork/485.13.9   2.4054280201437413E-4%
"BaiduMobile/1.3.5 CFNetwork/548.0.3    6.872651486124975E-5%
"BaiduMobile/1.3.5 CFNetwork/548.0.4    1.374530297224995E-4%
"Baiduspider"   1.374530297224995E-4%
"Baiduspider+(+http://www.baidu.com/search/spider.htm)" 0.01147732798182871%
"Baiduspider-news+(+http://www.baidu.com/search/spider.htm)"    0.009003173446823718%
"BlackBerry8820/2.7.0.105-4.5.0.174 Profile/MIDP-2.0    1.718162871531244E-4%
"BlackBerry8900/5.2.0.96 Profile/MIDP-2.0   6.872651486124975E-5%
"BlackBerry9000/5.2.0.89 Profile/MIDP-2.0   3.4363257430624875E-5%
"BlackBerry9700/5.0.0.862 Profile/MIDP-2.1  3.4363257430624875E-5%
"CoolPadF800/CMCC WindowsCEOS/6.0/(2009.10.30)10.01.F800/WAP2.0 1.374530297224995E-4%
"Dalvik/1.2.0 (Linux;   5.49812118889998E-4%
"Dalvik/1.4.0 (Linux;   3.4363257430624875E-5%
"DoCoMo/2.0 N905i(c100;TB;W24H16)   3.779958317368737E-4%
"DoCoMo/2.0 P900i(c100;TB;W24H11)   1.374530297224995E-4%
"Domnutch-Bot/Nutch-1.0 (Domnutch;  6.872651486124975E-5%
"Doubanbot/1.0 ([email protected]  9.278079506268717E-4%
"E63/SymbianOS/9.1 Series60/3.0"    2.4054280201437413E-4%
"E66/SymbianOS/9.1 Series60/3.0"    4.123590891674985E-4%
"E71/SymbianOS/9.1 Series60/3.0"    6.872651486124975E-5%
"FTRF: Friendly 6.185386337512477E-4%
"Feed43 Proxy/1.0   3.779958317368737E-4%
"FeedDemon/3.1 (http://www.feeddemon.com/;  6.872651486124975E-5%
"FeedDemon/4.0 (http://www.feeddemon.com/;  0.00402050111938311%
"FeedFetcher-Google-CoOp; (+http://www.google.com/coop/cse/cref)"   7.559916634737474E-4%
"Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)" 6.872651486124976E-4%
"Feedfetcher-Google; (+http://www.google.com/feedfetcher.html;  0.3223273546992614%
"Feedreader 3.14    0.0035394155153543627%
"GIONEE-L011/SW1.0.0/WAP2.0/MIDP2.1 Configuration/CLDC-1.1" 6.872651486124975E-5%
"GoodReaderIPad/3.12.0 CFNetwork/548.0.4    3.4363257430624875E-5%
"GoogleProducer"    0.0015119833269474948%
"Googlebot-Image/1.0"   1.0308977229187463E-4%
"Googlebot/2.1 (+http://www.google.com/bot.html)"   0.0017181628715312442%
"Googlebot/2.1 (+http://www.googlebot.com/bot.html)"    2.0617954458374926E-4%
"Googlebot/2.1 (http://www.googlebot.com/bot.html)" 1.0308977229187463E-4%
"GreatNews/1.0" 0.012576952219608707%
"HD_T8282 Mozilla/4.0   3.4363257430624875E-5%
"HTCT9188_TD/1.0 WindowsMobile/6.5  1.0308977229187463E-4%
"HTC_Touch_Diamond2_T5353 Mozilla/4.0   1.0308977229187463E-4%
"HTC_Touch_Pro_T7272 Mozilla/4.0    3.4363257430624875E-5%
"HTMLParser/1.6"    6.872651486124975E-5%
"HTTP Fetcher/HTTP/1.0" 3.4363257430624875E-5%
"HTTP_Request2/2.0.0 (http://pear.php.net/package/http_request2)    2.74906059444999E-4%
"Holleycomm-H8800/2.0 WAP2.0    3.436325743062488E-4%
"HuaweiSymantecSpider/[email protected]+(compatible; MSIE  0.04123590891674985%
"HuaweiT5211_TD/1.0 RTKE_OS/01.00   1.718162871531244E-4%
"HuaweiT8100_TD/1.0 Android/2.2 1.374530297224995E-4%
"HuaweiU7520/B000 Browser/NetFront/4.1  3.4363257430624875E-5%
"Huaweisymantecspider (compatible;  0.027490605944499907%
"IUC(U;iOS 3.1.3;Zh-cn;320*480;)"   3.4363257430624875E-5%
"IUC(U;iOS 4.1;Zh-cn;320*480;)/UCWEB8.1.0.104/41/997"   4.123590891674985E-4%
...
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章