HIVE實戰:簡單處理web日誌

測試文件下載

需求:簡單處理一個web日誌,該日誌共有11個字段,5794條,現要進行簡單處理

展示日誌中的一條數據:

"27.38.5.159" "-" "31/Aug/2015:00:04:37 +0800" "GET /course/view.php?id=27 HTTP/1.1" "303" "440" - "http://www.ibeifeng.com/user.php?act=mycourse" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36" "-" "learn.ibeifeng.com"
1、在hive中創建表並加載數據

//創建數據原表
-- Raw web-log landing table: one string column per log field (11 fields total).
-- RegexSerDe splits each input line with input.regex; capture group N feeds
-- column N, so the regex must expose exactly 11 capture groups.
create table if not exists default.web_log(
remote_addr string,
remote_user string,
time_local string,
request string,
status string,
body_bytes_sent string,
request_body string,
http_refer string,
http_user_agent string,
http_x_forwarded_for string,
host string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
-- NOTE(review): groups capture the surrounding double quotes too; they are
-- stripped later by the remove_quotes UDF rather than by the regex itself.
  "input.regex" = "(\"[^ ]*\") (\"-|[^ ]*\") (\"[^\]]*\") (\"[^\"]*\") (\"[0-9]*\") (\"[0-9]*\") (-|[^ ]*) (\"[^ ]*\") (\"[^\"]*\") (-|[^ ]*) (\"[^ ]*\")"
)
STORED AS TEXTFILE;
//加載數據
load data local inpath '/usr/datas/moodle.ibeifeng.access.log' into table default.bf_log;
2、創建一個新表,用來存儲 ip、時間、請求、父鏈接

-- Slim analysis table holding only the four fields the reporting queries use:
-- client IP, timestamp, request line, and referrer.
-- Fixed: dropped "row format delimited fields terminated by '\t'" — a text
-- field delimiter is meaningless for ORC, which uses its own binary SerDe.
create table if not exists default.web_log_common(
remote_addr string,
time_local string,
request string,
http_refer string
)
stored as orc tblproperties("orc.compress"="SNAPPY");
3、UDF

//去除引號
// Hive UDF: strip every double-quote character from a log field,
// e.g. "\"27.38.5.159\"" -> "27.38.5.159".
public class RemoveQuotes extends UDF {

    /**
     * @param field raw quoted log field; may be null
     * @return the field with all '"' characters removed; an empty Text
     *         (never null) when the input is null, so downstream
     *         expressions cannot NPE
     */
    public Text evaluate(Text field) {
        if (field == null) {
            return new Text();
        }
        // Text.toString() never returns null, so the original second
        // null check was dead code and has been removed.
        return new Text(field.toString().replaceAll("\"", ""));
    }
}
//規範時間
// Hive UDF: normalize the raw access-log timestamp,
// e.g. "31/Aug/2015:00:04:37" -> "2015-08-31 00:04:37".
public class FormatDate extends UDF {

    // SimpleDateFormat is not thread-safe; acceptable here because Hive
    // instantiates one UDF per task and calls evaluate() sequentially.
    private final SimpleDateFormat inputFormat = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
    // Fixed: the original pattern "yyyy-MMM-dd: HH:mm:ss" emitted a
    // locale-dependent month name plus a stray colon; the numeric
    // "yyyy-MM-dd HH:mm:ss" form is what the analysis queries expect.
    private final SimpleDateFormat outputFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    /**
     * @param input raw timestamp field; may be null or blank
     * @return reformatted timestamp; an empty Text when the input is
     *         null, blank, or unparseable
     */
    public Text evaluate(Text input) {
        Text output = new Text();
        // Fixed: the null check must run before input.toString() —
        // the original dereferenced input first and NPE'd on null rows.
        if (input == null) {
            return output;
        }
        String inputDate = input.toString().trim();
        if (inputDate.isEmpty()) {
            return output;
        }
        try {
            Date parsed = inputFormat.parse(inputDate);
            output.set(outputFormat.format(parsed));
        } catch (ParseException e) {
            // Best-effort: log and return an empty value rather than
            // failing the whole query on one malformed timestamp.
            e.printStackTrace();
        }
        return output;
    }
}
//獲取IP前兩段
// Hive UDF: keep only the first two octets of an IPv4 address,
// e.g. "27.38.5.159" -> "27.38", as a coarse region key.
public class IpAddress extends UDF {

    /**
     * @param ip dotted IPv4 address; may be null or empty
     * @return "first.second" octet prefix; the input unchanged when it has
     *         fewer than two dot-separated parts; empty Text for null/empty
     */
    public Text evaluate(Text ip) {
        if (ip == null) {
            return new Text();
        }
        String address = ip.toString();
        if (address.isEmpty()) {
            return new Text();
        }
        // Removed leftover System.out.println debug output — it flooded
        // task logs with one line per row.
        String[] octets = address.split("\\.");
        // Fixed: inputs without at least two dot-separated parts previously
        // threw ArrayIndexOutOfBoundsException and killed the query.
        if (octets.length < 2) {
            return new Text(address);
        }
        return new Text(octets[0] + "." + octets[1]);
    }
}
-- Register the packaged UDFs with the current Hive session.
-- Fixed: the "1、/2、/3、" list prefixes were blog formatting, not valid
-- Hive syntax — each statement must start with the command keyword.
add jar /usr/datas/myUDF.jar;
create temporary function remove_quotes as "com.bpf.hive.udf.RemoveQuotes";
create temporary function formatDate as "com.bpf.hive.udf.FormatDate";
create temporary function ipAddress as "com.bpf.hive.udf.IpAddress";
4、新表中插入數據

insert overwrite table default.web_log_common select remove_quotes(remote_addr), formatDate(remove_quotes(time_local)), remove_quotes(request),remove_quotes(http_refer) from default.web_log_src;
5、哪個時間段請求網頁的次數多

-- Which hour of the day gets the most page requests.
-- Fixed: substring(time_local, 13, 2) did not line up with the hour digits
-- of the formatted timestamp; regexp_extract pulls the HH component of the
-- HH:mm:ss block regardless of the exact date-prefix layout.
select t.hour, count(*) cnt from
(select regexp_extract(time_local, '(\\d{2}):\\d{2}:\\d{2}', 1) hour from web_log_common) t
group by t.hour order by cnt desc;
6、IP所在地域多

-- Rank the two-octet IP prefixes (rough geographic regions) by request count.
select prefix.ip_prex,
       count(*) cnt
from (
    select ipAddress(remote_addr) ip_prex
    from web_log_common
) prefix
group by prefix.ip_prex
order by cnt desc;








發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章