Big Data Practical Training 05 -- Hands-On Offline Website Log Analysis

User Behavior Analysis

User behavior analysis means taking a site's basic traffic data, running statistics and analysis on it, and uncovering the patterns in how users visit the site. Matching those patterns against the online marketing strategy exposes problems in the current marketing activities and provides a basis for revising the strategy or drawing up a new one. This is the narrow sense of the term, covering only online user behavior.

KPIs to analyze: page views (PV), number of registered users, number of unique IPs, number of bounce users, and detailed per-page visit statistics.

Workflow

Data source → data collection (store in HDFS) → data cleaning (a MapReduce job packaged as a jar) → statistical analysis (Hive) → data transfer (Sqoop) → visualization

  • Upload the log data to HDFS (here, the current day's log file)
  • Clean the data with the MapReduce job (see the record sketch after this list)
  • Create a partitioned Hive table
  • Associate the cleaned data with the table partitions
  • Run the statistics, then transfer the results with Sqoop
  • Visualize the results with ECharts
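For reference, the Hive table defined below expects each cleaned record to be three tab-separated fields: ip, atime (a yyyyMMddHHmmss timestamp), and url. A rough sketch of what the cleaning job is assumed to emit (the sample values are made up, and the exact raw format depends on the access-log configuration):

# Raw access-log line (illustrative only)
60.208.6.156 - - [31/May/2017:01:15:07 +0800] "GET /member.php?mod=register HTTP/1.1" 200 16549
# Cleaned record, fields separated by a tab character: ip \t atime \t url
60.208.6.156	20170531011507	/member.php?mod=register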

Start Tomcat

Create a partitioned table keyed by date (store the cleaned data in Hive) so the results are easy to query later.

Add a time partition

Commands and SQL Statements

# Create the target directory
hdfs dfs -mkdir -p /logs/bbs

# Upload the log file to HDFS
hdfs dfs -put /home/hpe/access_2017_05_31.log /logs/bbs/

# Run the MapReduce cleaning job
hadoop jar /home/LogCleanJob.jar /logs/bbs/access_2017_05_31.log /logs/bbs/cleaned/2017_05_31
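Before loading anything into Hive, it is worth confirming that the cleaning job actually produced output. Standard HDFS shell commands suffice (the part-file name below is the usual MapReduce default and may differ for this job):

# List the cleaned output directory
hdfs dfs -ls /logs/bbs/cleaned/2017_05_31
# Peek at the first few cleaned records
hdfs dfs -cat /logs/bbs/cleaned/2017_05_31/part-r-00000 | head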

# Create an external partitioned table keyed by date (exposes the cleaned data to Hive)
create external table bbslogs(
    ip string,
    atime string,
    url string)
    partitioned by (logdate string)
    row format delimited
    fields terminated by '\t' location '/logs/bbs/cleaned';

# Add a partition (for the 2017_05_31 logs)
alter table bbslogs 
add partition(logdate='2017_05_31') location '/logs/bbs/cleaned/2017_05_31';
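To verify that Hive registered the new partition, a standard Hive statement:

show partitions bbslogs;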

# Query records in the log table
select * from bbslogs where atime='20170531011507';

# Use HiveQL to compute the key metrics
# (1) Daily page views (PV)
create table bbslogs_pv_2017_05_31 as select count(1) as PV from bbslogs where logdate='2017_05_31';
# (2) Daily registered users (requests that hit the registration page)
create table bbslogs_register_2017_05_31 as select count(1) as REGUSER from
bbslogs where logdate='2017_05_31' and instr(url,'member.php?mod=register')>0;
# (3) Daily unique IPs
create table bbslogs_ip_2017_05_31 as select count(distinct ip) as IP from
bbslogs where logdate='2017_05_31';
# (4) Daily bounce users
create table bbslogs_jumper_2017_05_31 as select count(1) as jumper from
(select count(ip) as times from bbslogs where logdate='2017_05_31' group by ip having times=1) e;
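A bounce user here is an IP that issued exactly one request during the day. A throwaway preview query (not part of the pipeline) makes it easy to sanity-check the inner aggregation before building the table:

select ip, count(ip) as times from bbslogs where logdate='2017_05_31' group by ip having count(ip)=1 limit 10;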
# Summary table (gather all the key metrics into a single table)
create table bbslogs_2017_05_31 as
select '2017_05_31',a.pv,b.reguser,c.ip,d.jumper
from bbslogs_pv_2017_05_31 a
join bbslogs_register_2017_05_31 b on 1=1
join bbslogs_ip_2017_05_31 c on 1=1
join bbslogs_jumper_2017_05_31 d on 1=1;
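Each of the four metric tables holds exactly one row, so joining them on the always-true condition 1=1 is a cheap cross join that yields a single summary row, which can be confirmed before exporting:

select * from bbslogs_2017_05_31;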

# (5) Detailed per-page visit statistics
create table bbslogs_detail_2017_05_31 as select logdate,url,count(url) as count from
bbslogs where logdate='2017_05_31' group by url,logdate;
# (6) PV by hour of the day (substr(atime,9,2) extracts the hour from the yyyyMMddHHmmss timestamp)
create table bbslogs_day_pv_2017_05_31 as select v.logdate,v.hour,count(*)
from (select logdate,substr(atime,9,2) as hour from bbslogs where logdate='2017_05_31') v
group by hour,logdate;
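Sqoop export writes into an existing MySQL table; it does not create one. The target tables' DDL is not shown in this walkthrough, so before running the exports below, something like the following must be run in the bbslogs database (all column names and types here are assumptions; by default Sqoop maps columns by position, so only the order matters):

# Assumed MySQL schemas for the three export targets
create table bbs_logs_stat (logdate varchar(10), pv int, reguser int, ip int, jumper int);
create table bbs_days_pv (logdate varchar(10), pv_hour varchar(2), pv int);
create table bbs_pv_detail (logdate varchar(10), url varchar(1000), cnt int);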

# Export the summary table to MySQL with Sqoop
sqoop export --connect jdbc:mysql://192.168.228.100:3306/bbslogs --username root --password root --table bbs_logs_stat --fields-terminated-by '\001' --export-dir "/user/hive/warehouse/bbslogs_2017_05_31"

# Export the hourly PV table to bbs_days_pv
sqoop export --connect jdbc:mysql://192.168.228.100:3306/bbslogs --username root --password root --table bbs_days_pv --fields-terminated-by '\001' --export-dir "/user/hive/warehouse/bbslogs_day_pv_2017_05_31"

# Export the per-page PV detail to bbs_pv_detail
sqoop export --connect jdbc:mysql://192.168.228.100:3306/bbslogs --username root --password root --table bbs_pv_detail --fields-terminated-by '\001' --export-dir  "/user/hive/warehouse/bbslogs_detail_2017_05_31"
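Once the exports finish, a quick query on the MySQL side confirms that the rows arrived (assuming the mysql command-line client is available on the database host):

mysql -uroot -proot bbslogs -e "select * from bbs_logs_stat;"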

Designing a Script to Run the Whole Sequence Automatically

#!/bin/sh
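# Yesterday's date in two formats: Hive partition and table names use underscores
# (e.g. 2017_05_31), while Tomcat access-log file names use hyphens (e.g. 2017-05-31)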
yesterday=$(date --date='1 days ago' +%Y_%m_%d)
logdirdate=$(date --date='1 days ago' +%Y-%m-%d)
echo ${yesterday}
cd /home/hadoop-2.7.5/bin
# Upload the day's Tomcat access log to HDFS
./hdfs dfs -put /home/apache-tomcat-8.0.53/logs/localhost_access_log.${logdirdate}.txt /logs/bbs/

# Run the MapReduce cleaning job
./hadoop jar /root/LogCleanJob.jar /logs/bbs/localhost_access_log.${logdirdate}.txt /logs/bbs/cleaned/${yesterday}

cd /home/hive-2.3/bin
# Add the day's partition
./hive -e "ALTER TABLE bbslogs ADD PARTITION(logdate='${yesterday}') LOCATION '/logs/bbs/cleaned/${yesterday}';"

# 1. Daily PV
./hive -e "CREATE TABLE bbslogs_pv_${yesterday} AS SELECT COUNT(1) AS PV FROM bbslogs WHERE logdate='${yesterday}';"
# 2. Daily registered users
./hive -e "CREATE TABLE bbslogs_register_${yesterday} AS SELECT COUNT(1) AS REGUSER FROM bbslogs WHERE logdate='${yesterday}' AND INSTR(url,'register')>0;"

# 3. Daily unique IPs
./hive -e "CREATE TABLE bbslogs_ip_${yesterday} AS SELECT COUNT(DISTINCT ip) AS IP FROM bbslogs WHERE logdate='${yesterday}';"
# 4. Daily bounce users
./hive -e "CREATE TABLE bbslogs_jumper_${yesterday} AS SELECT COUNT(1) AS jumper FROM (SELECT COUNT(ip) AS times FROM bbslogs WHERE logdate='${yesterday}' GROUP BY ip HAVING times=1) e;"

# Summary table
./hive -e "CREATE TABLE bbslogs_${yesterday} AS SELECT '${yesterday}', a.pv, b.reguser, c.ip, d.jumper FROM bbslogs_pv_${yesterday} a JOIN bbslogs_register_${yesterday} b ON 1=1 JOIN bbslogs_ip_${yesterday} c ON 1=1 JOIN bbslogs_jumper_${yesterday} d ON 1=1;"
# Drop the intermediate metric tables now that the summary exists
./hive -e "drop table bbslogs_pv_${yesterday};"
./hive -e "drop table bbslogs_register_${yesterday};"
./hive -e "drop table bbslogs_ip_${yesterday};"
./hive -e "drop table bbslogs_jumper_${yesterday};"

# 5. Detailed per-page visit statistics
./hive -e "CREATE TABLE bbslogs_detail_${yesterday} AS SELECT logdate,url,COUNT(url) AS count FROM bbslogs WHERE logdate='${yesterday}' GROUP BY url,logdate;"
# 6. PV by hour of the day
./hive -e "CREATE TABLE bbslogs_day_pv_${yesterday} AS SELECT v.logdate,v.hour,COUNT(*) FROM (SELECT logdate,substr(atime,9,2) AS hour FROM bbslogs WHERE logdate='${yesterday}') v GROUP BY hour,logdate;"
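To run the pipeline automatically every day, the script can be scheduled with cron. A minimal sketch, assuming the script above is saved as /root/bbslogs_daily.sh and made executable (the path and time of day are placeholders):

# crontab -e, then add a line such as:
30 1 * * * /root/bbslogs_daily.sh >> /var/log/bbslogs_daily.log 2>&1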

Visualization

The summary, hourly, and per-page tables exported to MySQL above serve as the data source for the ECharts charts.
