用戶行爲分析
是指在獲得網站訪問量基本數據的情況下,對有關數據進行統計、分析,從中發現用戶訪問網站的規律,並將這些規律與網絡營銷策略等相結合,從而發現目前網絡營銷活動中可能存在的問題,並爲進一步修正或重新制定網絡營銷策略提供依據。這是狹義的只指網絡上的用戶行爲分析。
要分析的KPI:瀏覽量PV、註冊用戶數、IP數、跳出用戶數、訪問頁面詳細統計
運行流程
數據源 → 數據採集(存到HDFS) → 數據清洗(MapReduce並行計算,封裝成jar包) → 數據統計分析(Hive) → 數據轉移(Sqoop) → 可視化展示
- 日誌數據上傳到HDFS(在此清洗的是當天的日誌文件)
- 清洗數據
- 建立分區表
- 關聯數據
- 統計分析,並通過sqoop轉化數據
- 通過Echarts做可視化展示
開啓Tomcat
建立分區表以日期作爲分區的指標(將清洗後的數據存入到hive) 方便後續查看
添加時間分區
命令及SQL語句
# Create the target directory on HDFS for the raw bbs logs
hdfs dfs -mkdir -p /logs/bbs
# Upload the day's raw access log to HDFS
hdfs dfs -put /home/hpe/access_2017_05_31.log /logs/bbs/
# Run the MapReduce log-cleaning job: raw log in, cleaned output under a per-day directory
hadoop jar /home/LogCleanJob.jar /logs/bbs/access_2017_05_31.log /logs/bbs/cleaned/2017_05_31
-- Create an external table over the cleaned logs, partitioned by log date
-- (one partition per day) so each day's data can be added and queried separately.
-- External: dropping the table will not delete the cleaned files on HDFS.
create external table if not exists bbslogs(
    ip string,
    atime string,
    url string)
partitioned by (logdate string)
row format delimited
fields terminated by '\t' location '/logs/bbs/cleaned';

-- Register the 2017_05_31 partition, pointing at that day's cleaned output
alter table bbslogs
add partition(logdate='2017_05_31') location '/logs/bbs/cleaned/2017_05_31';

-- Sanity check: fetch the rows for one known timestamp
select * from bbslogs where atime='20170531011507';
-- Key KPI statistics via HQL (one scratch table per metric for 2017_05_31)

-- (1) Daily page views (PV)
create table bbslogs_pv_2017_05_31 as
select count(1) as pv
from bbslogs
where logdate='2017_05_31';

-- (2) Daily registered users: hits on the forum registration page
create table bbslogs_register_2017_05_31 as
select count(1) as reguser
from bbslogs
where logdate='2017_05_31' and instr(url,'member.php?mod=register')>0;

-- (3) Daily distinct IP count
create table bbslogs_ip_2017_05_31 as
select count(distinct ip) as ip
from bbslogs
where logdate='2017_05_31';

-- (4) Daily bounce count: IPs that made exactly one request that day
create table bbslogs_jumper_2017_05_31 as
select count(1) as jumper
from (select count(ip) as times
      from bbslogs
      where logdate='2017_05_31'
      group by ip
      having times=1) e;

-- Summary table: all daily KPIs collected into a single row.
-- Each source table holds exactly one row, so the ON 1=1 cross joins
-- produce exactly one output row.
create table bbslogs_2017_05_31 as
select '2017_05_31', a.pv, b.reguser, c.ip, d.jumper
from bbslogs_pv_2017_05_31 a
join bbslogs_register_2017_05_31 b on 1=1
join bbslogs_ip_2017_05_31 c on 1=1
join bbslogs_jumper_2017_05_31 d on 1=1;

-- (5) Per-page hit counts for the day
-- `count` is a reserved word in Hive, so the alias is backquoted
create table bbslogs_detail_2017_05_31 as
select logdate, url, count(url) as `count`
from bbslogs
where logdate='2017_05_31'
group by url, logdate;

-- (6) PV per hour of the day.
-- Hour is taken from characters 9-10 of atime; assumes atime is
-- formatted yyyyMMddHHmmss by the upstream cleaner -- confirm there.
create table bbslogs_day_pv_2017_05_31 as
select v.logdate, v.hour, count(*)
from (select logdate, substr(atime,9,2) as hour
      from bbslogs
      where logdate='2017_05_31') v
group by hour, logdate;
# Export the daily KPI summary table from the Hive warehouse to MySQL (table bbs_logs_stat)
# NOTE(review): credentials are passed in plain text on the command line -- consider --password-file
sqoop export --connect jdbc:mysql://192.168.228.100:3306/bbslogs --username root --password root --table bbs_logs_stat --fields-terminated-by '\001' --export-dir "/user/hive/warehouse/bbslogs_2017_05_31"
# Export the per-hour PV table to MySQL (table bbs_days_pv)
sqoop export --connect jdbc:mysql://192.168.228.100:3306/bbslogs --username root --password root --table bbs_days_pv --fields-terminated-by '\001' --export-dir "/user/hive/warehouse/bbslogs_day_pv_2017_05_31"
# Export the per-page PV detail table to MySQL (table bbs_pv_detail)
sqoop export --connect jdbc:mysql://192.168.228.100:3306/bbslogs --username root --password root --table bbs_pv_detail --fields-terminated-by '\001' --export-dir "/user/hive/warehouse/bbslogs_detail_2017_05_31"
設計腳本文件自動執行系列操作
#!/bin/sh
# Daily log-analysis pipeline: upload yesterday's Tomcat access log to HDFS,
# clean it with MapReduce, register a Hive partition, compute the daily KPIs,
# and fold them into summary/detail tables. Intended to run once per day (cron).
yesterday=$(date --date='1 days ago' +%Y_%m_%d)    # e.g. 2017_05_31 -- used in table/partition names
logdirdate=$(date --date='1 days ago' +%Y-%m-%d)   # e.g. 2017-05-31 -- matches the Tomcat log file name
echo "${yesterday}"
cd /home/hadoop-2.7.5/bin || exit 1
# Upload yesterday's access log to HDFS
./hdfs dfs -put "/home/apache-tomcat-8.0.53/logs/localhost_access_log.${logdirdate}.txt" /logs/bbs/
# Run the MapReduce cleaning job over the uploaded log
./hadoop jar /root/LogCleanJob.jar "/logs/bbs/localhost_access_log.${logdirdate}.txt" "/logs/bbs/cleaned/${yesterday}"
cd /home/hive-2.3/bin || exit 1
# Register the new partition pointing at yesterday's cleaned data
./hive -e "ALTER TABLE bbslogs ADD PARTITION(logdate='${yesterday}') LOCATION '/logs/bbs/cleaned/${yesterday}';"
# 1) Daily page views (PV)
./hive -e "CREATE TABLE bbslogs_pv_${yesterday} AS SELECT COUNT(1) AS PV FROM bbslogs WHERE logdate='${yesterday}';"
# 2) Daily registered users
# NOTE(review): matches any URL containing 'register', whereas the interactive
# version matches 'member.php?mod=register' -- confirm which filter is intended.
./hive -e "CREATE TABLE bbslogs_register_${yesterday} AS SELECT COUNT(1) AS REGUSER FROM bbslogs WHERE logdate='${yesterday}' AND INSTR(url,'register')>0;"
# 3) Daily distinct IP count
./hive -e "CREATE TABLE bbslogs_ip_${yesterday} AS SELECT COUNT(DISTINCT ip) AS IP FROM bbslogs WHERE logdate='${yesterday}';"
# 4) Daily bounce count (IPs that made exactly one request)
./hive -e "CREATE TABLE bbslogs_jumper_${yesterday} AS SELECT COUNT(1) AS jumper FROM (SELECT COUNT(ip) AS times FROM bbslogs WHERE logdate='${yesterday}' GROUP BY ip HAVING times=1) e;"
# Summary table: each scratch table holds one row, so ON 1=1 yields one row
./hive -e "CREATE TABLE bbslogs_${yesterday} AS SELECT '${yesterday}', a.pv, b.reguser, c.ip, d.jumper FROM bbslogs_pv_${yesterday} a JOIN bbslogs_register_${yesterday} b ON 1=1 JOIN bbslogs_ip_${yesterday} c ON 1=1 JOIN bbslogs_jumper_${yesterday} d ON 1=1;"
# Drop the per-KPI scratch tables now that they are folded into the summary
./hive -e "drop table bbslogs_pv_${yesterday};"
./hive -e "drop table bbslogs_register_${yesterday};"
./hive -e "drop table bbslogs_ip_${yesterday};"
./hive -e "drop table bbslogs_jumper_${yesterday}; "
# 5) Per-page hit counts for the day
./hive -e "CREATE TABLE bbslogs_detail_${yesterday} AS SELECT logdate,url,COUNT(url) AS count FROM bbslogs WHERE logdate='${yesterday}' GROUP BY url,logdate; "
# 6) PV per hour of the day (hour = chars 9-10 of atime)
./hive -e "create table bbslogs_day_pv_${yesterday} AS select v.logdate,v.hour,count(*) from (select logdate,substr(atime,9,2) as hour from bbslogs where logdate='${yesterday}') v group by hour,logdate;"
可視化展示