掌握基於Hive 或 Spark SQL的數據分析
某零售企業根據最近1年門店收集的數據進行數據分析
潛在客戶畫像
用戶消費統計
門店的資源利用率
消費的特徵人羣定位
數據的可視化展現
客戶細節參數:
語言數據不正確
交通參數:
根據transaction_id,該表具有重複行。
然而,這些都是有效的數據,所以我們需要修復它們
商店參數:
商店回顧參數:
該表有爲null的無效分數
該表將transaction_id映射到錯誤的store_id
表之間的關係:
將數據傳入HDFS中:
hdfs dfs -rm -r -f -skipTrash /tmp/shopping/
hdfs dfs -mkdir -p /tmp/shopping/data/customer
hdfs dfs -put customer_details.csv /tmp/shopping/data/customer/
hdfs dfs -mkdir -p /tmp/shopping/data/transaction
hdfs dfs -put transaction_details.csv /tmp/shopping/data/transaction/
hdfs dfs -mkdir -p /tmp/shopping/data/store
hdfs dfs -put store_details.csv /tmp/shopping/data/store/
hdfs dfs -mkdir -p /tmp/shopping/data/review
hdfs dfs -put store_review.csv /tmp/shopping/data/review/
hdfs dfs -ls -R /tmp/shopping/data/
在hive中創建database:
drop database if exists shopping cascade
create database if not exists shopping
創建數據表table
客戶細節表:
create external table if not exists ext_customer_details (
customer_id string,
first_name string,
last_name string,
email string,
gender string,
address string,
country string,
language string,
job string,
credit_type string,
credit_no string
)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
location '/tmp/shopping/data/customer'
tblproperties ("skip.header.line.count"="1")
交通表:
create external table if not exists ext_transaction_details (
transaction_id string,
customer_id string,
store_id string,
price decimal(8,2),
product string,
purchase_date string,
purchase_time string
)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
location '/tmp/shopping/data/transaction' --this must tblproperties
tblproperties ("skip.header.line.count"="1")
商店表:
create external table if not exists ext_store_details (
store_id string,
store_name string,
employee_number int
)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
location '/tmp/shopping/data/store' --this must tblproperties
tblproperties ("skip.header.line.count"="1")
商店回顧表:
create external table if not exists ext_store_review (
transaction_id string,
store_id string,
review_score int
)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
location '/tmp/shopping/data/review' --this must tblproperties
tblproperties ("skip.header.line.count"="1")
數據清洗:
解決以下有問題的數據:
transaction_details中的重複數據,對重複數據生成新ID
過濾掉store_review中沒有評分的數據
可以把清洗好的數據放到另一個表或者用View表示
找出個人信息確認:PII(personal information identification)或個人機密信息:PCI(personal confidential information)數據進行加密或hash
重新組織transaction數據按照日期YYYY-MM做分區
個人信息進行加密:
create view if not exists vw_customer_details as
select
customer_id,
first_name,
unbase64(last_name) as last_name,
unbase64(email) as email,
gender,
unbase64(address) as address,
country,
job,
credit_type,
unbase64(concat(unbase64(credit_no), 'seed')) as credit_no --better way than hash since it can be decoded base64(regexp_replace(base64(credit_no), unbase64('seed'), ''))
from ext_customer_details
ext_transaction_details中有重複的transaction_id:
先建立一個transaction_details表:
create table if not exists transaction_details (
transaction_id string,
customer_id string,
store_id string,
price decimal(8,2),
product string,
purchase_time string,
purchase_date date
)
partitioned by (purchase_month string)
select transaction_id, count(*) from ext_transaction_details group by transaction_id having count(*) > 1(標準的sql查重)
設置臨時結果集base:查找出相同的transaction_id,相同的transaction_id會產生不同的行number
with base as (
select row_number() over(partition by transaction_id order by 1) as rn,*
from ext_transaction_details
)
select count(*) from base where rn > 1
交通細節表:生成臨時子集base,使用transaction_id分組
with base as (
select
transaction_id,
customer_id,
store_id,
price,
product,
purchase_time,
purchase_date,
--對時間格式進行轉換
from_unixtime(unix_timestamp(purchase_date ,'yyyy-MM-dd'), 'yyyy-MM') as purchase_month,
--以transaction_id爲基礎排列row-number
row_number() over(partition by transaction_id order by store_id) as rn
from ext_transaction_details
where customer_id<>'customer_id'
)
from base
insert overwrite table transaction_details partition(purchase_month)
select
if(rn = 1, transaction_id, concat(transaction_id, '_fix', rn)) as transaction_id,
--rn = 1的時候輸出transaction_id,當rn >= 2的時候通過concat(transaction_id, '_fix', rn)連接字符串
customer_id,
store_id,
price,
product,
purchase_time,
purchase_date,
purchase_month
商戶回顧表:創建視圖,取出不爲空的部分
--select count(*) from ext_store_review r join transaction_details t on r.transaction_id = t.transaction_id and r.store_id = t.store_id where review_score <> '' --153 row
--select count(*) from ext_store_review where review_score <> '' --937 rows
因爲store_id有很多行是錯誤的,所以在這裏我們將忽略store_id
create view if not exists vw_store_review as
select transaction_id, review_score
from ext_store_review
where review_score <> ''
分析數據:
以客戶爲中心:
1. 找出顧客最常用的信用卡:
select credit_type, count(distinct credit_no) as credit_cnt
from vw_customer_details group by country, credit_type order by credit_cnt desc
2. 找出客戶數據中的前5個職位名稱:
select job, count(*) as pn from vw_customer_details group by job order by pn desc limit 5
3. 對於美國女性來說,她們手中最受歡迎的卡片是什麼
select credit_type, count(distinct credit_no) as ct
from vw_customer_details
where country = 'United States' and gender = 'Female'
group by credit_type order by ct desc
4. 按性別和國家計算客戶統計
select count(*), country, gender
from vw_customer_details
group by country, gender
以交易爲中心:
1. 計算每月總收入
select sum(price) as revenue_mom, purchase_month from transaction_details
group by purchase_month order by purchase_month
2. 按季度計算總收入
with base as (
select price,
concat_ws('-', substr(purchase_date, 1, 4), cast(ceil(month(purchase_date)/3.0) as string)) as year_quarter
from transaction_details
)
select
sum(price) as revenue_qoq, year_quarter --quarter does not support until hive 1.3
from base group by year_quarter
3. 按年計算總收入
select sum(price) as revenue_mom, substr(purchase_date, 1, 4) as year
from transaction_details
group by substr(purchase_date, 1, 4)
4. 按工作日星期計算總收入
select sum(price) as revenue_wow, date_format(purchase_date, 'u') as weekday
from transaction_details
group by date_format(purchase_date, 'u') --1 is Mon and 7 is Sun
可以合併在一起:
with base as (
select price, date_format(purchase_date, 'u') as weekday, purchase_month,
concat_ws('-', substr(purchase_date, 1, 4), cast(ceil(month(purchase_date)/3.0) as string)) as year_quarter, substr(purchase_date, 1, 4) as year
from transaction_details
)
select sum(price) as total, weekday, purchase_month, year_quarter, year from base group by weekday, purchase_month, year_quarter, year
5. (需要清理數據)(按時間桶(上午、中午等)計算總收入/平均購買量)
--定義時間分桶
--early morning: (5:00, 8:00]
--morning: (8:00, 11:00]
--noon: (11:00, 13:00]
--afternoon: (13:00, 18:00]
--evening: (18:00, 22:00]
--night: (22:00, 5:00]
--因爲它不是線性遞增的,所以把它當作else
--我們也安排時間。第一個格式時間到19:23喜歡,然後比較,然後轉換分鐘到小時
with base as (
select price, purchase_time, if(purchase_time like '%M',
--12小時制轉換成24小時制
from_unixtime(unix_timestamp(purchase_time,'hh:mm aa'),'HH:mm'), purchase_time) as time_format
from transaction_details ),
timeformat as (
select
--整合時間變成13.5這種形式
purchase_time, price, (cast(split(time_format, ':')[0] as decimal(4,2)) + cast(split(time_format, ':')[1] as decimal(4,2))/60) as purchase_time_in_hrs
from base ),
timebucket as (
select
price, purchase_time, purchase_time_in_hrs,
if(purchase_time_in_hrs > 5 and purchase_time_in_hrs <=8, 'early morning',
if(purchase_time_in_hrs > 8 and purchase_time_in_hrs <=11, 'morning',
if(purchase_time_in_hrs > 11 and purchase_time_in_hrs <=13, 'noon',
if(purchase_time_in_hrs > 13 and purchase_time_in_hrs <=18, 'afternoon',
--其他是night
if(purchase_time_in_hrs > 18 and purchase_time_in_hrs <=22, 'evening', 'night'))))) as time_bucket
from timeformat
)
--select * from timebucket 這樣可以知道,時間格式太多了需要清洗
select time_bucket, avg(price) as avg_spend, sum(price)/1000 as revenue_k from timebucket group by time_bucket
-- divide 1k to see the chater more clear
6. 按工作日計算平均消費
select avg(price) as avg_price, date_format(purchase_date, 'u') as weekday
from transaction_details
where date_format(purchase_date, 'u') is not null
group by date_format(purchase_date, 'u')
7. 計算年、月、日交易總額
with base as (
select transaction_id, date_format(purchase_date, 'u') as weekday, purchase_month,
concat_ws('-', substr(purchase_date, 1, 4), cast(ceil(month(purchase_date)/3.0) as string)) as year_quarter, substr(purchase_date, 1, 4) as year
from transaction_details
where purchase_month is not null )
select count(distinct transaction_id) as total, weekday, purchase_month, year_quarter, year
from base group by weekday, purchase_month, year_quarter, year order by year, purchase_month
8. 找出交易量最大的10個客戶
with base as (
select customer_id, count(distinct transaction_id) as trans_cnt, sum(price) as spend_total
from transaction_details
where purchase_month is not null group by customer_id
),
cust_detail as (
select *, concat_ws(' ', first_name, last_name) as cust_name
--表連接因爲信息不全
From base td join vw_customer_details cd on td.customer_id = cd.customer_id
)
select trans_cnt, cust_name as top10_trans_cust
from cust_detail order by trans_cnt desc limit 10
9. 找出消費最多的十大客戶
with base as (
select
customer_id, count(distinct transaction_id) as trans_cnt, sum(price) as spend_total
from transaction_details
where purchase_month is not null group by customer_id
),
cust_detail as (
select *, concat_ws(' ', first_name, last_name) as cust_name
from base td join vw_customer_details cd on td.customer_id = cd.customer_id )
select spend_total, cust_name as top10_trans_cust
from cust_detail order by spend_total desc limit 10
10. 誰擁有該期間的最小交易數量
with base as (
select customer_id, count(distinct transaction_id) as trans_cnt
from transaction_details
where purchase_month is not null
group by customer_id )
select * from base order by trans_cnt limit 10
11. 按季度/年計算獨立客戶總數
with base as (
select transaction_id,
concat_ws('-', substr(purchase_date, 1, 4), cast(ceil(month(purchase_date)/3.0) as string)) as year_quarter, substr(purchase_date, 1, 4) as year
from transaction_details
where purchase_month is not null )
select count(distinct transaction_id) as total, year_quarter, year
from base
group by year_quarter, year order by year_quarter
12. 計算整個活動中客戶的平均最大值
with base as (
select customer_id, avg(price) as price_avg, max(price) as price_max
from transaction_details
where purchase_month is not null group by customer_id )
select max(price_avg)
from base
13. 每個月誰花的錢最多
with base as (
select customer_id, purchase_month, sum(price) as price_sum, count(transaction_id) as trans_cnt
from transaction_details
where purchase_month is not null group by purchase_month, customer_id ),
rank_sum as (
select
rank() over(partition by purchase_month order by price_sum desc) as rn_sum,
rank() over(partition by purchase_month order by trans_cnt desc) as rn_cnt,
purchase_month, price_sum, trans_cnt, customer_id
from base )
select purchase_month, 'spend' as measure_name, price_sum as measure_value, customer_id
from rank_sum where rn_sum = 1
14. 每個月誰是最頻繁的訪客
with base as (
select customer_id, purchase_month, sum(price) as price_sum, count(transaction_id) as trans_cnt
from transaction_details
where purchase_month is not null group by purchase_month, customer_id ),
rank_sum as (
select
rank() over(partition by purchase_month order by price_sum desc) as rn_sum,
rank() over(partition by purchase_month order by trans_cnt desc) as rn_cnt,
purchase_month, price_sum, trans_cnt, customer_id
from base )
select purchase_month, 'visit' as measure_name, trans_cnt as measure_value, customer_id
from rank_sum where rn_cnt = 1 order by measure_name, purchase_month
15. 找出最受歡迎的5種產品的總價格
select product, sum(price) as price_sum
from transaction_details
where purchase_month is not null group by product
order by price_sum desc limit 5
16. 根據購買頻率找出最受歡迎的5種產品
select product, count(transaction_id) as freq_buy
from transaction_details
where purchase_month is not null
group by product order by freq_buy desc limit 5
17. 根據客戶數量找出最受歡迎的5種產品
select product, count(customer_id) as freq_cust
from transaction_details
where purchase_month is not null
group by product order by freq_cust desc limit 5
以商店爲中心:
1. 找出最受歡迎的商店正在訪問的獨特的客戶
select sd.store_name, count(distinct customer_id) as unique_visit
from transaction_details td join ext_store_details sd on td.store_id = sd.store_id
group by store_name order by unique_visit desc limit 5
2. 根據顧客購買的數量找出最受歡迎的商店
select sd.store_name, sum(td.price) as total_revnue
from transaction_details td join ext_store_details sd on td.store_id =sd.store_id
group by store_name
order by total_revnue desc limit 5
3. 根據顧客交易情況找出最受歡迎的商店
select sd.store_name, count(transaction_id) as unique_purchase
from transaction_details td join ext_store_details sd on td.store_id = sd.store_id
group by store_name order by unique_purchase desc limit 5
4. 通過唯一的客戶id獲得商店最受歡迎的產品
with base as (
select store_id, product, count(distinct customer_id) as freq_cust
from transaction_details
where purchase_month is not null group by store_id, product ),
prod_rank as (
select store_id, product, freq_cust, rank()
over(partition by store_id order by freq_cust desc) as rn
from base )
select store_name, product, freq_cust
from prod_rank td join ext_store_details sd on td.store_id = sd.store_id
where td.rn = 1
5. 獲得每個商店的員工與客戶訪問率
with base as (
select store_id, count(distinct customer_id, purchase_date) as cust_visit
from transaction_details
where purchase_month is not null
group by store_id
)
Select store_name, cust_visit, employee_number,
round(cust_visit/employee_number,2) as cust_per_employee_within_period
from base td join ext_store_details sd on td.store_id = sd.store_id
6. 按年、月計算各門店收入
select store_name, purchase_month, sum(price) as revenue
from transaction_details td join ext_store_details sd on td.store_id = sd.store_id
where purchase_month is not null
group by store_name, purchase_month
7. 根據商店的總收入製作餅圖
select store_name, sum(price) as revenue
from transaction_details td join ext_store_details sd on td.store_id = sd.store_id
where purchase_month is not null
group by store_name
8. 找出每個商店最繁忙的時間段
--define time bucket
--early morning: (5:00, 8:00]
--morning: (8:00, 11:00]
--noon: (11:00, 13:00]
--afternoon: (13:00, 18:00]
--evening: (18:00, 22:00]
--night: (22:00, 5:00] --make it as else, since it is not liner increasing
--We also format the time. 1st format time to 19:23 like, then compare, then convert minites to hours
with base as (
select transaction_id, purchase_time, if(purchase_time like '%M', from_unixtime(unix_timestamp(purchase_time,'hh:mm aa'),'HH:mm'), purchase_time) as time_format, store_id from transaction_details
where purchase_month is not null
),
timeformat as (
select purchase_time, transaction_id, (cast(split(time_format, ':')[0] as decimal(4,2)) + cast(split(time_format, ':')[1] as decimal(4,2))/60) as purchase_time_in_hrs, store_id
from base
),
timebucket as (
select
transaction_id, purchase_time, purchase_time_in_hrs, store_id,
if(purchase_time_in_hrs > 5 and purchase_time_in_hrs <=8, 'early morning',
if(purchase_time_in_hrs > 8 and purchase_time_in_hrs <=11, 'morning',
if(purchase_time_in_hrs > 11 and purchase_time_in_hrs <=13, 'noon',
if(purchase_time_in_hrs > 13 and purchase_time_in_hrs <=18, 'afternoon',
if(purchase_time_in_hrs > 18 and purchase_time_in_hrs <=22, 'evening', 'night'))))) as time_bucket
from timeformat
)
--select * from timebucket 這樣可以知道,時間格式太多了需要清洗
select sd.store_name, count(transaction_id) as tran_cnt, time_bucket
from timebucket td join ext_store_details sd on td.store_id = sd.store_id
group by sd.store_name, time_bucket order by sd.store_name, tran_cnt desc
9. 找出每家店的忠實顧客
--忠誠顧客是這段時間內在商店裏消費最多的前5名顧客
with base as (
select store_name, customer_id, sum(td.price) as total_cust_purphase
from transaction_details td join ext_store_details sd on td.store_id = sd.store_id
where purchase_month is not null
group by store_name, customer_id
),
rk_cust as (
select store_name, customer_id, total_cust_purphase,
rank() over(partition by store_name order by total_cust_purphase desc) as rn
from base
)
select * from rk_cust where rn <= 5
10. 找出明星店的最大收入
with base as (
select store_id, sum(price) as revenue
from transaction_details
where purchase_month is not null
group by store_id
)
Select store_name, revenue, employee_number,
round(revenue/employee_number,2) as revenue_per_employee_within_period
from base td join ext_store_details sd on td.store_id = sd.store_id
以評估爲中心:
1. 找出要在ext_store_review中檢查映射的衝突事務
--衝突意味着事務表和檢查表中的transaction_id、store_id對不同
--td.transaction_id, td.store_id, sd.store_id
select count(*) --725/8101
from transaction_details td join ext_store_review sd on td.transaction_id = sd.transaction_id
where purchase_month is not null and td.store_id != sd.store_id
2. 瞭解客戶評審範圍
Select count(td.transaction_id) as total_trans,
sum(if(sd.transaction_id is null, 1, 0)) as total_review_missed,
sum(if(sd.transaction_id is not null, 1, 0)) as total_review_exist
from transaction_details td left join ext_store_review sd on td.transaction_id = sd.transaction_id
where purchase_month is not null
3. 瞭解客戶/交易評審分數的分佈情況
Select review_score, count(distinct customer_id) as num_customer, count(*) as num_reviews
From transaction_details td join ext_store_review sd on td.transaction_id = sd.transaction_id
where purchase_month is not null and review_score <> ''
group by review_score
4. 顧客總是光顧同一家商店嗎
select -- not always
count(*) as visit_cnt,
customer_id,
td.store_id
from transaction_details td join ext_store_review sd on td.transaction_id = sd.transaction_id
where
purchase_month is not null and review_score = '5'
group by customer_id, td.store_id order by visit_cnt desc