Hive實戰之影音系統
-
數據(https://download.csdn.net/download/kevin__durant/11798895)這上面還要積分,如果有需要我私發
-
數據格式↓前九列分別對應視頻表的各個字段,其後的各列爲相關視頻id
-
LKh7zAJ4nwo TheReceptionist 653 Entertainment 424 13021 4.34 1305 744 DjdA-5oKYFQ NxTDlnOuybo c-8VuICzXtU DH56yrIO5nI W1Uo5DQTtzc E-3zXq_r4w0 1TCeoRPg5dE yAr26YhuYNY 2ZgXx72XmoE -7ClGo-YgZ0 vmdPOOd6cxI KRHfMQqSHpk pIMpORZthYw 1tUDzOp10pk heqocRij5P0 _XIuvoH6rUg LGVU5DsezE0 uO2kj6_D8B4 xiDqywcDQRM uX81lMev6_o
需求描述
統計硅谷影音視頻網站的常規指標,各種TopN指標:
–統計視頻觀看數Top10
–統計視頻類別熱度Top10
–統計視頻觀看數Top20所屬類別
–統計視頻觀看數Top50所關聯視頻的所屬類別Rank
–統計每個類別中的視頻熱度Top10
–統計每個類別中視頻流量Top10
–統計上傳視頻最多的用戶Top10以及他們上傳的視頻
–統計每個類別視頻觀看數Top10
項目
數據結構
1.視頻表
表6-13 視頻表
字段 | 備註 | 詳細描述 |
---|---|---|
video id | 視頻唯一id | 11位字符串 |
uploader | 視頻上傳者 | 上傳視頻的用戶名String |
age | 視頻年齡 | 視頻在平臺上的整數天 |
category | 視頻類別 | 上傳視頻指定的視頻分類 |
length | 視頻長度 | 整型數字標識的視頻長度 |
views | 觀看次數 | 視頻被瀏覽的次數 |
rate | 視頻評分 | 滿分5分 |
ratings | 流量 | 視頻的流量,整型數字 |
comments | 評論數 | 一個視頻的整數評論數 |
related ids | 相關視頻id | 相關視頻的id,最多20個 |
2.用戶表
表6-14 用戶表
字段 | 備註 | 字段類型 | |
---|---|---|---|
uploader | 上傳者用戶名 | string | |
videos | 上傳視頻數 | int | |
friends | 朋友數量 | int |
2 ETL原始數據
通過觀察原始數據形式,可以發現,視頻可以有多個所屬分類,每個所屬分類用&符號分割,且分割的兩邊有空格字符,同時相關視頻也是可以有多個元素,多個相關視頻又用“\t”進行分割。爲了分析數據時方便對存在多個子元素的數據進行操作,我們首先進行數據重組清洗操作。即:將所有的類別用“&”分割,同時去掉兩邊空格,多個相關視頻id也使用“&”進行分割。
- 原始數據在目錄下,此處使用MapReduce清洗數據
數據清洗
清洗工具類
/**
 * Normalizes one raw, tab-separated video record.
 *
 * The first nine columns stay tab-separated; every column after them
 * (the related video ids) is joined into a single field using '&'.
 * Spaces are removed from column index 3 (the category column according to
 * the field table), so "A & B" becomes "A&B".
 *
 * @param str one raw input line
 * @return the normalized line, or an empty string when the line has fewer
 *         than nine tab-separated columns
 */
public String dataRinse(String str){
    String[] fields = str.split("\t");

    // Reject records that do not carry all nine fixed columns.
    if (fields.length < 9) {
        return "";
    }

    // Strip the spaces around the '&' separators inside the category column.
    fields[3] = fields[3].replace(" ", "");

    StringBuilder cleaned = new StringBuilder(fields[0]);
    for (int idx = 1; idx < fields.length; idx++) {
        // Columns 1..9 are introduced by a tab; each further related id by '&'.
        cleaned.append(idx <= 9 ? "\t" : "&").append(fields[idx]);
    }
    return cleaned.toString();
}
/**
 * Manual smoke test: runs the cleaner on a single-column sample line and
 * prints whatever it returns.
 */
public static void main(String[] args) {
    ETLUtil util = new ETLUtil();
    System.out.println(util.dataRinse("uFoWXi25RBk"));
}
清洗Mapper端
/**
 * Map-only cleaning step: passes every input line through
 * {@link ETLUtil#dataRinse(String)} and emits the cleaned line as the key
 * with a NullWritable value.
 */
public class ETLMapper extends Mapper<LongWritable , Text , Text , NullWritable> {
    // Output key object, reused across map() calls.
    Text k = new Text();
    // One cleaner per mapper instead of one allocation per input record.
    private final ETLUtil etlUtil = new ETLUtil();

    /**
     * Cleans a single input line and writes it out.
     *
     * @param key     input key supplied by the input format (unused)
     * @param value   one raw input line
     * @param context used to emit the cleaned line
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String cleaned = etlUtil.dataRinse(value.toString());

        // dataRinse returns "" for records it rejects. Emitting that would
        // write blank lines into the cleaned output, so skip them here.
        if (cleaned == null || cleaned.isEmpty()) {
            return;
        }

        k.set(cleaned);
        context.write(k , NullWritable.get());
    }
}
清洗驅動Driver
/**
 * Driver for the map-only ETL job. Expects two arguments: the input path
 * and the output path.
 */
public class ETLDriver implements Tool {
    private Configuration configuration;

    /**
     * Configures and runs the cleaning job.
     *
     * @param args args[0] is the input path, args[1] is the output path
     * @return 0 when the job succeeds, 1 when it fails, 2 on bad usage
     */
    public int run(String[] args) throws Exception {
        // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
        if (args == null || args.length < 2) {
            System.err.println("Usage: ETLDriver <input path> <output path>");
            return 2;
        }

        Job job = Job.getInstance(getConf());
        job.setJarByClass(ETLDriver.class);
        job.setMapperClass(ETLMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // No reduce phase: mapper output is written directly.
        job.setNumReduceTasks(0);
        FileInputFormat.setInputPaths(job,new Path(args[0]));
        FileOutputFormat.setOutputPath(job,new Path(args[1]));
        boolean succeeded = job.waitForCompletion(true);
        return succeeded ? 0 : 1;
    }

    public void setConf(Configuration conf) {
        this.configuration = conf;
    }

    public Configuration getConf() {
        return configuration;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new ETLDriver(), args);
        // Propagate the job status as the process exit code so that calling
        // scripts and schedulers can detect a failed run.
        System.exit(exitCode);
    }
}
- 將video數據執行清洗
hive數據分析
建表
- 創建表:gulivideo_ori gulivideo_user_ori
-- Video table, loaded from the cleaned text files.
-- Fields are tab-separated; array elements inside a field are '&'-separated.
create table gulivideo_ori (
    videoId   string,         -- video id
    uploader  string,         -- uploader user name
    age       int,
    category  array<string>,  -- one video may carry several categories
    length    int,
    views     int,            -- view count
    rate      float,
    ratings   int,
    comments  int,
    relatedId array<string>   -- ids of related videos
)
row format delimited
    fields terminated by "\t"
    collection items terminated by "&"
stored as textfile;
-- User table: one tab-separated row per uploader.
create table gulivideo_user_ori (
    uploader string,  -- uploader user name
    videos   int,     -- number of uploaded videos
    friends  int      -- number of friends
)
row format delimited
    fields terminated by "\t"
stored as textfile;
導入數據
- 將video數據導入前表,將user導入後表直接load data 即可
業務分析
統計視頻觀看數Top10
- 直接單表排序分頁
-- Top 10 videos by view count.
-- videoId is selected so that each result row identifies the video itself,
-- not only its uploader.
select
    videoId,
    uploader,
    views
from gulivideo_ori
order by views desc
limit 10;
統計視頻類別熱度Top10
-- Top 10 categories by popularity, measured here as the number of videos
-- that carry the category.
select
    t3.cate,
    t3.cou_cate
from (
    -- Step 2: count the videos in each category.
    select
        t2.cate  as cate,
        count(*) as cou_cate
    from (
        -- Step 1: explode the category array into one row per (video, category).
        select t1.ca as cate
        from gulivideo_ori
        lateral view explode(category) t1 as ca
    ) t2
    group by t2.cate
) t3
-- Step 3: keep the ten largest counts. Without "desc" the query would
-- return the ten least popular categories.
order by t3.cou_cate desc
limit 10;
統計視頻觀看數Top20所屬類別
-- Categories of the 20 most-viewed videos.
-- The Top 20 is taken BEFORE the category array is exploded. Exploding
-- first would make "limit 20" pick 20 (video, category) pairs, so videos
-- with several categories would push other videos out of the Top 20.
select
    -- Step 3: de-duplicate the categories.
    distinct t2.cate
from (
    -- Step 1: the 20 videos with the highest view count.
    select
        videoId,
        views,
        category
    from gulivideo_ori
    order by views desc
    limit 20
) t1
-- Step 2: one row per (video, category).
lateral view explode(t1.category) t2 as cate;
統計視頻觀看數Top50所關聯視頻的所屬類別Rank
-- Category ranking of the videos related to the 50 most-viewed videos.
select
    -- Step 4: count per exploded category. Selecting and grouping must use
    -- the exploded scalar column, not the array column itself.
    category_name,
    count(*) as hot
from (
    -- Step 3: look up each related video in the video table to get its categories.
    select
        v.category
    from (
        -- Step 2: explode the related-id arrays and de-duplicate the ids.
        select
            distinct related_id
        from (
            -- Step 1: the 50 videos with the highest view count.
            select
                relatedId
            from gulivideo_ori
            order by views desc
            limit 50
        ) top50
        lateral view explode(top50.relatedId) related_t as related_id
    ) related
    inner join gulivideo_ori v
        on related.related_id = v.videoId
) related_videos
lateral view explode(related_videos.category) category_t as category_name
group by category_name
-- Step 5: most frequent category first.
order by hot desc;
-
下面幾個業務都遇到了需要將類別炸開的形式,那就先將類別炸開的表先導入臨時表
-
-- Helper table with the category array already exploded: one row per
-- (video, category). The original statement was cut off after
-- "collection items terminated"; the delimiter and storage format below
-- follow the other tables in this file.
-- NOTE(review): confirm the intended storage format.
create table gulivideo_category (
    videoId    string,
    uploader   string,
    age        int,
    categoryId string,
    length     int,
    views      int,
    rate       float,
    ratings    int,
    comments   int,
    relatedId  array<string>
)
row format delimited
    fields terminated by "\t"
    collection items terminated by "&"
stored as textfile;
-
-- Fill the helper table with one row per (video, category).
-- The source is gulivideo_ori, the video table created and loaded in this
-- document; no table named gulivideo_orc is created here.
insert into table gulivideo_category
select
    videoId,
    uploader,
    age,
    categoryId,
    length,
    views,
    rate,
    ratings,
    comments,
    relatedId
from gulivideo_ori
lateral view explode(category) category_t as categoryId;
統計每個類別中的視頻熱度Top10
-- Top 10 videos by popularity (view count) within each category.
select
    -- Step 2: keep the first ten rows of every category.
    t1.categoryId,
    t1.videoId,
    t1.views
from (
    -- Step 1: number the videos of each category by descending view count.
    -- The alias is "rk" rather than "rank", which is also a built-in
    -- window function name.
    select
        categoryId,
        videoId,
        views,
        row_number() over (partition by categoryId order by views desc) as rk
    from gulivideo_category
) t1
where t1.rk <= 10;
統計每個類別中視頻流量Top10
-- Top 10 rows by ratings within each category.
select
    ranked.categoryId,
    ranked.ratings
from (
    -- Number the rows of each category by descending ratings.
    select
        categoryId,
        ratings,
        row_number() over (
            partition by categoryId
            order by ratings desc
        ) as rn
    from gulivideo_category
) ranked
where ranked.rn <= 10
統計上傳視頻最多的用戶Top10以及他們上傳的觀看次數在前20的視頻
-- The 20 most-viewed videos uploaded by the 10 users with the most uploads.
select
    t2.videoId,
    t2.uploader,
    t2.views
from (
    -- Top 10 uploaders by number of uploaded videos. The heading asks for
    -- ten users, so the limit is 10, not 20.
    select
        uploader
    from gulivideo_user_ori
    order by videos desc
    limit 10
) t1
inner join gulivideo_ori t2
    on t1.uploader = t2.uploader
order by views desc
limit 20;
統計每個類別視頻觀看數Top10
-- Top 10 videos by view count within each category.
select
    t1.categoryId,
    t1.videoId,
    t1.views
from (
    -- Number the videos of each category by descending view count.
    -- The alias is "rk" rather than "rank", which is also a built-in
    -- window function name.
    select
        categoryId,
        videoId,
        views,
        row_number() over (partition by categoryId order by views desc) as rk
    from gulivideo_category
) t1
where t1.rk <= 10;