Hive數據跑批python一例

1、目標表

-- Target table for the monthly top-100 game rankings written by the batch script below.
-- Columns, as populated by the script's "insert overwrite" statements:
--   rn                : row_number() position of the game within the ranking
--   game_name         : grouping key of the ranking
--   active_anchor_cnt : count(distinct room_id)
--   active_uid_cnt    : count(distinct uid)
--   view_time         : sum(view_time)
--   gift_point        : sum(gift_point)
--   cost_amount       : cast(sum(gift_point)/1000/2 + summed salary amount as bigint)
-- Partitions:
--   pt_month : month being reported, formatted 'YYYY-MM'
--   partype  : ranking criterion; 1 = sum(gift_point), 2 = count(distinct room_id),
--              3 = count(distinct uid), 4 = sum(view_time)
-- NOTE(review): every column is declared as string although the inserted values are
-- numeric; presumably intentional for text export - confirm before changing types.
drop table if exists xxyl0628_result;
CREATE TABLE `xxyl0628_result`(
  `rn` string, 
  `game_name` string, 
  `active_anchor_cnt` string, 
  `active_uid_cnt` string, 
  `view_time` string, 
  `gift_point` string, 
  `cost_amount` string)
PARTITIONED BY ( 
  `pt_month` string, 
  `partype` string)
ROW FORMAT SERDE 
  'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' 
STORED AS INPUTFORMAT 
  'org.apache.hadoop.mapred.TextInputFormat' 
OUTPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
  'hdfs://emr-cluster/user/hive/warehouse/xxyl0628_result'
;

2、主程序
/Users/nisj/PycharmProjects/BiDataProc/love/HiveRunData-yl0628.py

# -*- coding=utf-8 -*-
import os
import datetime
import warnings
import time
import threadpool
import calendar
import datetime

warnings.filterwarnings("ignore")

def dateRange(beginDate, endDate):
    """Return the day strings from beginDate through endDate, inclusive.

    Both arguments are 'YYYY-MM-DD' strings. The upper bound is checked by
    plain string comparison, and the first element is beginDate exactly as
    it was passed in. An empty list is returned when beginDate sorts after
    endDate.
    """
    cursor = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
    one_day = datetime.timedelta(days=1)
    collected = []
    label = beginDate
    while not (label > endDate):
        collected.append(label)
        # Advance the parsed date, then re-render it for the next comparison.
        cursor = cursor + one_day
        label = cursor.strftime("%Y-%m-%d")
    return collected

def monthRange(beginMonth, endMonth):
    """Return the months from beginMonth through endMonth, inclusive.

    Args:
        beginMonth: first month, formatted 'YYYY-MM'.
        endMonth: last month, formatted 'YYYY-MM'.

    Returns:
        A chronologically ordered list of zero-padded 'YYYY-MM' strings;
        empty when beginMonth is later than endMonth.

    Raises:
        ValueError: if either argument does not parse as '%Y-%m'.
    """
    first = datetime.datetime.strptime(beginMonth, "%Y-%m")
    last = datetime.datetime.strptime(endMonth, "%Y-%m")

    # Compare parsed (year, month) pairs rather than raw strings so that
    # non-padded input such as '2019-1' cannot distort the range.
    year, month = first.year, first.month
    stop = (last.year, last.month)

    monthList = []
    while (year, month) <= stop:
        monthList.append("%04d-%02d" % (year, month))
        # Step a whole month at a time instead of walking every single day.
        month += 1
        if month > 12:
            year, month = year + 1, 1
    return monthList

def months_addsub(dt, months):
    """Shift a date by a whole number of months and return it as a string.

    Args:
        dt: a datetime.date or datetime.datetime value.
        months: number of months to add; negative values subtract.

    Returns:
        str() of the shifted value. The day of month is clamped to the last
        day of the target month (e.g. Jan 31 + 1 month -> Feb 28/29).
    """
    # divmod floors for negative values too, and keeps the year an int even
    # where "/" performs true division.
    year_shift, month_index = divmod(dt.month - 1 + months, 12)
    year = dt.year + year_shift
    month = month_index + 1
    day = min(dt.day, calendar.monthrange(year, month)[1])
    return str(dt.replace(year=year, month=month, day=day))

def getYesterday():
    """Return the local calendar date one day before today as a datetime.date."""
    return datetime.date.today() - datetime.timedelta(days=1)

def _run_hive_command(command):
    """Run one shell command through os.system; True when it exits with status 0."""
    return os.system(command) == 0


def _staging_command(pt_month, pt_month_next, yesterday):
    """Build the hive -e command that rebuilds both fixed-name staging tables."""
    return """/usr/lib/hive-current/bin/hive -e " \
                drop table if exists xxyl0628_live_salary; \
                create table xxyl0628_live_salary as \
                with tab_live as( \
                select room_id,game_id,game_name,count(distinct case when live_mins>=30 then pt_day else null end) live_eff_day,sum(live_mins) live_mins,row_number()over(partition by room_id order by sum(live_mins) desc) rn_live_long,row_number()over(partition by room_id order by count(distinct case when live_mins>=30 then pt_day else null end) desc) rn_live_days \
                from (select room_id,game_id,game_name,pt_day,sum(unix_timestamp(updated_time)-unix_timestamp(switch_time))/60 live_mins \
                from oss_bi_all_live_history_status \
                where pt_month='{pt_month}' \
                  and game_id<>-1 \
                group by room_id,game_id,game_name,pt_day) a1 \
                group by room_id,game_id,game_name), \
                tab_salary as( \
                select room_id,sum(amount) salary_amount,sum(case when type=1 then amount else 0 end) rank_salary,sum(case when type=2 then amount else 0 end) contract_salary \
                from oss_bi_all_finance_salary_record \
                where pt_month ='{pt_month_next}' \
                and type in(1,2) and state=0 \
                group by room_id) \
                select a1.room_id,a1.game_id,a1.game_name,a1.live_eff_day,a1.live_mins,coalesce(a3.salary_amount,0) salary_concat_rank_amount \
                from tab_live a1 \
                inner join oss_bi_all_room a2 on a1.room_id=a2.id \
                left join tab_salary a3 on a1.room_id=a3.room_id \
                where a1.live_eff_day>=3 and a1.rn_live_long=1 and a1.rn_live_days=1 \
                  and a2.pt_day='{Yesterday}' and a2.is_profession=1 and a2.state=0; \

                drop table if exists xxyl0628_view_gift; \
                create table xxyl0628_view_gift as \
                with tab_view as( \
                select uid,roomid room_id,sum(view_time) view_time,row_number()over(partition by uid order by sum(view_time) desc) rn \
                from recommend_data_view a1 \
                where substr(pt_day,1,7)='{pt_month}' \
                group by uid,roomid \
                ), \
                tab_gift as( \
                select room_id,uid,sum(gift_point) gift_point \
                from honeycomb_all_gift_record \
                where pt_month='{pt_month}' \
                  and gift_point<>0 \
                group by room_id,uid) \
                select a1.uid,a1.room_id,a1.view_time,coalesce(a3.gift_point,0) gift_point,a2.game_id,a2.game_name,a2.salary_concat_rank_amount \
                from tab_view a1 \
                inner join xxyl0628_live_salary a2 on a1.room_id=a2.room_id \
                left join tab_gift a3 on a1.uid=a3.uid and a1.room_id=a3.room_id \
                where a1.rn=1; \
                " """.format(pt_month=pt_month, pt_month_next=pt_month_next, Yesterday=yesterday)


def _result_command(pt_month, partype, order_expr):
    """Build the hive -e command that refreshes one (pt_month, partype) result partition.

    order_expr is the aggregate expression the top-100 ranking is ordered by
    (descending).
    """
    return """/usr/lib/hive-current/bin/hive -e " \
            alter table xxyl0628_result drop partition(pt_month='{pt_month}',partype={partype}); \
            alter table xxyl0628_result add partition(pt_month='{pt_month}',partype={partype}) location '{pt_month}/{partype}'; \
            with tab_result as( \
            select row_number()over(order by {order_expr} desc) rn,a1.game_name,count(distinct room_id) active_anchor_cnt,count(distinct uid) active_uid_cnt,sum(view_time) view_time,sum(gift_point) gift_point,sum(gift_point)/1000/2 cost_amount1 \
            from xxyl0628_view_gift a1 \
            group by a1.game_name), \
            tab_salary as( \
            select a1.game_name, sum(a1.salary_concat_rank_amount) cost_amount2 \
            from xxyl0628_live_salary a1 \
            group by a1.game_name) \
            insert overwrite table xxyl0628_result partition(pt_month='{pt_month}',partype={partype}) \
            select a1.rn,a1.game_name,a1.active_anchor_cnt,a1.active_uid_cnt,a1.view_time,a1.gift_point,cast(cost_amount1+cost_amount2 as bigint) cost_amount \
            from tab_result a1 \
            left join tab_salary a2 on a1.game_name=a2.game_name \
            where a1.rn<=100; \
            " """.format(pt_month=pt_month, partype=partype, order_expr=order_expr)


def hiveRunData(pt_month):
    """Rebuild the staging tables for one month and refresh its four ranking partitions.

    Args:
        pt_month: month to process, formatted 'YYYY-MM'.

    Steps:
        1. Recreate xxyl0628_live_salary and xxyl0628_view_gift from the
           month's live, view and gift data. Salary rows are read from the
           partition of the following month, and room attributes from
           yesterday's oss_bi_all_room partition.
        2. For partype 1..4, drop/re-add the xxyl0628_result partition and
           overwrite it with the top-100 games under that partype's ranking.

    The staging tables have fixed names shared by every month, so calls must
    not overlap. If the staging step fails, the ranking steps are skipped:
    running them would copy whatever an earlier month left in the staging
    tables into this month's partitions.

    Returns:
        None. Failures are reported on stdout.
    """
    yesterday = str(getYesterday())
    month_start = datetime.date(int(pt_month[0:4]), int(pt_month[5:7]), 1)
    # months_addsub returns 'YYYY-MM-DD'; keep only the 'YYYY-MM' prefix.
    pt_month_next = months_addsub(month_start, +1)[0:7]

    if not _run_hive_command(_staging_command(pt_month, pt_month_next, yesterday)):
        print("hiveRunData: staging step failed for pt_month=%s; result partitions not refreshed" % pt_month)
        return

    # partype -> aggregate the top-100 ranking is ordered by (descending).
    rankings = (
        (1, "sum(gift_point)"),
        (2, "count(distinct room_id)"),
        (3, "count(distinct uid)"),
        (4, "sum(view_time)"),
    )
    for partype, order_expr in rankings:
        # Each partype is independent, so one failure does not stop the rest.
        if not _run_hive_command(_result_command(pt_month, partype, order_expr)):
            print("hiveRunData: result step failed for pt_month=%s partype=%s" % (pt_month, partype))

# Batch driver: queue one hiveRunData call per month and drain the queue with a
# thread pool. Everything up to the __main__ guard runs at module level, so the
# pool is created and the requests are queued even when this file is imported.
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "當前時間是:",now_time


# Despite its name, runDay_list holds month strings ('YYYY-MM'), one per run.
runDay_list = monthRange(beginMonth='2017-05', endMonth='2019-05')
requests = []
request_hiveRunData2localFile_batchCtl = threadpool.makeRequests(hiveRunData, runDay_list)
requests.extend(request_hiveRunData2localFile_batchCtl)
# Pool size is 1: hiveRunData rebuilds fixed-name staging tables, so two months
# must never be processed at the same time.
main_pool = threadpool.ThreadPool(1)
[main_pool.putRequest(req) for req in requests]

if __name__ == '__main__':
    # Poll every 30 seconds until poll() raises NoResultsPending (presumably
    # meaning no queued request is left - confirm against the threadpool docs)
    # or the user interrupts the run.
    while True:
        try:
            time.sleep(30)
            main_pool.poll()
        except KeyboardInterrupt:
            print("**** Interrupted!")
            break
        except threadpool.NoResultsPending:
            break

    if main_pool.dismissedWorkers:
        print("Joining all dismissed worker threads...")
        main_pool.joinAllDismissedWorkers()

# Print the finish time (same format as the start time above).
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "當前時間是:",now_time





3、說明
由於特殊情況,不能使用臨時表,各月份的跑批共用固定名稱的中間表(xxyl0628_live_salary、xxyl0628_view_gift),因此不能並行跑批,線程池大小只能設為1,按月份依次執行。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章