pig-hive-elasticsearch

#!/usr/bin/env bash
#
# Driver for the Elasticsearch -> Hive load.
#   1. Runs preprocess.hql through Hive, passing target_date as a hiveconf
#      variable.
#   2. Runs es_loader.pig through Pig with HCatalog enabled, passing the same
#      date as both the start_date and target_date parameters.
#
# target_date is whatever GNU date prints for the relative item "last-day"
# in %Y-%m-%d form (presumably yesterday -- confirm on the target host).

# Trace every command; abort on the first failure, on use of an unset
# variable, and on a failure anywhere inside a pipeline.
set -x
set -euo pipefail

target_date=$(date -d last-day +%Y-%m-%d)
echo 'it works!'

echo "target_date = " "$target_date"

hive -hiveconf target_date="$target_date" -f preprocess.hql

# The '*' ACL value is quoted so the shell can never glob-expand it against
# file names in the current directory before Pig sees it.
pig \
  -useHCatalog \
  '-Dmapreduce.job.acl-view-job=*' \
  -Dmapreduce.job.queuename=root.bdp_jmart_tbi_union.bdp_jmart_tbi_dev \
  -Dmapred.child.java.opts='-Xmx8192m ' \
  -p start_date="$target_date" \
  -p target_date="$target_date" \
  es_loader.pig 2>&1

echo 'DONE!'
****************************** preprocess.hql ******************************
-- Create the date-partitioned text table xxx.yyy if it does not exist yet.
-- Fields are separated by the \u0001 control character.
CREATE TABLE IF NOT EXISTS xxx.yyy (
    album_id       STRING,
    album_name     STRING,
    album_size     BIGINT,
    album_subtype  STRING,
    album_type     STRING,
    alias          STRING,
    gender         BIGINT,
    id             STRING,
    mp3_source     STRING,
    name           STRING,
    nation         BIGINT,
    popularity     BIGINT,
    position       BIGINT,
    singer_id      STRING,
    singer_name    STRING,
    song_tags      STRING
)
PARTITIONED BY (dt STRING)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\u0001'
STORED AS TEXTFILE;

-- Drop the partition for the date passed in as the hiveconf variable
-- target_date, if that partition exists (presumably so the load for that
-- date can be repeated without leaving stale rows -- confirm with the owner).
ALTER TABLE xxx.yyy
    DROP IF EXISTS PARTITION (dt='${hiveconf:target_date}');
******************************** es_loader.pig ********************************
-- Jars needed by this script: the Elasticsearch-Hadoop Pig connector and the
-- jar that holds the YYYFormatter UDF.
REGISTER elasticsearch-hadoop-pig-5.4.1.jar;
REGISTER yyy.jar;

-- Short alias for the UDF applied to every loaded record.
DEFINE YYYFormatter com.jd.nlp.pig.udf.YYYFormatter();

-- Short alias for the Elasticsearch storage function with its settings.
DEFINE EsStorage org.elasticsearch.hadoop.pig.EsStorage(
    'es.http.timeout= 5m',
    'es.index.auto.create = true',
    'es.mapping.pig.tuple.use.field.names = true',
    'es.nodes = [ip]',
    'es.mapping.id = id',
    'es.write.operation = upsert',
    'es.output.json=true'
);

-- Load from index/type using the query string given to the loader.
es_docs = LOAD 'index/type' USING EsStorage('es.query:?q=*');

-- Run each loaded record through the UDF, which returns one tuple per record.
formatted = FOREACH es_docs GENERATE YYYFormatter(*);

-- Unnest the UDF result tuple into named, typed columns.
song_fields = FOREACH formatted GENERATE FLATTEN($0) AS (
    album_id:chararray,
    album_name:chararray,
    album_size:long,
    album_subtype:chararray,
    album_type:chararray,
    alias:chararray,
    gender:long,
    id:chararray,
    mp3_source:chararray,
    name:chararray,
    nation:long,
    popularity:long,
    position:long,
    singer_id:chararray,
    singer_name:chararray,
    song_tags:chararray
);

-- Append the target_date script parameter as the dt column.
dated_rows = FOREACH song_fields GENERATE *, '$target_date' AS dt;

-- Write the rows to xxx.yyy through HCatalog.
STORE dated_rows INTO 'xxx.yyy' USING org.apache.hive.hcatalog.pig.HCatStorer() PARALLEL 1;
****************************** YYYFormatter.java ******************************
import com.alibaba.fastjson.JSONObject;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import com.alibaba.fastjson.JSON;

import java.io.IOException;


/**
 * Pig eval function that takes a JSON document as a string, parses it into a
 * bean with fastjson and returns selected bean properties as a Pig tuple.
 */
public class YYYFormatter extends EvalFunc<Tuple> {
    private final TupleFactory tupleFactory = TupleFactory.getInstance();


    /**
     * Converts one JSON document into a tuple of its fields.
     *
     * @param tuple input tuple whose first field is the JSON document as a String
     * @return a tuple holding the album id followed by the album name
     * @throws IOException      if the first field cannot be read from the input tuple
     * @throws RuntimeException if the input tuple is null or empty, if the JSON
     *                          string is null or empty, or if the document parses
     *                          to no object
     */
    @Override
    public Tuple exec(Tuple tuple) throws IOException {
        // Fail with a descriptive message instead of an anonymous NPE or index error.
        if (tuple == null || tuple.size() == 0) {
            throw new RuntimeException("YYYFormatter expects one JSON string field but received no input");
        }
        String jsonString = (String) tuple.get(0);
        if (jsonString == null || jsonString.isEmpty()) {
            throw new RuntimeException("YYYFormatter received a null or empty JSON string");
        }
        // NOTE(review): the declared type (ZZZBean) and the parse target
        // (MiguSongBean) differ; both are declared outside this file, so
        // confirm that they are assignment-compatible.
        ZZZBean yyyBean = JSON.parseObject(jsonString, MiguSongBean.class);
        if (yyyBean == null) {
            // fastjson yields null for documents such as the literal "null".
            throw new RuntimeException("YYYFormatter could not parse a JSON object from: " + jsonString);
        }
        // Allocate the output only after the input has been validated.
        Tuple result = tupleFactory.newTuple();
        result.append(yyyBean.getAlbumId());
        result.append(yyyBean.getAlbumName());
        return result;
    }

}



發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章