目錄
Hbase搭建
下載hbase-1.0.0-cdh5.5.1.tar.gz,解壓並修改配置文件
修改conf/hbase-env.sh的java環境變量
修改conf/hbase-site.xml
<configuration>
<property>
<name>hbase.rootdir</name>
<value>hdfs://master:9000/hbase</value>
</property>
<property>
<name>hbase.cluster.distributed</name>
<value>true</value>
</property>
<property>
<name>hbase.zookeeper.quorum</name>
<value>master</value>
</property>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
</configuration>
修改conf/regionservers
修改爲master
啓動 HBase:bin/start-hbase.sh
訪問 HBase Web UI(HBase 1.0 及以上默認端口 16010,如 http://master:16010)驗證是否啓動成功:
至此存儲搭建完成,比較順利
Hbase工具類
添加jar包
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>1.0.0-cdh5.5.1</version><!-- 客戶端版本應與集羣版本保持一致,避免 client/server 不兼容 -->
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>1.0.0-cdh5.5.1</version>
</dependency>
Hbase上創建表
添加HbaseUtil工具類,可以去文末github查看
添加Pv(訪問量),Uv(獨立訪客)相關邏輯
修改map邏輯
public class PindaopvuvMap implements FlatMapFunction<KafkaMessage, PidaoPvUv> {

    /**
     * Expands one Kafka message into three PV/UV records — one bucketed by hour,
     * one by day and one by month — so downstream keyed windows can aggregate
     * each time granularity independently.
     *
     * @param value incoming Kafka message carrying the raw JSON log, a count and a timestamp
     * @param out   collector receiving exactly three records (hour, day, month)
     * @throws Exception if JSON parsing or the HBase lookup fails
     */
    @Override
    public void flatMap(KafkaMessage value, Collector<PidaoPvUv> out) throws Exception {
        String jsonstring = value.getJsonmessage();
        long timestamp = value.getTimestamp();
        // BUGFIX: the original pattern "yyyyMMddhh" used 'hh' (clock hour 1-12),
        // which merges AM and PM into the same bucket; 'HH' (0-23) is required.
        String hourtimestamp = DateUtil.getDateby(timestamp, "yyyyMMddHH"); // hour bucket
        String daytimestamp = DateUtil.getDateby(timestamp, "yyyyMMdd");    // day bucket
        String monthtimestamp = DateUtil.getDateby(timestamp, "yyyyMM");    // month bucket
        UserscanLog userscanLog = JSON.parseObject(jsonstring, UserscanLog.class);
        long pingdaoid = userscanLog.getPingdaoid();
        long userid = userscanLog.getUserid();
        // Fetch the visitor's first-visit flags (per hour/day/month) from HBase.
        UserState userState = PdvisterDao.getUserSatebyvistertime(userid + "", timestamp);
        long pvcount = Long.valueOf(value.getCount() + "");
        // BUGFIX: the original mutated one shared PidaoPvUv after each
        // out.collect(); a record handed to the collector must not be modified
        // afterwards, so a fresh instance is built per granularity.
        PidaoPvUv hourRecord =
                build(pingdaoid, userid, pvcount, userState.isFisrthour(), timestamp, hourtimestamp);
        out.collect(hourRecord);
        System.out.println("小時==" + hourRecord);
        PidaoPvUv dayRecord =
                build(pingdaoid, userid, pvcount, userState.isFisrtday(), timestamp, daytimestamp);
        out.collect(dayRecord);
        System.out.println("天==" + dayRecord);
        PidaoPvUv monthRecord =
                build(pingdaoid, userid, pvcount, userState.isFisrtmonth(), timestamp, monthtimestamp);
        out.collect(monthRecord);
        System.out.println("月==" + monthRecord);
    }

    /** Builds one PV/UV record for a single time bucket; UV is 1 only on the user's first visit in it. */
    private static PidaoPvUv build(long pingdaoid, long userid, long pvcount,
                                   boolean firstVisit, long timestamp, String timestring) {
        PidaoPvUv record = new PidaoPvUv();
        record.setPingdaoid(pingdaoid);
        record.setUserid(userid);
        record.setPvcount(pvcount);
        record.setUvcount(firstVisit ? 1L : 0L);
        record.setTimestamp(timestamp);
        record.setTimestring(timestring);
        // Grouping key = time bucket + channel id, matching keyBy("groupbyfield") downstream.
        record.setGroupbyfield(timestring + pingdaoid);
        return record;
    }
}
修改reduce邏輯
public class PindaopvuvReduce implements ReduceFunction<PidaoPvUv> {

    /**
     * Merges two PV/UV records belonging to the same key (groupbyfield) by
     * summing their PV and UV counters. Channel id, timestamp and time string
     * are copied from the first record — within one key group these fields are
     * expected to be identical (TODO confirm timestamp semantics with caller).
     *
     * @param value1 first partial aggregate
     * @param value2 second partial aggregate
     * @return a new record holding the summed counters
     */
    @Override
    public PidaoPvUv reduce(PidaoPvUv value1, PidaoPvUv value2) throws Exception {
        System.out.println("value1==" + value1);
        System.out.println("value2==" + value2);
        PidaoPvUv merged = new PidaoPvUv();
        merged.setPingdaoid(value1.getPingdaoid());
        merged.setTimestamp(value1.getTimestamp());
        merged.setTimestring(value1.getTimestring());
        merged.setPvcount(value1.getPvcount() + value2.getPvcount());
        merged.setUvcount(value1.getUvcount() + value2.getUvcount());
        // BUGFIX: corrected the misspelled "recuduce" in the debug log message.
        System.out.println("reduce --pidaoPvUv==" + merged);
        return merged;
    }
}
本地測試類
/**
 * Local test entry point: reads KafkaMessage events from a Kafka topic,
 * fans them out to per-hour/day/month PV-UV records, aggregates them in
 * count windows keyed by groupbyfield, and prints the result.
 */
public static void main(String[] args) {
    // Needed on Windows so the Hadoop native shim (winutils) can be found.
    System.setProperty("hadoop.home.dir", "D:\\soft\\hadoop-2.6.0-cdh5.5.1\\hadoop_dll2.6.0");
    // BUGFIX: the original unconditionally overwrote real CLI arguments with
    // the hard-coded local-test set, making the usage check below dead code.
    // Fall back to the defaults only when no arguments were supplied.
    if (args == null || args.length == 0) {
        // NOTE(review): "winsdows.size" is misspelled but kept — it is the
        // established CLI parameter name and renaming would break callers.
        args = new String[]{"--input-topic", "test1", "--bootstrap.servers", "111.231.99.181:9092",
                "--zookeeper.connect", "111.231.99.181:2181", "--group.id", "myconsumer1",
                "--winsdows.size", "50"};
    }
    final ParameterTool parameterTool = ParameterTool.fromArgs(args);
    if (parameterTool.getNumberOfParameters() < 5) {
        System.out.println("Missing parameters!\n" +
                "Usage: Kafka --input-topic <topic>" +
                "--bootstrap.servers <kafka brokers> " +
                "--zookeeper.connect <zk quorum> --group.id <some id>");
        return;
    }
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.getConfig().disableSysoutLogging();
    // Retry the job up to 4 times, waiting 10 s between attempts.
    env.getConfig().setRestartStrategy(RestartStrategies.fixedDelayRestart(4, 10000));
    env.enableCheckpointing(5000); // create a checkpoint every 5 seconds
    env.getConfig().setGlobalJobParameters(parameterTool); // make parameters available in the web interface
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
    FlinkKafkaConsumer010 flinkKafkaConsumer = new FlinkKafkaConsumer010<KafkaMessage>(
            parameterTool.getRequired("input-topic"), new KafkaMessageSchema(), parameterTool.getProperties());
    // Event-time watermarks are assigned at the source from the message timestamp.
    DataStream<KafkaMessage> input =
            env.addSource(flinkKafkaConsumer.assignTimestampsAndWatermarks(new KafkaMessageWatermarks()));
    DataStream<PidaoPvUv> map = input.flatMap(new PindaopvuvMap());
    DataStream<PidaoPvUv> reduce = map.keyBy("groupbyfield")
            .countWindow(Long.valueOf(parameterTool.getRequired("winsdows.size")))
            .reduce(new PindaopvuvReduce());
    reduce.print();
    try {
        env.execute("pindaossfx");
    } catch (Exception e) {
        e.printStackTrace();
    }
}
後續將計算結果存入Hbase,暫時在本地測試,本地測試需要添加hadoop相關文件。
總結
昨天遇到的 bug 是由於 hosts 文件中主機名解析配置錯誤導致的,修正映射後問題解決。
後續將放在服務器上直接測試,由於服務器內存較小,測試條件有限,後續將擴容內存。後續將完善頻道新鮮度和頻道瀏覽地區分佈分析。
具體代碼可參照我的git項目地址,現有代碼均已通過測試可以使用,後續會持續更新,直到項目結束,不懂的細節,可以關注公衆號,後臺留言,會細緻解答。
git地址:https://github.com/jyqjyq/filnkDS.git