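/**
 * @author cai_huaxue 36927
 * Data cleaning.
 * Cleaning rules:
 * 1. Keep only records whose citizen ID number (gmsfhm) is 18 characters long.
 *    NOTE: the reducer in this file currently also accepts 15-character IDs.
 * 2. The gender code (xbdm) is derived from the citizen ID number: if the second-to-last
 *    digit is odd the person is male (1); if it is even the person is female
 *    (the original note says 1 here as well, which looks like a typo for 2 -
 *    TODO confirm against UtilFunc.getXbdmFromGmsfhm).
 * 3. Keep only records whose rzsj_dt and ldsj_dt text lengths are in (17, 18, 19).
 * 4. Keep only stays whose duration is between 5 minutes and 90 days.
 * 5. Clean zslg: drop records whose hotel name contains any of
 *    [浴業,浴場,沐浴,洗浴,足浴,淋浴,浴室, 浴城,溫泉,浴館].
 * 6. Clean fh: drop records whose room name contains any of [廳,手牌,足療,足浴,dt].
 * 7. For a given room number, if it was checked into more than 10 times within one day,
 *    delete all of that room's records for that day.
 *    NOTE: no code in this class counts check-ins per room and day; this rule is
 *    presumably applied elsewhere - TODO confirm.
 */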
package com.dahua;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;

import com.aliyun.odps.OdpsException;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.data.TableInfo;
import com.aliyun.odps.mapred.JobClient;
import com.aliyun.odps.mapred.MapperBase;
import com.aliyun.odps.mapred.ReducerBase;
import com.aliyun.odps.mapred.conf.JobConf;
import com.aliyun.odps.mapred.utils.InputUtils;
import com.aliyun.odps.mapred.utils.OutputUtils;
import com.aliyun.odps.mapred.utils.SchemaUtils;
import com.dahua.utils.*;
/**
 * ODPS MapReduce job that copies hotel check-in records from an input table to an
 * output table, dropping records that fail the {@code UtilFunc} validity checks.
 *
 * <p>The mapper keys every record by {@code gmsfhm} (citizen ID number); the reducer
 * filters the records of each ID and writes the survivors to the output table.
 */
public class CleanLgzsData {
    private final String inputTable;
    private final String outputTable;

    /**
     * @param inputTable  name of the ODPS table to read from
     * @param outputTable name of the ODPS table to write the cleaned records to
     */
    public CleanLgzsData(String inputTable, String outputTable) {
        this.inputTable = inputTable;
        this.outputTable = outputTable;
    }

    /**
     * Configures the job and runs it through {@link JobClient#runJob}.
     *
     * @throws OdpsException if the ODPS client fails to run the job
     */
    public void executeMR() throws OdpsException {
        JobConf aliJob = new JobConf();
        aliJob.setMapperClass(CleanLgzsDataMapper.class);
        aliJob.setReducerClass(CleanLgzsDataReducer.class);

        // Normally the grouping columns are a subset of the key sort columns, and both the
        // key sort columns and the partition columns must be part of the key schema.

        // ---- Map stage ----
        // Schema of the key rows the mapper emits to the reducer.
        aliJob.setMapOutputKeySchema(SchemaUtils.fromString("gmsfhm: String"));
        // Schema of the value rows the mapper emits to the reducer.
        aliJob.setMapOutputValueSchema(SchemaUtils.fromString("rzsj_dt: datetime, ldsj_dt: datetime, rzsj_int: bigint, ldsj_int: bigint, fh: String, zslgdm: String, zslg: String, ssdsdm: String, xbdm: String"));

        // ---- Shuffle: merge and sort ----
        // Mapper output records are hashed on the partition columns to pick a reducer,
        // so several different gmsfhm values may land on the same reducer.
        aliJob.setPartitionColumns(new String[]{"gmsfhm"});
        // Mapper output records are sorted by the key sort columns.
        aliJob.setOutputKeySortColumns(new String[]{"gmsfhm"});

        // ---- Reduce stage ----
        // Records whose grouping columns are equal are handed to one reduce() call.
        // This is usually set to the same columns as setPartitionColumns.
        aliJob.setOutputGroupingColumns(new String[]{"gmsfhm"});

        // Input table.
        InputUtils.addTable(TableInfo.builder().tableName(inputTable).build(), aliJob);
        // Output table.
        OutputUtils.addTable(TableInfo.builder().tableName(outputTable).build(), aliJob);

        JobClient.runJob(aliJob);
    }

    /**
     * Map stage: re-emits each input record as (gmsfhm) -> (remaining columns) without
     * filtering anything.
     */
    public static class CleanLgzsDataMapper extends MapperBase {
        private Record key;
        private Record value;

        @Override
        public void setup(TaskContext context) throws IOException {
            // Both records are allocated once and reused for every map() call.
            key = context.createMapOutputKeyRecord();
            value = context.createMapOutputValueRecord();
        }

        @Override
        public void map(long recordNum, Record record, TaskContext context) throws IOException {
            // Mind the column types: the two *_dt columns are datetime, the *_int ones bigint.
            key.set(0, record.getString("gmsfhm"));
            value.set(0, record.getDatetime("rzsj_dt"));
            value.set(1, record.getDatetime("ldsj_dt"));
            value.set(2, record.getBigint("rzsj_int"));
            value.set(3, record.getBigint("ldsj_int"));
            value.set(4, record.getString("fh"));
            value.set(5, record.getString("zslgdm"));
            value.set(6, record.getString("zslg"));
            value.set(7, record.getString("ssdsdm"));
            value.set(8, record.getString("xbdm"));
            // No per-record console output here: one line per input row floods the task log.
            context.write(key, value);
        }
    }

    /**
     * Reduce stage: for every gmsfhm whose length is 18 or 15, writes the records that pass
     * all {@code UtilFunc} checks; every other record is dropped.
     *
     * <p>Output columns by index: 0 gmsfhm, 1 rzsj_dt, 2 ldsj_dt, 3 rzsj_int, 4 ldsj_int,
     * 5 fh, 6 zslg, 7 zslgdm, 8 ssdsdm, 9 xbdm (recomputed from gmsfhm, the mapper's xbdm
     * value is not used).
     */
    public static class CleanLgzsDataReducer extends ReducerBase {
        private static final String DATETIME_PATTERN = "yyyy-MM-dd HH:mm:ss";
        private static final int GMSFHM_LEN_18 = 18;
        private static final int GMSFHM_LEN_15 = 15;

        // Stand-ins for null source columns so the checks below never see null.
        // They are presumably chosen so that the UtilFunc checks reject them
        // (TODO confirm against UtilFunc); MISSING_GMSFHM fails the length test below.
        private static final String MISSING_GMSFHM = "123456";
        private static final String MISSING_RZSJ_DT = "rzsj_dt";
        private static final String MISSING_LDSJ_DT = "ldsj_dt";
        private static final Long MISSING_RZSJ_INT = 100L;
        private static final Long MISSING_LDSJ_INT = 200L;
        private static final String MISSING_FH = "足療";
        private static final String MISSING_ZSLG = "浴室";

        // Output record, allocated once in setup() and reused for every write.
        private Record result;
        // Created once per reducer instead of once per reduce() call.
        // NOTE(review): SimpleDateFormat is not thread-safe; this assumes a reducer instance
        // is driven by a single thread, as the reuse of `result` already does.
        private SimpleDateFormat sdf;

        @Override
        public void setup(TaskContext context) throws IOException {
            result = context.createOutputRecord();
            sdf = new SimpleDateFormat(DATETIME_PATTERN);
        }

        @Override
        public void reduce(Record key, Iterator<Record> values, TaskContext context) throws IOException {
            // The citizen ID number itself can be null in the source data.
            String rawGmsfhm = key.getString("gmsfhm");
            String gmsfhm = (rawGmsfhm != null) ? rawGmsfhm : MISSING_GMSFHM;

            int idLength = gmsfhm.length();
            if (idLength != GMSFHM_LEN_18 && idLength != GMSFHM_LEN_15) {
                return;
            }

            // Depends only on the group key, so it is computed once per group
            // (assumes UtilFunc.getXbdmFromGmsfhm has no side effects).
            String xbdm = UtilFunc.getXbdmFromGmsfhm(gmsfhm);

            while (values.hasNext()) {
                Record value = values.next();

                Date rzsjRaw = value.getDatetime("rzsj_dt");
                Date ldsjRaw = value.getDatetime("ldsj_dt");
                Long rzsjIntRaw = value.getBigint("rzsj_int");
                Long ldsjIntRaw = value.getBigint("ldsj_int");
                String zslgRaw = value.getString("zslg");
                String fhRaw = value.getString("fh");

                String rzsjText = (rzsjRaw != null) ? sdf.format(rzsjRaw) : MISSING_RZSJ_DT;
                String ldsjText = (ldsjRaw != null) ? sdf.format(ldsjRaw) : MISSING_LDSJ_DT;
                Long rzsjInt = (rzsjIntRaw != null) ? rzsjIntRaw : MISSING_RZSJ_INT;
                Long ldsjInt = (ldsjIntRaw != null) ? ldsjIntRaw : MISSING_LDSJ_INT;
                String zslg = (zslgRaw != null) ? zslgRaw : MISSING_ZSLG;
                String fh = (fhRaw != null) ? fhRaw : MISSING_FH;

                // Every check is evaluated for every record, in the same order as before.
                boolean rzsjOk = UtilFunc.checkSjLen(rzsjText);
                boolean ldsjOk = UtilFunc.checkSjLen(ldsjText);
                boolean fhOk = UtilFunc.checkFh(fh);
                boolean zslgOk = UtilFunc.checkZslg(zslg);
                boolean stayOk = UtilFunc.checkZslgTime(rzsjInt, ldsjInt);
                if (!(rzsjOk && ldsjOk && fhOk && zslgOk && stayOk)) {
                    continue;
                }

                // Going through the formatted text drops the sub-second part of the datetimes.
                Date rzsjDt;
                Date ldsjDt;
                try {
                    rzsjDt = sdf.parse(rzsjText);
                    ldsjDt = sdf.parse(ldsjText);
                } catch (ParseException e) {
                    // `result` is reused, so writing it now would emit the previous record's
                    // datetimes in columns 1 and 2. Drop the record instead.
                    System.err.println("Skipping record with unparseable datetime: " + e.getMessage());
                    continue;
                }

                result.set(0, gmsfhm);
                result.set(1, rzsjDt);
                result.set(2, ldsjDt);
                result.set(3, rzsjInt);
                result.set(4, ldsjInt);
                result.set(5, fh);
                result.set(6, zslg);
                result.set(7, value.getString("zslgdm"));
                result.set(8, value.get("ssdsdm"));
                result.set(9, xbdm);
                context.write(result);
            }
        }
    }
}
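// MapReduce DEMO
// (The blog-page residue that followed the code - "post a comment", "all comments",
// "nobody has commented yet" - is not part of the program.)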