/**
* @author cai_huaxue 36927
* 数据清洗
* 清洗规则
* 1. 只保留公民身份证号长度是18位数的。
* 2. 性别代码 从公民身份证号里获取,身份证号的倒数第二位是奇数则表示男性1, 如果是偶数则表示女性2。
* 3. 筛选出rzsj_dt, ldsj_dt的长度在(17,18,19)的。
* 4. 筛选出开房时间在5 min 到 90天之内的。
* 5. 对zslg清洗,过滤掉旅馆名称含有[浴业,浴场,沐浴,洗浴,足浴,淋浴,浴室, 浴城,温泉,浴馆]
* 6. 对fh清洗,过滤掉fh名称含有[厅,手牌,足疗,足浴,dt]
* 7.对于某个房号,如果某天 内,开房次数超过10次,就把该房间当天的数据全部删除掉
*/
package com.dahua;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import com.aliyun.odps.OdpsException;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.data.TableInfo;
import com.aliyun.odps.mapred.JobClient;
import com.aliyun.odps.mapred.MapperBase;
import com.aliyun.odps.mapred.ReducerBase;
import com.aliyun.odps.mapred.conf.JobConf;
import com.aliyun.odps.mapred.utils.InputUtils;
import com.aliyun.odps.mapred.utils.OutputUtils;
import com.aliyun.odps.mapred.utils.SchemaUtils;
import com.dahua.utils.*;
/**
 * ODPS MapReduce job that cleans hotel lodging ("lgzs") check-in records according
 * to the rules listed in the file header: it drops records with malformed citizen id
 * numbers, malformed check-in/check-out timestamps, implausible stay durations, and
 * bathhouse-like venue or room names, and derives the gender code from the id number.
 *
 * NOTE(review): header rule 7 (drop a room's records for a day when it was opened
 * more than 10 times that day) is NOT implemented anywhere in this job.
 */
public class CleanLgzsData {
    /** Source ODPS table holding the raw lodging records. */
    private final String inputTable;
    /** Destination ODPS table receiving the cleaned records. */
    private final String outputTable;

    public CleanLgzsData(String inputTable, String outputTable) {
        this.inputTable = inputTable;
        this.outputTable = outputTable;
    }

    /**
     * Configures and submits the MapReduce job, blocking until it completes.
     *
     * @throws OdpsException if job configuration or submission fails
     */
    public void executeMR() throws OdpsException {
        JobConf aliJob = new JobConf();
        aliJob.setMapperClass(CleanLgzsDataMapper.class);
        aliJob.setReducerClass(CleanLgzsDataReducer.class);

        // Map stage schemas. Convention: GroupingColumns are a subset of
        // KeySortColumns; KeySortColumns and PartitionColumns must be part of the
        // key schema. Types use the documented lowercase "name:type" form.
        aliJob.setMapOutputKeySchema(SchemaUtils.fromString("gmsfhm:string"));
        aliJob.setMapOutputValueSchema(SchemaUtils.fromString(
                "rzsj_dt:datetime,ldsj_dt:datetime,rzsj_int:bigint,ldsj_int:bigint,"
                + "fh:string,zslgdm:string,zslg:string,ssdsdm:string,xbdm:string"));

        // Shuffle: mapper output is hash-partitioned on gmsfhm (several different id
        // numbers may land on the same reducer) and sorted on gmsfhm.
        aliJob.setPartitionColumns(new String[]{"gmsfhm"});
        aliJob.setOutputKeySortColumns(new String[]{"gmsfhm"});

        // Reduce stage: records sharing the same gmsfhm become one reduce() call.
        // Typically configured identically to the partition columns, as here.
        aliJob.setOutputGroupingColumns(new String[]{"gmsfhm"});

        InputUtils.addTable(TableInfo.builder().tableName(inputTable).build(), aliJob);
        OutputUtils.addTable(TableInfo.builder().tableName(outputTable).build(), aliJob);
        JobClient.runJob(aliJob);
    }

    /**
     * Map stage: forwards every raw record unchanged, keyed by the citizen id
     * number (gmsfhm). All filtering happens in the reducer.
     */
    public static class CleanLgzsDataMapper extends MapperBase {
        private Record key;
        private Record value;

        @Override
        public void setup(TaskContext context) throws IOException {
            key = context.createMapOutputKeyRecord();
            value = context.createMapOutputValueRecord();
        }

        @Override
        public void map(long recordNum, Record record, TaskContext context) throws IOException {
            // Column types matter here: rzsj_dt/ldsj_dt are DATETIME, *_int are BIGINT.
            key.set(0, record.getString("gmsfhm"));
            value.set(0, record.getDatetime("rzsj_dt"));
            value.set(1, record.getDatetime("ldsj_dt"));
            value.set(2, record.getBigint("rzsj_int"));
            value.set(3, record.getBigint("ldsj_int"));
            value.set(4, record.getString("fh"));
            value.set(5, record.getString("zslgdm"));
            value.set(6, record.getString("zslg"));
            value.set(7, record.getString("ssdsdm"));
            value.set(8, record.getString("xbdm"));
            context.write(key, value);
        }
    }

    /**
     * Reduce stage: applies the cleaning rules to every record of one citizen id
     * and writes the surviving records.
     *
     * Null handling: null source columns are replaced by sentinel values chosen to
     * fail one of the later filters, so records missing a critical field are
     * silently dropped instead of throwing NullPointerException.
     */
    public static class CleanLgzsDataReducer extends ReducerBase {
        private Record result;

        @Override
        public void setup(TaskContext context) throws IOException {
            // Reused for every output record of this reducer instance.
            result = context.createOutputRecord();
        }

        @Override
        public void reduce(Record key, Iterator<Record> values, TaskContext context)
                throws IOException {
            // Sentinel for a null id number: its length (6) fails the 15/18 check
            // below, so such groups are dropped entirely.
            String gmsfhm = "123456";
            if (key.getString("gmsfhm") != null) {
                gmsfhm = key.getString("gmsfhm");
            }
            // SimpleDateFormat is not thread-safe; kept local to this call on purpose.
            SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
            // Rule 1: keep only 18-digit ids. NOTE(review): the file header says 18
            // only, but legacy 15-digit ids are also accepted here — confirm intent.
            final int idLen18 = 18;
            final int idLen15 = 15;
            if (gmsfhm.length() == idLen18 || gmsfhm.length() == idLen15) {
                while (values.hasNext()) {
                    Record value = values.next();
                    // Sentinels for null columns: the dummy timestamp strings fail
                    // checkSjLen (wrong length), "足疗"/"浴室" are on the fh/zslg
                    // blacklists, and 100L/200L presumably fail the 5-minute minimum
                    // stay in checkZslgTime — TODO confirm the time unit.
                    String rzsj_dt = "rzsj_dt";
                    String ldsj_dt = "ldsj_dt";
                    Long rzsj_int = 100L;
                    Long ldsj_int = 200L;
                    String fh = "足疗";
                    String zslg = "浴室";
                    if (value.getDatetime("rzsj_dt") != null) {
                        rzsj_dt = sdf.format(value.getDatetime("rzsj_dt"));
                    }
                    if (value.getDatetime("ldsj_dt") != null) {
                        ldsj_dt = sdf.format(value.getDatetime("ldsj_dt"));
                    }
                    if (value.getBigint("rzsj_int") != null) {
                        rzsj_int = value.getBigint("rzsj_int");
                    }
                    if (value.getBigint("ldsj_int") != null) {
                        ldsj_int = value.getBigint("ldsj_int");
                    }
                    if (value.getString("zslg") != null) {
                        zslg = value.getString("zslg");
                    }
                    if (value.getString("fh") != null) {
                        fh = value.getString("fh");
                    }
                    String xbdm = UtilFunc.getXbdmFromGmsfhm(gmsfhm);  // rule 2: gender from id
                    boolean rzsjOk = UtilFunc.checkSjLen(rzsj_dt);     // rule 3
                    boolean ldsjOk = UtilFunc.checkSjLen(ldsj_dt);     // rule 3
                    boolean fhOk = UtilFunc.checkFh(fh);               // rule 6
                    boolean zslgOk = UtilFunc.checkZslg(zslg);         // rule 5
                    boolean stayOk = UtilFunc.checkZslgTime(rzsj_int, ldsj_int); // rule 4
                    if (rzsjOk && ldsjOk && fhOk && zslgOk && stayOk) {
                        Date rzsjDt;
                        Date ldsjDt;
                        try {
                            rzsjDt = sdf.parse(rzsj_dt);
                            ldsjDt = sdf.parse(ldsj_dt);
                        } catch (java.text.ParseException e) {
                            // BUG FIX: the original swallowed this exception and still
                            // wrote the record, reusing stale datetime values left in
                            // the shared `result` record by a previous iteration.
                            // An unparseable timestamp now drops the record instead.
                            continue;
                        }
                        result.set(0, gmsfhm);
                        result.set(1, rzsjDt);
                        result.set(2, ldsjDt);
                        result.set(3, rzsj_int);
                        result.set(4, ldsj_int);
                        result.set(5, fh);
                        result.set(6, zslg);
                        result.set(7, value.getString("zslgdm"));
                        result.set(8, value.getString("ssdsdm"));
                        result.set(9, xbdm);
                        context.write(result);
                    }
                }
            }
        }
    }
}
// (removed: non-Java blog comment-section residue that broke compilation)