// MapReduce demo: ODPS job that cleans hotel check-in (lgzs) records.

/**
 * @author cai_huaxue 36927
 * 数据清洗
 * 清洗规则
 * 1. 只保留公民身份证号长度是18位数的。
 * 2. 性别代码 从公民身份证号里获取,身份证号的倒数第二位是奇数则表示男性1, 如果是偶数则表示女性1。
 * 3. 筛选出rzsj_dt, ldsj_dt的长度在(17,18,19)的。
 * 4. 筛选出开房时间在5 min 到 90天之内的。
 * 5. 对zslg清洗,过滤掉旅馆名称含有[浴业,浴场,沐浴,洗浴,足浴,淋浴,浴室, 浴城,温泉,浴馆]
 * 6. 对fh清洗,过滤掉fh名称含有[厅,手牌,足疗,足浴,dt]
 * 7.对于某个房号,如果某天 内,开房次数超过10次,就把该房间当天的数据全部删除掉
 */
package com.dahua;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;

import com.aliyun.odps.OdpsException;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.data.TableInfo;
import com.aliyun.odps.mapred.JobClient;
import com.aliyun.odps.mapred.MapperBase;
import com.aliyun.odps.mapred.ReducerBase;
import com.aliyun.odps.mapred.conf.JobConf;
import com.aliyun.odps.mapred.utils.InputUtils;
import com.aliyun.odps.mapred.utils.OutputUtils;
import com.aliyun.odps.mapred.utils.SchemaUtils;
import com.dahua.utils.*;

public class CleanLgzsData {

	// Accepted citizen-ID (gmsfhm) lengths.
	// NOTE(review): the file header says only 18-digit IDs are kept, but the
	// original code also accepted 15-digit ones — confirm which rule is intended.
	private static final int GMSFHM_LEN_18 = 18;
	private static final int GMSFHM_LEN_15 = 15;

	private final String inputTable;
	private final String outputTable;

	/**
	 * @param inputTable  source ODPS table holding raw hotel check-in records
	 * @param outputTable destination ODPS table for the cleaned records
	 */
	public CleanLgzsData(String inputTable, String outputTable) {
		this.inputTable = inputTable;
		this.outputTable = outputTable;
	}

	/**
	 * Configures and submits the cleaning MapReduce job.
	 * GroupingColumns are normally a subset of KeySortColumns; KeySortColumns and
	 * PartitionColumns must be contained in the map output key schema.
	 *
	 * @throws OdpsException if job configuration or execution fails
	 */
	public void executeMR() throws OdpsException {
		JobConf aliJob = new JobConf();
		aliJob.setMapperClass(CleanLgzsDataMapper.class);
		aliJob.setReducerClass(CleanLgzsDataReducer.class);

		// Map stage: the key is the citizen ID, the value carries the other columns.
		aliJob.setMapOutputKeySchema(SchemaUtils.fromString("gmsfhm: String"));
		aliJob.setMapOutputValueSchema(SchemaUtils.fromString(
				"rzsj_dt: datetime, ldsj_dt: datetime, rzsj_int: bigint, ldsj_int: bigint, "
						+ "fh: String, zslgdm: String, zslg: String, ssdsdm: String, xbdm: String"));

		// Shuffle: records are hashed on gmsfhm to choose a reducer (several IDs may
		// share one reducer), sorted on gmsfhm, and grouped on gmsfhm so that one
		// reduce() call sees every record of a single citizen ID. Grouping columns
		// are usually set identically to the partition columns, as here.
		aliJob.setPartitionColumns(new String[]{"gmsfhm"});
		aliJob.setOutputKeySortColumns(new String[]{"gmsfhm"});
		aliJob.setOutputGroupingColumns(new String[]{"gmsfhm"});

		// Input and output tables.
		InputUtils.addTable(TableInfo.builder().tableName(inputTable).build(), aliJob);
		OutputUtils.addTable(TableInfo.builder().tableName(outputTable).build(), aliJob);

		JobClient.runJob(aliJob);
	}

	/**
	 * Map stage: forwards each record unchanged, keyed by citizen ID (gmsfhm).
	 */
	public static class CleanLgzsDataMapper extends MapperBase {
		private Record key;
		private Record value;

		@Override
		public void setup(TaskContext context) throws IOException {
			key = context.createMapOutputKeyRecord();
			value = context.createMapOutputValueRecord();
		}

		@Override
		public void map(long recordNum, Record record, TaskContext context) throws IOException {
			// Column accessors must match the declared value schema
			// (datetime vs bigint vs string).
			key.set(0, record.getString("gmsfhm"));
			value.set(0, record.getDatetime("rzsj_dt"));
			value.set(1, record.getDatetime("ldsj_dt"));
			value.set(2, record.getBigint("rzsj_int"));
			value.set(3, record.getBigint("ldsj_int"));
			value.set(4, record.getString("fh"));
			value.set(5, record.getString("zslgdm"));
			value.set(6, record.getString("zslg"));
			value.set(7, record.getString("ssdsdm"));
			value.set(8, record.getString("xbdm"));
			// (Removed the per-record System.out.println — logging every record is a
			// significant slowdown on large inputs.)
			context.write(key, value);
		}
	}

	/**
	 * Reduce stage: applies the cleaning rules from the file header to every record
	 * of one citizen ID and writes the survivors to the output table.
	 */
	public static class CleanLgzsDataReducer extends ReducerBase {

		// Placeholder for a null citizen ID. Its length (6) fails the length check
		// below, so records without an ID are dropped.
		private static final String INVALID_GMSFHM = "123456";

		private Record result;
		// SimpleDateFormat is not thread-safe, but each reducer task runs
		// single-threaded, so one instance per task is safe. Created once in
		// setup() instead of on every reduce() call.
		private SimpleDateFormat sdf;

		@Override
		public void setup(TaskContext context) throws IOException {
			// result is the reused output record.
			result = context.createOutputRecord();
			sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
		}

		@Override
		public void reduce(Record key, Iterator<Record> values, TaskContext context) throws IOException {
			// The citizen ID can be null in the source data.
			String gmsfhm = key.getString("gmsfhm") != null ? key.getString("gmsfhm") : INVALID_GMSFHM;

			// Rule 1: keep only IDs of an accepted length; drop everything else.
			if (gmsfhm.length() != GMSFHM_LEN_18 && gmsfhm.length() != GMSFHM_LEN_15) {
				return;
			}

			while (values.hasNext()) {
				Record value = values.next();

				// Sentinel defaults: each one is chosen to FAIL its corresponding
				// filter (wrong length / blacklisted term), so records with missing
				// fields are dropped instead of causing NullPointerExceptions.
				String rzsj_dt = "rzsj_dt";   // length 7 — fails checkSjLen
				String ldsj_dt = "ldsj_dt";   // length 7 — fails checkSjLen
				Long rzsj_int = 100L;
				Long ldsj_int = 200L;
				String fh = "足疗";            // blacklisted room name — fails checkFh
				String zslg = "浴室";          // blacklisted hotel name — fails checkZslg

				if (value.getDatetime("rzsj_dt") != null) {
					rzsj_dt = sdf.format(value.getDatetime("rzsj_dt"));
				}
				if (value.getDatetime("ldsj_dt") != null) {
					ldsj_dt = sdf.format(value.getDatetime("ldsj_dt"));
				}
				if (value.getBigint("rzsj_int") != null) {
					rzsj_int = value.getBigint("rzsj_int");
				}
				if (value.getBigint("ldsj_int") != null) {
					ldsj_int = value.getBigint("ldsj_int");
				}
				if (value.getString("zslg") != null) {
					zslg = value.getString("zslg");
				}
				if (value.getString("fh") != null) {
					fh = value.getString("fh");
				}

				// Rule 2: derive the gender code from the ID number.
				String xbdm = UtilFunc.getXbdmFromGmsfhm(gmsfhm);

				// Rules 3-6: timestamp string length, room-name blacklist,
				// hotel-name blacklist, and stay duration (5 min .. 90 days).
				boolean keep = UtilFunc.checkSjLen(rzsj_dt)
						&& UtilFunc.checkSjLen(ldsj_dt)
						&& UtilFunc.checkFh(fh)
						&& UtilFunc.checkZslg(zslg)
						&& UtilFunc.checkZslgTime(rzsj_int, ldsj_int);
				if (!keep) {
					continue;
				}

				Date rzsjDt;
				Date ldsjDt;
				try {
					rzsjDt = sdf.parse(rzsj_dt);
					ldsjDt = sdf.parse(ldsj_dt);
				} catch (ParseException e) {
					// Bug fix: the original swallowed this exception and still wrote
					// the record, which left columns 1/2 holding stale values from a
					// previous iteration (result is reused). Log and skip instead.
					System.err.println("Unparseable timestamp for gmsfhm=" + gmsfhm + ": " + e.getMessage());
					continue;
				}

				result.set(0, gmsfhm);
				result.set(1, rzsjDt);
				result.set(2, ldsjDt);
				result.set(3, rzsj_int);
				result.set(4, ldsj_int);
				result.set(5, fh);
				result.set(6, zslg);
				result.set(7, value.getString("zslgdm"));
				// Consistency: was value.get("ssdsdm"); use getString like the others.
				result.set(8, value.getString("ssdsdm"));
				result.set(9, xbdm);
				context.write(result);
			}
		}
	}
}
