// MapReduce DEMO

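/**
 * @author cai_huaxue 36927
 * Data cleaning.
 * Cleaning rules:
 * 1. Keep only records whose citizen ID number (gmsfhm) is 18 characters long.
 *    NOTE(review): the reducer in this file also lets 15-character IDs through; confirm which is intended.
 * 2. The gender code (xbdm) is derived from the citizen ID number: if the second-to-last digit is odd
 *    the person is male (1); if it is even the person is female (the original note also wrote "1" here,
 *    which looks like a typo; confirm the real code against UtilFunc.getXbdmFromGmsfhm).
 * 3. Keep only records whose rzsj_dt and ldsj_dt have a length in (17, 18, 19).
 * 4. Keep only stays whose duration is between 5 minutes and 90 days.
 * 5. Clean zslg: drop records whose hotel name contains any of
 *    [浴業, 浴場, 沐浴, 洗浴, 足浴, 淋浴, 浴室, 浴城, 溫泉, 浴館].
 * 6. Clean fh: drop records whose room name contains any of [廳, 手牌, 足療, 足浴, dt].
 * 7. For a given room number, if it was checked into more than 10 times within one day,
 *    delete all of that room's data for that day.
 *    NOTE(review): no step in this file appears to implement rule 7; confirm where it is applied.
 */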
package com.dahua;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;

import com.aliyun.odps.OdpsException;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.data.TableInfo;
import com.aliyun.odps.mapred.JobClient;
import com.aliyun.odps.mapred.MapperBase;
import com.aliyun.odps.mapred.ReducerBase;
import com.aliyun.odps.mapred.conf.JobConf;
import com.aliyun.odps.mapred.utils.InputUtils;
import com.aliyun.odps.mapred.utils.OutputUtils;
import com.aliyun.odps.mapred.utils.SchemaUtils;
import com.dahua.utils.*;

public class CleanLgzsData {
	
	/** Name of the table registered as the job input in {@code executeMR}. */
	private final String inputTable;

	/** Name of the table registered as the job output in {@code executeMR}. */
	private final String outputTable;

	/**
	 * Creates a cleaning job bound to one input table and one output table.
	 *
	 * @param inputTable  name of the table the job reads from
	 * @param outputTable name of the table the job writes to
	 */
	public CleanLgzsData(String inputTable, String outputTable) {
		this.inputTable = inputTable;
		this.outputTable = outputTable;
	}
	
	 /**
	  * Assembles the job configuration for the cleaning job and passes it to
	  * {@code JobClient.runJob}.
	  *
	  * <p>The map output is keyed, partitioned, sorted and grouped by the single
	  * {@code gmsfhm} column; the other nine columns travel as the map output value.
	  *
	  * @throws OdpsException declared because {@code JobClient.runJob} is called here
	  */
	 public void executeMR() throws OdpsException {
	     // Tables this job reads from and writes to.
	     TableInfo source = TableInfo.builder().tableName(inputTable).build();
	     TableInfo sink = TableInfo.builder().tableName(outputTable).build();

	     JobConf conf = new JobConf();
	     conf.setMapperClass(CleanLgzsDataMapper.class);
	     conf.setReducerClass(CleanLgzsDataReducer.class);

	     // As a rule the grouping columns are contained in the key sort columns, and both the
	     // key sort columns and the partition columns must be part of the key schema.

	     // --- Map phase ---
	     // Schema of the key rows the mapper hands to the reducer.
	     conf.setMapOutputKeySchema(SchemaUtils.fromString("gmsfhm: String"));
	     // Schema of the value rows the mapper hands to the reducer.
	     conf.setMapOutputValueSchema(SchemaUtils.fromString(
	             "rzsj_dt: datetime, ldsj_dt: datetime, rzsj_int: bigint, ldsj_int: bigint, fh: String, zslgdm: String, zslg: String, ssdsdm: String, xbdm: String"));

	     // --- Shuffle (merge and sort) ---
	     final String keyColumn = "gmsfhm";
	     // Mapper output records are hashed on the partition columns to pick a reducer,
	     // so several gmsfhm values may end up on the same reducer.
	     conf.setPartitionColumns(new String[] {keyColumn});
	     // Mapper output records are sorted on the key sort columns.
	     conf.setOutputKeySortColumns(new String[] {keyColumn});

	     // --- Reduce phase ---
	     // Records that agree on the grouping columns are delivered to one reduce() call.
	     // This normally matches what was passed to setPartitionColumns.
	     conf.setOutputGroupingColumns(new String[] {keyColumn});

	     // Input table.
	     InputUtils.addTable(source, conf);
	     // Output table.
	     OutputUtils.addTable(sink, conf);

	     JobClient.runJob(conf);
	 }
	 
		/**
		 * Map phase: re-emits every input row keyed by its {@code gmsfhm} column.
		 *
		 * <p>No filtering happens here. The nine remaining columns are copied into the value
		 * record in the order of the map output value schema declared in {@code executeMR}.
		 */
		public static class CleanLgzsDataMapper extends MapperBase {
			/** Key record, created once in {@link #setup} and reused for every write. */
			private Record outKey;
			/** Value record, created once in {@link #setup} and reused for every write. */
			private Record outValue;

			/**
			 * Allocates the reusable map output key and value records.
			 *
			 * @param context task context that creates the output records
			 * @throws IOException declared by the overridden method
			 */
			@Override
			public void setup(TaskContext context) throws IOException {
				outKey = context.createMapOutputKeyRecord();
				outValue = context.createMapOutputValueRecord();
			}

			/**
			 * Copies one input row into the key/value records and writes them.
			 *
			 * <p>Nothing is printed per record: a stdout write for every input row only
			 * floods the task log and slows the hot path.
			 *
			 * @param recordNum record number supplied by the framework (unused)
			 * @param record    input row; columns are read by name
			 * @param context   task context the key/value pair is written to
			 * @throws IOException declared by the overridden method
			 */
			@Override
			public void map(long recordNum, Record record, TaskContext context) throws IOException {
				outKey.set(0, record.getString("gmsfhm"));

				// Mind the column types: the two *_dt columns are datetime, the two *_int columns bigint.
				outValue.set(0, record.getDatetime("rzsj_dt"));
				outValue.set(1, record.getDatetime("ldsj_dt"));
				outValue.set(2, record.getBigint("rzsj_int"));
				outValue.set(3, record.getBigint("ldsj_int"));
				outValue.set(4, record.getString("fh"));
				outValue.set(5, record.getString("zslgdm"));
				outValue.set(6, record.getString("zslg"));
				outValue.set(7, record.getString("ssdsdm"));
				// Forwarded unchanged; the reducer in this file derives xbdm from gmsfhm instead.
				outValue.set(8, record.getString("xbdm"));

				context.write(outKey, outValue);
			}

		}
	    
	    /**
	     * Reduce phase: runs the {@code UtilFunc} cleaning checks on every record of one
	     * {@code gmsfhm} group and writes the records that pass all of them.
	     *
	     * <p>Input columns that are null are replaced by placeholder values before they are
	     * handed to the checks, so the checks never receive null.
	     * NOTE(review): the placeholders look chosen so that the checks reject them (for
	     * example the room and hotel placeholders are words from the filter lists); confirm
	     * against {@code UtilFunc}.
	     */
	    public static class CleanLgzsDataReducer extends ReducerBase {

	        /** Pattern used to render datetimes for the length check and to parse them back. */
	        private static final String DATETIME_PATTERN = "yyyy-MM-dd HH:mm:ss";

	        /** Accepted lengths of the citizen ID number. */
	        private static final int ID_LENGTH_18 = 18;
	        private static final int ID_LENGTH_15 = 15;

	        /** Placeholders substituted for null input columns before the checks run. */
	        private static final String MISSING_RZSJ_DT = "rzsj_dt";
	        private static final String MISSING_LDSJ_DT = "ldsj_dt";
	        private static final Long MISSING_RZSJ_INT = 100L;
	        private static final Long MISSING_LDSJ_INT = 200L;
	        private static final String MISSING_FH = "足療";
	        private static final String MISSING_ZSLG = "浴室";

	        /** Output record, created once in {@link #setup} and reused for every write. */
	        private Record result;

	        /**
	         * Formatter kept per reducer instance rather than in a static field, because
	         * {@link SimpleDateFormat} is not thread-safe.
	         */
	        private SimpleDateFormat dateFormat;

	        /**
	         * Allocates the reusable output record and the datetime formatter.
	         *
	         * @param context task context that creates the output record
	         * @throws IOException declared by the overridden method
	         */
	        @Override
	        public void setup(TaskContext context) throws IOException {
	            result = context.createOutputRecord();
	            dateFormat = new SimpleDateFormat(DATETIME_PATTERN);
	        }

	        /**
	         * Filters the records of one {@code gmsfhm} group.
	         *
	         * <p>The whole group is dropped when the ID is null or its length is neither 18 nor
	         * 15. Otherwise each record is written only if all five {@code UtilFunc} checks
	         * pass and both datetimes can be parsed back from their formatted text.
	         *
	         * @param key     key record holding the {@code gmsfhm} column
	         * @param values  records sharing that key
	         * @param context task context the surviving records are written to
	         * @throws IOException declared by the overridden method
	         */
	        @Override
	        public void reduce(Record key, Iterator<Record> values, TaskContext context) throws IOException {
	            // The citizen ID number can be null in the source data; such groups are dropped.
	            String gmsfhm = key.getString("gmsfhm");
	            if (gmsfhm == null) {
	                return;
	            }
	            int idLength = gmsfhm.length();
	            if (idLength != ID_LENGTH_18 && idLength != ID_LENGTH_15) {
	                return;
	            }

	            // Depends only on the group key, so it is derived once per group.
	            String xbdm = UtilFunc.getXbdmFromGmsfhm(gmsfhm);

	            while (values.hasNext()) {
	                Record value = values.next();

	                String rzsjText = formatOrDefault(value.getDatetime("rzsj_dt"), MISSING_RZSJ_DT);
	                String ldsjText = formatOrDefault(value.getDatetime("ldsj_dt"), MISSING_LDSJ_DT);
	                Long rzsjInt = orDefault(value.getBigint("rzsj_int"), MISSING_RZSJ_INT);
	                Long ldsjInt = orDefault(value.getBigint("ldsj_int"), MISSING_LDSJ_INT);
	                String fh = orDefault(value.getString("fh"), MISSING_FH);
	                String zslg = orDefault(value.getString("zslg"), MISSING_ZSLG);

	                // Every check is evaluated for every record, as before; only the combination differs.
	                boolean rzsjOk = UtilFunc.checkSjLen(rzsjText);
	                boolean ldsjOk = UtilFunc.checkSjLen(ldsjText);
	                boolean fhOk = UtilFunc.checkFh(fh);
	                boolean zslgOk = UtilFunc.checkZslg(zslg);
	                boolean stayOk = UtilFunc.checkZslgTime(rzsjInt, ldsjInt);
	                if (!(rzsjOk && ldsjOk && fhOk && zslgOk && stayOk)) {
	                    continue;
	                }

	                Date rzsjDt;
	                Date ldsjDt;
	                try {
	                    rzsjDt = dateFormat.parse(rzsjText);
	                    ldsjDt = dateFormat.parse(ldsjText);
	                } catch (ParseException e) {
	                    // Only reachable when a placeholder slipped through the checks. The output
	                    // record is reused, so writing it now would emit the previous record's
	                    // datetimes; skip the record instead.
	                    continue;
	                }

	                result.set(0, gmsfhm);
	                result.set(1, rzsjDt);
	                result.set(2, ldsjDt);
	                result.set(3, rzsjInt);
	                result.set(4, ldsjInt);
	                result.set(5, fh);
	                result.set(6, zslg);
	                result.set(7, value.getString("zslgdm"));
	                result.set(8, value.getString("ssdsdm"));
	                result.set(9, xbdm);
	                context.write(result);
	            }
	        }

	        /** Formats {@code date} with the reducer's pattern, or returns {@code fallback} when it is null. */
	        private String formatOrDefault(Date date, String fallback) {
	            return date != null ? dateFormat.format(date) : fallback;
	        }

	        /** Returns {@code candidate} unless it is null, in which case {@code fallback} is returned. */
	        private static <T> T orDefault(T candidate, T fallback) {
	            return candidate != null ? candidate : fallback;
	        }

        }
}

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章