實際需要的字段:
手機號碼、上行數據包數、下行數據包數、上行總流量、下行總流量。
2) 自定義數據類型(五個字段)
DataWritable 實現 Writable 接口;手機號碼作爲 Map 輸出的 Key,不保存在該類中,類中只保存其餘四個流量字段。
1 package org.dragon.hadoop.mapreduce.app; 2 3 import java.io.DataInput; 4 import java.io.DataOutput; 5 import java.io.IOException; 6 7 import org.apache.hadoop.io.Writable; 8 9 /** 10 * 11 * @author ZhuXY 12 * @time 2016-3-10 下午3:49:55 13 * 14 */ 15 public class DataWritable implements Writable { 16 17 // telsphone 18 19 // upload 20 private int upPackNum; 21 private int upPayLoad; 22 23 // download 24 private int downPackNum; 25 private int downPayLoad; 26 27 public DataWritable() { 28 29 } 30 31 public void set(int upPackNum, int upPayLoad, int downPackNum, 32 int downPayload) { 33 this.upPackNum = upPackNum; 34 this.upPayLoad = upPayLoad; 35 this.downPackNum = downPackNum; 36 this.downPayLoad = downPayload; 37 38 } 39 40 public int getUpPackNum() { 41 return upPackNum; 42 } 43 44 public int getUpPayLoas() { 45 return upPayLoad; 46 } 47 48 public int getDownPackNum() { 49 return downPackNum; 50 } 51 52 public int getDownPayload() { 53 return downPayLoad; 54 } 55 56 @Override 57 public void write(DataOutput out) throws IOException { 58 out.writeInt(upPackNum); 59 out.writeInt(upPayLoad); 60 out.writeInt(downPackNum); 61 out.writeInt(downPayLoad); 62 } 63 64 /** 65 * 讀出的順序要和寫入的順序相同 66 */ 67 @Override 68 public void readFields(DataInput in) throws IOException { 69 // TODO Auto-generated method stub 70 this.upPackNum = in.readInt(); 71 this.upPayLoad = in.readInt(); 72 this.downPackNum = in.readInt(); 73 this.downPayLoad = in.readInt(); 74 } 75 76 @Override 77 public String toString() { 78 return upPackNum + "\t" + upPayLoad + "\t" + downPackNum + "\t" 79 + downPayLoad; 80 } 81 82 @Override 83 public int hashCode() { 84 final int prime = 31; 85 int result = 1; 86 result = prime * result + downPackNum; 87 result = prime * result + downPayLoad; 88 result = prime * result + upPackNum; 89 result = prime * result + upPayLoad; 90 return result; 91 } 92 93 @Override 94 public boolean equals(Object obj) { 95 if (this == obj) 96 return true; 97 if (obj == null) 98 return 
false; 99 if (getClass() != obj.getClass()) 100 return false; 101 DataWritable other = (DataWritable) obj; 102 if (downPackNum != other.downPackNum) 103 return false; 104 if (downPayLoad != other.downPayLoad) 105 return false; 106 if (upPackNum != other.upPackNum) 107 return false; 108 if (upPayLoad != other.upPayLoad) 109 return false; 110 return true; 111 } 112 113 }
3) 分析MapReduce寫法,哪些業務邏輯在Map階段執行,哪些業務邏輯在reduce階段執行。
Map階段:從文件中獲取數據,抽取出需要的五個字段,輸出的Key爲手機號碼,輸出的Value爲數據量的類型DataWritable對象。
Reduce階段:將相同手機號碼的Value中的數據流量進行相加,得出手機流量的總數(數據包和數據流量)。輸出到文件中,以製表符分開。
1 package org.dragon.hadoop.mapreduce.app.topk; 2 3 import java.io.IOException; 4 import java.util.Iterator; 5 import java.util.TreeMap; 6 import java.util.TreeSet; 7 8 import javax.security.auth.callback.LanguageCallback; 9 10 import org.apache.hadoop.classification.InterfaceAudience.Private; 11 import org.apache.hadoop.conf.Configuration; 12 import org.apache.hadoop.fs.Path; 13 import org.apache.hadoop.io.LongWritable; 14 import org.apache.hadoop.io.NullWritable; 15 import org.apache.hadoop.io.Text; 16 import org.apache.hadoop.mapreduce.Job; 17 import org.apache.hadoop.mapreduce.Mapper; 18 import org.apache.hadoop.mapreduce.Reducer; 19 import org.apache.hadoop.mapreduce.Mapper.Context; 20 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 21 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 22 import org.dragon.hadoop.mapreduce.app.topk.TopKMapReduceV3.TopKReducer; 23 24 import com.sun.jersey.core.header.LanguageTag; 25 26 import sun.reflect.LangReflectAccess; 27 28 /** 29 * 30 * @author ZhuXY 31 * @time 2016-3-13 下午12:57:26 32 * 33 */ 34 35 /** 36 * 統計 & TopKey 37 * 38 * 數據格式: 語言類別 歌曲名稱 收藏次數 播放次數 歌手名稱 需求: 統計前十首播放次數最多的歌曲名稱和次數。 39 * 40 * 思想:在Mapper中輸出:key---歌曲類型+歌曲名稱 41 * value---播放次數 42 * Reducer中:key----封裝成TopKWritable對象 43 * value---nullwritable 44 * reduce方法中進行集合存儲,然後刪除多餘的 45 * 46 */ 47 public class TopKMapReduceV4 { 48 private static final int KEY = 4; 49 50 // Mapper class 51 public static class TopKMapper extends 52 Mapper<LongWritable, Text, Text, LongWritable> { 53 54 @Override 55 protected void cleanup(Context context) throws IOException, 56 InterruptedException { 57 super.cleanup(context); 58 } 59 60 @Override 61 protected void map(LongWritable key, Text value, Context context) 62 throws IOException, InterruptedException { 63 //文件的輸入類型爲TextInputFormat,默認到map中的爲<Longwritable,Text> 64 String lineValue = value.toString(); 65 66 if (null == lineValue) { 67 return; 68 } 69 70 //split 71 String[] splitValue = lineValue.split("\t"); 72 
73 if (splitValue != null && splitValue.length == 5) { 74 String languageType = splitValue[0]; 75 String songName = splitValue[1]; 76 Long playNum = Long.parseLong(splitValue[3]); 77 78 context.write(new Text(languageType + "\t" + songName), 79 new LongWritable(playNum)); 80 } 81 } 82 83 @Override 84 protected void setup(Context context) throws IOException, 85 InterruptedException { 86 // TODO Auto-generated method stub 87 super.setup(context); 88 } 89 } 90 91 // Reducer class 92 public static class TopKReducer extends 93 Reducer<Text, LongWritable, TopKWritable, NullWritable> { 94 95 //此集合的排序規則即爲TopKWritable中comparaTo的排序規則 96 TreeSet<TopKWritable> treeSet=new TreeSet<TopKWritable>(); 97 98 @Override 99 protected void setup(Context context) 100 throws IOException, InterruptedException { 101 // TODO Auto-generated method stub 102 super.setup(context); 103 } 104 105 @Override 106 protected void reduce(Text key, Iterable<LongWritable> values, 107 Context context) throws IOException, InterruptedException { 108 109 Long palyNum=(long) 0; 110 if (key==null) { 111 return; 112 } 113 114 //get key 115 String[] keyArr=key.toString().split("\t"); 116 String languageType=keyArr[0]; 117 String songName=keyArr[1]; 118 119 //sum 120 for(LongWritable value:values){ 121 palyNum+=value.get(); 122 } 123 124 //歌曲類型、歌曲名稱、歌曲播放次數封裝成TopKWritable對象,保存在treeSet集合中,此集合自動排序 125 treeSet.add(new TopKWritable( 126 languageType,songName,palyNum 127 )); 128 129 if (treeSet.size()>KEY) { 130 treeSet.remove(treeSet.last());//remove the current small longNum 131 } 132 } 133 134 @Override 135 protected void cleanup(Context context) 136 throws IOException, InterruptedException { 137 for (TopKWritable topKWritable : treeSet) { 138 context.write(topKWritable,NullWritable.get()); 139 } 140 } 141 142 143 } 144 // Driver Code 145 public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 146 // get conf 147 Configuration conf=new Configuration(); 148 149 // create job 
150 Job job =new Job(conf, TopKMapReduceV4.class.getSimpleName());//Job name 151 152 // set job 153 job.setJarByClass(TopKMapReduceV4.class); 154 155 // 1)set inputPath 156 FileInputFormat.addInputPath(job, new Path(args[0])); 157 158 // 2)set map 159 job.setMapperClass(TopKMapper.class); 160 job.setMapOutputKeyClass(Text.class); 161 job.setMapOutputValueClass(LongWritable.class); 162 163 // 3)set outputPath 164 FileOutputFormat.setOutputPath(job, new Path(args[1])); 165 166 // 4)set reduce 167 job.setReducerClass(TopKReducer.class); 168 job.setOutputKeyClass(TopKWritable.class); 169 job.setOutputValueClass(NullWritable.class); 170 171 // submit job 172 boolean isSuccess=job.waitForCompletion(true); 173 174 //return status 175 return isSuccess?0:1; 176 } 177 178 public static void main(String[] args) throws IOException, InterruptedException, Exception { 179 180 args=new String[]{ 181 "hdfs://hadoop-master.dragon.org:9000/wc/wcinput/", 182 "hdfs://hadoop-master.dragon.org:9000/wc/wcoutput" 183 }; 184 int status =new TopKMapReduceV4().run(args); 185 System.exit(status); 186 } 187 }
實際的業務中,原始數據存儲在文件或者關係型數據庫中,需要進行多次數據清理和篩選,保留符合我們需要的數據,將不合格的數據全部過濾掉。
Sqoop 框架:用於在關係型數據庫與 HBase、Hive 以及 HDFS 之間導入、導出數據。
針對實際的複雜業務,都需要自己編寫代碼進行數據的清洗。