Hadoop in Action [Part 2: MapReduce + Custom Data Types]

Part 1. Divide and Conquer: MapReduce

  • HDFS is Hadoop's storage layer, and MapReduce is its computation layer.
  • MapReduce follows the idea of "divide and conquer": an operation on a large data set is split up and handed to worker nodes coordinated by a master node, and the intermediate results from those workers are then combined into the final result.
  • Map phase: the framework cuts the job's input data into fixed-size splits, then breaks each split down into a batch of key-value pairs <K1,V1>. Hadoop creates one Map task per split to run the user-defined map function, which takes the <K1,V1> pairs as input and produces intermediate results <K2,V2>. The intermediate results are sorted by K2, values sharing the same key are grouped into <K2, list<V2>> tuples, and the groups are assigned to different Reduce tasks.
  • Reduce phase: each Reducer merges the data it receives from the different Mappers and calls the user-defined reduce function on the <K2, list<V2>> groups coming from the map side, producing the final result <K3,V3>, which is written out to HDFS (see the short trace below).
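As a quick hand-worked trace of this flow (using a made-up two-line input file), a word-count job moves data through the phases roughly like this:

    <K1,V1> fed to map (byte offset, line):      <0, "hello world">  <12, "hello hadoop">
    <K2,V2> emitted by map:                      <hello,1> <world,1> <hello,1> <hadoop,1>
    <K2,list<V2>> after sorting and grouping:    <hadoop,[1]> <hello,[1,1]> <world,[1]>
    <K3,V3> emitted by reduce:                   <hadoop,1> <hello,2> <world,1>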

         

          WordCount Example

package com.yc.hadoop;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {
	private final static IntWritable one = new IntWritable(1);
	
	/**
	 * Custom Mapper.
	 * Splits each input line into words and emits every word as the key with a value of 1.
	 * After map() finishes, the Hadoop framework sorts and groups the intermediate data.
	 * @author wrm
	 *
	 */
	public static class TokenizerMapper 
	extends Mapper<LongWritable, Text, Text, IntWritable>{
		private Text word = new Text();

		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String[] vs = value.toString().split("\\s+");	// split the line on whitespace
			for (String v : vs) {
				word.set(v);
				context.write(word, one);	// emit <word, 1>
			}
		}
	}
	
	/**
	 * Custom Reducer.
	 * Aggregates the data coming from the Mappers. Because Hadoop has already grouped
	 * the intermediate data and the key is a word, identical words arrive as <K, list<V>>;
	 * we iterate over the list, accumulate the counts, and emit the total.
	 * @author wrm
	 *
	 */
	public static class IntSumReducer 
	extends Reducer<Text,IntWritable,Text,IntWritable> {
		private IntWritable result = new IntWritable();

		@Override
		protected void reduce(Text key, Iterable<IntWritable> value,
				Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
			int count = 0;
			for (IntWritable vs : value) {
				count += vs.get();	// sum the counts for this word
			}
			result.set(count);
			context.write(key, result);	// emit <word, total>
		}
	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		//    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
		//    System.out.println(otherArgs);
		//    if (otherArgs.length != 2) {
		//      System.err.println("Usage: wordcount <in> <out>");
		//      System.exit(2);
		//    }
		Job job = Job.getInstance(conf, "Word Count");
		job.setJarByClass(WordCount.class);		//set the driver class (used to locate the jar)
		job.setMapperClass(TokenizerMapper.class);	//set the Mapper class
		job.setReducerClass(IntSumReducer.class);	//set the Reducer class
		job.setOutputKeyClass(Text.class);	//set the output key type (also used as the map output key type here)
		job.setOutputValueClass(IntWritable.class);	//set the output value type (also used as the map output value type here)
		FileInputFormat.addInputPath(job, new Path("C:/Users/wrm/Desktop/1.txt"));		//set the input path
		FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.1.108:9000/file1/"));		//set the output path (must not exist yet)
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}
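To run the job, package the class into a jar and submit it with the hadoop command. A minimal sketch, assuming the jar is named wordcount.jar (a made-up name; the input and output paths are hard-coded in main above, and the output directory must not already exist, or the job will fail):

hadoop jar wordcount.jar com.yc.hadoop.WordCount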

Part 2. Custom Data Types



Hadoop provides eight basic Writable data types, each corresponding to a Java type:

Hadoop                Java

BooleanWritable       boolean
ByteWritable          byte
DoubleWritable        double
FloatWritable         float
IntWritable           int
LongWritable          long
Text                  String
NullWritable          null (placeholder for "no value")
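These wrapper types are thin boxes around the corresponding Java values; you move between the two through their constructors and the get()/set()/toString() methods. A small standalone snippet for illustration (the class name WritableDemo is made up):

package com.yc.hadoop;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;

public class WritableDemo {
	public static void main(String[] args) {
		IntWritable count = new IntWritable(5);		// wraps a Java int
		count.set(count.get() + 1);			// unwrap with get(), re-wrap with set()

		Text word = new Text("hadoop");			// wraps a Java String (stored as UTF-8)
		String plain = word.toString();			// back to a plain String

		NullWritable nothing = NullWritable.get();	// singleton placeholder for "no value"

		System.out.println(count.get() + " " + plain + " " + nothing);
	}
}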


These eight types will certainly not cover every need, so, just as you would write a JavaBean, you can define your own data types. Because Hadoop serializes everything through streams, a custom type must know how to write itself to and read itself back from a stream. The following example is a user data type:

package com.yc.hadoop.bean;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.Serializable;

import org.apache.hadoop.io.WritableComparable;

public class UserInfo implements WritableComparable<UserInfo>,Serializable {
	/**
	 * 
	 */
	private static final long serialVersionUID = -5923323181398894001L;
	private String name;
	private String id;
	private String age;
	private String sex;
	
	public String getName() {
		return name;
	}
	@Override
	public String toString() {
		return "name:" + name + " id:" + id + " age:" + age + " sex:" + sex;
	}
	public UserInfo() {
	}
	
	public UserInfo(String id,String name,String age, String sex) {
		this.name = name;
		this.id = id;
		this.age = age;
		this.sex = sex;
    }
	
	


	/**
	 * Required: reads the fields back from the input stream.
	 * The read order must match the write order used in write().
	 */
	@Override
	public void readFields(DataInput in) throws IOException {
		this.name=in.readUTF();
		this.id=in.readUTF();
		this.age=in.readUTF();
		this.sex=in.readUTF();
	}
	
	/**
	 * Required: writes this object's fields to the output stream.
	 */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(name);
		out.writeUTF(id);
		out.writeUTF(age);
		out.writeUTF(sex);
		
	}
	
	/**
	 * Required when the type is used as a key: keys are sorted during the
	 * intermediate (shuffle) phase, so they must define an ordering.
	 */
	@Override
	public int compareTo(UserInfo o) {
		// compare numerically by id; Integer.compare avoids subtraction overflow
		return Integer.compare(Integer.parseInt(this.id), Integer.parseInt(o.id));
	}
	@Override
	public int hashCode() {
		final int prime = 31;
		int result = 1;
		result = prime * result + ((age == null) ? 0 : age.hashCode());
		result = prime * result + ((id == null) ? 0 : id.hashCode());
		result = prime * result + ((name == null) ? 0 : name.hashCode());
		result = prime * result + ((sex == null) ? 0 : sex.hashCode());
		return result;
	}
	@Override
	public boolean equals(Object obj) {
		if (this == obj)
			return true;
		if (obj == null)
			return false;
		if (getClass() != obj.getClass())
			return false;
		UserInfo other = (UserInfo) obj;
		if (age == null) {
			if (other.age != null)
				return false;
		} else if (!age.equals(other.age))
			return false;
		if (id == null) {
			if (other.id != null)
				return false;
		} else if (!id.equals(other.id))
			return false;
		if (name == null) {
			if (other.name != null)
				return false;
		} else if (!name.equals(other.name))
			return false;
		if (sex == null) {
			if (other.sex != null)
				return false;
		} else if (!sex.equals(other.sex))
			return false;
		return true;
	}
	public void setName(String name) {
		this.name = name;
	}
	public void setId(String id) {
		this.id = id;
	}
	public void setAge(String age) {
		this.age = age;
	}
	public void setSex(String sex) {
		this.sex = sex;
	}
	public String getId() {
		return id;
	}
	public String getAge() {
		return age;
	}
	public String getSex() {
		return sex;
	}
	
	
	
	
	
	
	
}

How to use it:
job.setOutputKeyClass(UserInfo.class);   //when used as the key
job.setOutputValueClass(UserInfo.class);    //when used as the value
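For a fuller picture, below is a minimal sketch of a Mapper that emits UserInfo as its map output value. It assumes a hypothetical input of one comma-separated record per line in the order id,name,age,sex; the class name UserInfoMapper and the input layout are made up for illustration:

package com.yc.hadoop.bean;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class UserInfoMapper extends Mapper<LongWritable, Text, Text, UserInfo> {

	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		// assumed line format: id,name,age,sex
		String[] fields = value.toString().split(",");
		if (fields.length == 4) {
			UserInfo user = new UserInfo(fields[0], fields[1], fields[2], fields[3]);
			context.write(new Text(user.getId()), user);	// id as the key, the whole record as the value
		}
	}
}

If the map output types differ from the job's final output types, declare them explicitly with job.setMapOutputKeyClass(...) and job.setMapOutputValueClass(...).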

That covers the basic use of MapReduce and of custom data types. Going further is mostly a matter of the algorithms you build on top of them.





