Hadoop in Action [Part 2: MapReduce + Custom Data Types]

Part 1. Divide and Conquer: MapReduce

  • HDFS is Hadoop's storage layer, and MapReduce is its computation layer.
  • MapReduce follows the idea of "divide and conquer": an operation on a large data set is split up and handed to worker nodes coordinated by a master node, and the intermediate results from those workers are then combined into the final result.
  • Map phase: the framework cuts the job's input data into fixed-size splits, then breaks each split down into a batch of key-value pairs <K1,V1>. Hadoop creates one Map task per split to run the user-defined map function, which takes the <K1,V1> pairs as input and produces intermediate results <K2,V2>. The intermediate results are sorted by K2, values sharing the same key are grouped into <K2, list<V2>> tuples, and the groups are assigned to different Reduce tasks.
  • Reduce phase: each Reducer merges the data it receives from the different Mappers and calls the user-defined reduce function on the <K2, list<V2>> groups coming from the map side, producing the final result <K3,V3>, which is written out to HDFS (see the short trace below).
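As a quick hand-worked trace of this flow (using a made-up two-line input file), a word-count job moves data through the phases roughly like this:

    <K1,V1> fed to map (byte offset, line):      <0, "hello world">  <12, "hello hadoop">
    <K2,V2> emitted by map:                      <hello,1> <world,1> <hello,1> <hadoop,1>
    <K2,list<V2>> after sorting and grouping:    <hadoop,[1]> <hello,[1,1]> <world,[1]>
    <K3,V3> emitted by reduce:                   <hadoop,1> <hello,2> <world,1>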

         

          WordCount Example

package com.yc.hadoop;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {
	private final static IntWritable one = new IntWritable(1);
	
	/**
	 * Custom Mapper.
	 * Splits each input line into words and emits every word as the key with a value of 1.
	 * After map() finishes, the Hadoop framework sorts and groups the intermediate data.
	 * @author wrm
	 *
	 */
	public static class TokenizerMapper 
	extends Mapper<LongWritable, Text, Text, IntWritable>{
		private Text word = new Text();

		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String[] vs = value.toString().split("\\s+");	// split the line on whitespace
			for (String v : vs) {
				word.set(v);
				context.write(word, one);	// emit <word, 1>
			}
		}
	}
	
	/**
	 * Custom Reducer.
	 * Aggregates the data coming from the Mappers. Because Hadoop has already grouped
	 * the intermediate data and the key is a word, identical words arrive as <K, list<V>>;
	 * we iterate over the list, accumulate the counts, and emit the total.
	 * @author wrm
	 *
	 */
	public static class IntSumReducer 
	extends Reducer<Text,IntWritable,Text,IntWritable> {
		private IntWritable result = new IntWritable();

		@Override
		protected void reduce(Text key, Iterable<IntWritable> value,
				Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
			int count = 0;
			for (IntWritable vs : value) {
				count += vs.get();	// sum the counts for this word
			}
			result.set(count);
			context.write(key, result);	// emit <word, total>
		}
	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		//    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
		//    System.out.println(otherArgs);
		//    if (otherArgs.length != 2) {
		//      System.err.println("Usage: wordcount <in> <out>");
		//      System.exit(2);
		//    }
		Job job = Job.getInstance(conf, "Word Count");
		job.setJarByClass(WordCount.class);		//set the driver class (used to locate the jar)
		job.setMapperClass(TokenizerMapper.class);	//set the Mapper class
		job.setReducerClass(IntSumReducer.class);	//set the Reducer class
		job.setOutputKeyClass(Text.class);	//set the output key type (also used as the map output key type here)
		job.setOutputValueClass(IntWritable.class);	//set the output value type (also used as the map output value type here)
		FileInputFormat.addInputPath(job, new Path("C:/Users/wrm/Desktop/1.txt"));		//set the input path
		FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.1.108:9000/file1/"));		//set the output path (must not exist yet)
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}
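To run the job, package the class into a jar and submit it with the hadoop command. A minimal sketch, assuming the jar is named wordcount.jar (a made-up name; the input and output paths are hard-coded in main above, and the output directory must not already exist, or the job will fail):

hadoop jar wordcount.jar com.yc.hadoop.WordCount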

Part 2. Custom Data Types



Hadoop provides eight basic Writable data types, each corresponding to a Java type:

Hadoop                Java

BooleanWritable       boolean
ByteWritable          byte
DoubleWritable        double
FloatWritable         float
IntWritable           int
LongWritable          long
Text                  String
NullWritable          null (placeholder for "no value")
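These wrapper types are thin boxes around the corresponding Java values; you move between the two through their constructors and the get()/set()/toString() methods. A small standalone snippet for illustration (the class name WritableDemo is made up):

package com.yc.hadoop;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;

public class WritableDemo {
	public static void main(String[] args) {
		IntWritable count = new IntWritable(5);		// wraps a Java int
		count.set(count.get() + 1);			// unwrap with get(), re-wrap with set()

		Text word = new Text("hadoop");			// wraps a Java String (stored as UTF-8)
		String plain = word.toString();			// back to a plain String

		NullWritable nothing = NullWritable.get();	// singleton placeholder for "no value"

		System.out.println(count.get() + " " + plain + " " + nothing);
	}
}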


These eight types will certainly not cover every need, so, just as you would write a JavaBean, you can define your own data types. Because Hadoop serializes everything through streams, a custom type must know how to write itself to and read itself back from a stream. The following example is a user data type:

package com.yc.hadoop.bean;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.Serializable;

import org.apache.hadoop.io.WritableComparable;

public class UserInfo implements WritableComparable<UserInfo>,Serializable {
	/**
	 * 
	 */
	private static final long serialVersionUID = -5923323181398894001L;
	private String name;
	private String id;
	private String age;
	private String sex;
	
	public String getName() {
		return name;
	}
	@Override
	public String toString() {
		return "name:" + name + " id:" + id + " age:" + age + " sex:" + sex;
	}
	public UserInfo() {
	}
	
	public UserInfo(String id,String name,String age, String sex) {
		this.name = name;
		this.id = id;
		this.age = age;
		this.sex = sex;
    }
	
	


	/**
	 * Required: reads the fields back from the input stream.
	 * The read order must match the write order used in write().
	 */
	@Override
	public void readFields(DataInput in) throws IOException {
		this.name=in.readUTF();
		this.id=in.readUTF();
		this.age=in.readUTF();
		this.sex=in.readUTF();
	}
	
	/**
	 * Required: writes this object's fields to the output stream.
	 */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(name);
		out.writeUTF(id);
		out.writeUTF(age);
		out.writeUTF(sex);
		
	}
	
	/**
	 * Required when the type is used as a key: keys are sorted during the
	 * intermediate (shuffle) phase, so they must define an ordering.
	 */
	@Override
	public int compareTo(UserInfo o) {
		// compare numerically by id; Integer.compare avoids subtraction overflow
		return Integer.compare(Integer.parseInt(this.id), Integer.parseInt(o.id));
	}
	@Override
	public int hashCode() {
		final int prime = 31;
		int result = 1;
		result = prime * result + ((age == null) ? 0 : age.hashCode());
		result = prime * result + ((id == null) ? 0 : id.hashCode());
		result = prime * result + ((name == null) ? 0 : name.hashCode());
		result = prime * result + ((sex == null) ? 0 : sex.hashCode());
		return result;
	}
	@Override
	public boolean equals(Object obj) {
		if (this == obj)
			return true;
		if (obj == null)
			return false;
		if (getClass() != obj.getClass())
			return false;
		UserInfo other = (UserInfo) obj;
		if (age == null) {
			if (other.age != null)
				return false;
		} else if (!age.equals(other.age))
			return false;
		if (id == null) {
			if (other.id != null)
				return false;
		} else if (!id.equals(other.id))
			return false;
		if (name == null) {
			if (other.name != null)
				return false;
		} else if (!name.equals(other.name))
			return false;
		if (sex == null) {
			if (other.sex != null)
				return false;
		} else if (!sex.equals(other.sex))
			return false;
		return true;
	}
	public void setName(String name) {
		this.name = name;
	}
	public void setId(String id) {
		this.id = id;
	}
	public void setAge(String age) {
		this.age = age;
	}
	public void setSex(String sex) {
		this.sex = sex;
	}
	public String getId() {
		return id;
	}
	public String getAge() {
		return age;
	}
	public String getSex() {
		return sex;
	}
	
	
	
	
	
	
	
}

How to use it:
job.setOutputKeyClass(UserInfo.class);   //when used as the key
job.setOutputValueClass(UserInfo.class);    //when used as the value
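For a fuller picture, below is a minimal sketch of a Mapper that emits UserInfo as its map output value. It assumes a hypothetical input of one comma-separated record per line in the order id,name,age,sex; the class name UserInfoMapper and the input layout are made up for illustration:

package com.yc.hadoop.bean;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class UserInfoMapper extends Mapper<LongWritable, Text, Text, UserInfo> {

	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		// assumed line format: id,name,age,sex
		String[] fields = value.toString().split(",");
		if (fields.length == 4) {
			UserInfo user = new UserInfo(fields[0], fields[1], fields[2], fields[3]);
			context.write(new Text(user.getId()), user);	// id as the key, the whole record as the value
		}
	}
}

If the map output types differ from the job's final output types, declare them explicitly with job.setMapOutputKeyClass(...) and job.setMapOutputValueClass(...).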

That covers the basic use of MapReduce and of custom data types. Going further is mostly a matter of the algorithms you build on top of them.





