一、合久必分——MapReduce
- HDFS是Hadoop的雲存儲,而MapReduce則是Hadoop的雲計算框架。
- MapReduce採用“分而治之”的思想,把對大規模數據集的操作,分發給一個主節點管理下的各分節點共同完成,然後通過整合各分節點的中間結果,得到最終的結果。
- Map階段:MapReduce框架將任務的輸入數據分割成固定大小的片段(splits),隨後將每個split進一步分解成一批鍵值對<K1,V1>。Hadoop爲每一個split創建一個Map任務用於執行用戶自定義的map函數,以<K1,V1>對作爲輸入,得到計算的中間結果<K2,V2>。接着將中間結果按照K2進行排序,並將key值相同的value組合成爲<K2,list<V2>>元組,分組後對應不同的Reduce任務
- Reduce階段:Reduce把從不同Mapper接收來的數據整合在一起,調用用戶自定義的reduce函數,對從map傳來的<K2,list<V2>>進行相應的處理,得到最終結果<K3,V3>並輸出到HDFS上
一、WordCount案例
package com.yc.hadoop;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Classic Hadoop MapReduce word-count job: tokenizes input text, emits
 * {@code <word, 1>} pairs from the mapper, and sums the counts per word
 * in the reducer. Output is written to HDFS.
 */
public class WordCount {
	/** Reusable constant 1, emitted once per token by the mapper. */
	private final static IntWritable one = new IntWritable(1);
	/**
	 * Mapper: splits each input line into whitespace-separated tokens and
	 * emits a {@code <token, 1>} pair for each. The framework then sorts and
	 * groups the intermediate data by key before the reduce phase.
	 */
	public static class TokenizerMapper
			extends Mapper<LongWritable, Text, Text, IntWritable>{
		// Reused output key — avoids allocating a new Text per token.
		private Text word = new Text();
		// BUG FIX: the original declared the third parameter as
		// Mapper<Object, Text, Text, IntWritable>.Context, which does not
		// match the class's Mapper<LongWritable, ...> supertype, so this
		// method never overrode Mapper.map() and was never invoked (the
		// framework ran the default identity mapper instead). Using the
		// inherited Context type and @Override fixes and guards this.
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			// "\\s+" collapses runs of whitespace; the original "\\s"
			// produced empty tokens between consecutive separators.
			String[] tokens = value.toString().split("\\s+");
			for (String token : tokens) {
				if (token.isEmpty()) {
					continue; // split() yields one empty leading element when the line starts with whitespace
				}
				word.set(token);
				context.write(word, one);
			}
		}
	}
	/**
	 * Reducer: receives {@code <word, list<1>>} groups (merged by the
	 * framework), sums the values, and writes {@code <word, totalCount>}.
	 */
	public static class IntSumReducer
			extends Reducer<Text,IntWritable,Text,IntWritable> {
		// Reused output value — avoids allocating an IntWritable per key.
		private IntWritable result = new IntWritable();
		@Override
		protected void reduce(Text key, Iterable<IntWritable> values,
				Context context) throws IOException, InterruptedException {
			int count = 0;
			for (IntWritable v : values) {
				count += v.get();
			}
			result.set(count);
			context.write(key, result); // emit final <word, count>
		}
	}
	/**
	 * Job driver: configures and submits the word-count job, then exits
	 * with 0 on success and 1 on failure.
	 * NOTE(review): input/output paths are hard-coded; consider taking them
	 * from {@code args} so the job is reusable.
	 */
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf, "Word Count");
		job.setJarByClass(WordCount.class);          // class used to locate the job jar
		job.setMapperClass(TokenizerMapper.class);   // map phase implementation
		job.setReducerClass(IntSumReducer.class);    // reduce phase implementation
		job.setOutputKeyClass(Text.class);           // output key type of map/reduce
		job.setOutputValueClass(IntWritable.class);  // output value type of map/reduce
		FileInputFormat.addInputPath(job, new Path("C:/Users/wrm/Desktop/1.txt"));            // source path
		FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.1.108:9000/file1/"));    // destination path
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}
二、自定義數據類型
Hadoop有八大數據類型分別對應Java中的:
Hadoop Java
BooleanWritable boolean
ByteWritable byte
DoubleWritable double
FloatWritable float
IntWritable int
LongWritable long
Text String
NullWritable null
這八大數據類型肯定是不夠用的,那麼就必須像JavaBean一樣的自定義數據類型。而Hadoop是基於流操作的,所以它的數據類型必須要有讀出和寫入的操作。如下實例是一個用戶的數據類型:
package com.yc.hadoop.bean;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
/**
 * Custom Hadoop writable type representing a user record.
 *
 * <p>Implements {@link WritableComparable} so it can serve as a key or value
 * in MapReduce: {@code write}/{@code readFields} (de)serialize the record
 * from Hadoop's binary streams, and {@code compareTo} orders keys during the
 * shuffle/sort phase.
 */
public class UserInfo implements WritableComparable<UserInfo>,Serializable {
	private static final long serialVersionUID = -5923323181398894001L;
	private String name;
	private String id;
	private String age;
	private String sex;
	public String getName() {
		return name;
	}
	@Override
	public String toString() {
		return "name:" + name + " id:" + id + " age:" + age + " sex:" + sex;
	}
	/** No-arg constructor — required by Hadoop, which instantiates writables reflectively. */
	public UserInfo() {
	}
	public UserInfo(String id,String name,String age, String sex) {
		this.name = name;
		this.id = id;
		this.age = age;
		this.sex = sex;
	}
	/**
	 * Deserializes this record's fields from the stream.
	 * The field order (name, id, age, sex) MUST match {@link #write}.
	 */
	@Override
	public void readFields(DataInput in) throws IOException {
		this.name=in.readUTF();
		this.id=in.readUTF();
		this.age=in.readUTF();
		this.sex=in.readUTF();
	}
	/**
	 * Serializes this record's fields to the stream.
	 * The field order (name, id, age, sex) MUST match {@link #readFields}.
	 */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(name);
		out.writeUTF(id);
		out.writeUTF(age);
		out.writeUTF(sex);
	}
	/**
	 * Orders records by numeric id; required when this type is used as a key,
	 * since intermediate keys are sorted during the shuffle.
	 *
	 * <p>BUG FIX: the original returned {@code parseInt(a) - parseInt(b)},
	 * which can overflow int and report the wrong sign for large values;
	 * {@code Integer.compare} is overflow-safe.
	 *
	 * <p>NOTE(review): this compares id only while {@link #equals} compares
	 * all four fields, so the ordering is not consistent with equals —
	 * confirm that is intended. Also throws NumberFormatException if id is
	 * not numeric.
	 */
	@Override
	public int compareTo(UserInfo o) {
		return Integer.compare(Integer.parseInt(this.id), Integer.parseInt(o.id));
	}
	@Override
	public int hashCode() {
		final int prime = 31;
		int result = 1;
		result = prime * result + ((age == null) ? 0 : age.hashCode());
		result = prime * result + ((id == null) ? 0 : id.hashCode());
		result = prime * result + ((name == null) ? 0 : name.hashCode());
		result = prime * result + ((sex == null) ? 0 : sex.hashCode());
		return result;
	}
	@Override
	public boolean equals(Object obj) {
		if (this == obj)
			return true;
		if (obj == null)
			return false;
		if (getClass() != obj.getClass())
			return false;
		UserInfo other = (UserInfo) obj;
		if (age == null) {
			if (other.age != null)
				return false;
		} else if (!age.equals(other.age))
			return false;
		if (id == null) {
			if (other.id != null)
				return false;
		} else if (!id.equals(other.id))
			return false;
		if (name == null) {
			if (other.name != null)
				return false;
		} else if (!name.equals(other.name))
			return false;
		if (sex == null) {
			if (other.sex != null)
				return false;
		} else if (!sex.equals(other.sex))
			return false;
		return true;
	}
	public void setName(String name) {
		this.name = name;
	}
	public void setId(String id) {
		this.id = id;
	}
	public void setAge(String age) {
		this.age = age;
	}
	public void setSex(String sex) {
		this.sex = sex;
	}
	public String getId() {
		return id;
	}
	public String getAge() {
		return age;
	}
	public String getSex() {
		return sex;
	}
}
其的使用方法:
job.setOutputKeyClass(UserInfo.class); //作爲Key時
job.setOutputValueClass(UserInfo.class); //作爲value時
以上,就是MapReduce的基本用法和自定義數據類型的用法。更多的加深實際上是在算法層面的。