The Road to Java Big Data -- MapReduce (1)

MapReduce (a distributed computing model)

Contents

MapReduce (a distributed computing model)

I. Overview

II. Getting-started examples

Example 1: Count the occurrences of each word in a file (file: words.txt)

Example 2: Find the maximum value

Example 3: Output the files in which each word appears (directory: invert)


I. Overview

  1. MapReduce is a distributed computing model.
  2. It was proposed by Google and designed on top of GFS, originally to solve large-scale computation problems in the search domain.
  3. Based on the paper "MapReduce: Simplified Data Processing on Large Clusters", Doug Cutting designed and implemented MapReduce in Hadoop, built on top of HDFS.
  4. A MapReduce job consists of two phases: Map (mapping) and Reduce (reducing). Users only need to implement the map and reduce functions to obtain a distributed computation, which greatly shortens the development and debugging cycle of distributed programs. A minimal skeleton of the two classes is sketched right after this list.
  5. By default, the key and value in MapReduce output are separated by a tab character, and files whose names start with _ are treated as hidden files and are not read by default.
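
As a minimal sketch of what a user actually writes (the class names SkeletonMapper and SkeletonReducer below are illustrative, not part of Hadoop), the generic parameters of Mapper and Reducer are <KEYIN, VALUEIN, KEYOUT, VALUEOUT>, and only the map and reduce methods need to be overridden:

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

// With the default TextInputFormat, KEYIN is the byte offset of a line
// and VALUEIN is the line's content.
class SkeletonMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // emit zero or more intermediate (key, value) pairs per input record
    }
}

// The reducer's KEYIN/VALUEIN must match the mapper's KEYOUT/VALUEOUT;
// all values emitted under one key are handed to a single reduce() call.
class SkeletonReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        // aggregate the grouped values and emit the final (key, value)
    }
}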

II. Getting-started examples

Example 1: Count the occurrences of each word in a file (file: words.txt)

Mapper

package cn.zyj.wc;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    // key -- byte offset of the start of the current line within the file
    // value -- content of the current line
    // context -- context object used to emit output
    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // get one line of data
        String line = value.toString();
        // split on spaces to get the words on this line
        String[] arr = line.split(" ");
        // iterate over the words and emit each one with a count of 1
        for (String str : arr) {
            context.write(new Text(str), new LongWritable(1));
        }
    }
}

 

Reducer

package cn.zyj.wc;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    // key -- the word
    // values -- all counts emitted for this word in the map phase
    // context -- context object used to emit output
    @Override
    public void reduce(Text key, Iterable<LongWritable> values, Context context)
        throws IOException, InterruptedException {
        // variable to accumulate the count
        long sum = 0;
        // iterate over the values and add them up to get this word's total
        for (LongWritable val : values) {
            sum += val.get();
        }
        // emit the result: key is the word, value is its total number of occurrences
        context.write(key, new LongWritable(sum));
    }
}

 

Driver

package cn.zyj.wc;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {

    public static void main(String[] args) throws Exception {
        // load the default configuration
        Configuration conf = new Configuration();
        // obtain the Job object that represents this MapReduce job
        Job job = Job.getInstance(conf);
        // specify the entry class of the program
        job.setJarByClass(cn.zyj.wc.WordCountDriver.class);
        // set the Mapper class to run
        job.setMapperClass(WordCountMapper.class);
        // set the Reducer class to run
        job.setReducerClass(WordCountReducer.class);
        // set the Mapper's output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // set the Reducer's output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // set the input path
        // if the path is a file, only that file is read
        // if the path is a directory, every file in it is read
        FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.32.138:9000/mr/words.txt"));
        // set the output path (the directory must not exist yet)
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.32.138:9000/wcresult"));
        // submit the job and wait for it to finish
        if (!job.waitForCompletion(true))
            return;
    }
}
words.txt
hello tom hello bob
hello joy
hello rose
hello joy
hello jerry
hello tom
hello rose
hello joy
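
After the map phase the framework groups the intermediate pairs by key, so the reducer for "hello" receives hello -> [1, 1, 1, ...]. Assuming the words.txt above, the output directory (/wcresult) should end up with a single result file (typically named part-r-00000) whose tab-separated contents look roughly like this:

bob	1
hello	9
jerry	1
joy	3
rose	2
tom	2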

Example 2: Find the maximum value

Mapper:

package cn.zyj.maxDemo;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MaxMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

	@Override
	public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		// each line has the form "name score", e.g. "Bob 684"
		String[] sp = value.toString().split(" ");
		// emit (name, score)
		context.write(new Text(sp[0]), new IntWritable(Integer.parseInt(sp[1])));
	}

}

Reducer:

package cn.zyj.maxDemo;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * The generic parameters describe the two key-value pairs: the input k-v and the output k-v.
 * @author Administrator
 *
 */
public class MaxReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

	@Override
	public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
		// First approach: track the maximum with a primitive int
		//		int max = 0;
		//		for (IntWritable val : values) {
		//			if (Integer.parseInt(val.toString()) > max) {
		//				max = Integer.parseInt(val.toString());
		//			}
		//		}
		//		context.write(key, new IntWritable(max));
		// Second approach: compare IntWritables directly. Beware of object reuse:
		// to reduce object creation and destruction, MapReduce reuses the same object
		// while iterating -- the iterated value object is created only once.
		IntWritable max = new IntWritable(0);
		// key = Bob
		// values = 684 512 340 312
		// IntWritable val = new IntWritable();
		// val.set(684);
		// val.get() > max.get() -> 684 > 0 -> true
		// max = val; -- this copies the reference, so max and val now point to the same object
		// val.set(512);
		// val.get() > max.get() -> 512 > 512 -> false
		// ... so max would end up holding whatever value was iterated last

		for (IntWritable val : values) {
			if (val.get() > max.get()) {
				// max = val; would only copy the reference -- copy the value instead
				max.set(val.get());
			}
		}
		context.write(key, max);
	}

}
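
The reuse pitfall described in the comments can be reproduced outside of a job. The following standalone snippet is only an illustration (the class name ReuseDemo is not part of the original code): assigning the reference keeps whatever value the shared object ends up holding, while copying the value with set() preserves the maximum.

import org.apache.hadoop.io.IntWritable;

public class ReuseDemo {
	public static void main(String[] args) {
		IntWritable val = new IntWritable();  // the single instance the framework would reuse
		IntWritable max = new IntWritable(0);

		val.set(684);
		if (val.get() > max.get()) {
			max = val;                        // copies the reference, not the value
		}
		val.set(512);                         // the "next iteration" overwrites the shared object
		System.out.println(max.get());        // prints 512, not 684

		// using max.set(val.get()) instead of max = val keeps max independent of val
	}
}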

Driver:

package cn.zyj.maxDemo;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MaxDriver {

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		// pass the configuration to the job
		Job job = Job.getInstance(conf);
		job.setJarByClass(cn.zyj.maxDemo.MaxDriver.class);
		// set the Mapper class
		job.setMapperClass(MaxMapper.class);
		// set the Reducer class
		job.setReducerClass(MaxReduce.class);
		// set the output key/value types (shared by the map and reduce phases here)
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		// set the input file and the output directory (the output directory must not exist yet)
		FileInputFormat.setInputPaths(job, new Path("hdfs://10.42.3.8:9000/txt/score2.txt"));
		FileOutputFormat.setOutputPath(job, new Path("hdfs://10.42.3.8:9000/result/max1"));

		if (!job.waitForCompletion(true))
			return;
	}

}
score2.txt
Bob 684
Alex 265
Grace 543
Henry 341
Adair 345
Chad 664
Colin 464
Eden 154
Grover 630
Bob 340
Alex 367
Grace 567
Henry 367
Adair 664
Chad 543
Colin 574
Eden 663
Grover 614
Bob 312
Alex 513
Grace 641
Henry 467
Adair 613
Chad 697
Colin 271
Eden 463
Grover 452
Bob 548
Alex 285
Grace 554
Henry 596
Adair 681
Chad 584
Colin 699
Eden 708
Grover 345
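
For reference, assuming the score2.txt above, the result file under /result/max1 should contain each person's highest score, roughly:

Adair	681
Alex	513
Bob	684
Chad	697
Colin	699
Eden	708
Grace	641
Grover	630
Henry	596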

Example 3: Output the files in which each word appears (directory: invert)

Mapper:

package cn.zyj.file;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class fileMapper extends Mapper<LongWritable, Text, Text, Text> {

	@Override
	public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		// split the current line into words
		String[] val = value.toString().split(" ");

		for (String string : val) {
			// the input split tells us which file the current line came from
			FileSplit fs = (FileSplit) context.getInputSplit();
			String name = fs.getPath().getName();
			// emit (word, file name)
			context.write(new Text(string), new Text(name));
		}
	}

}

Reducer:

package cn.zyj.file;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class fileReduce extends Reducer<Text, Text, Text, Text> {

	@Override
	public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
		// collect the file names into a set to remove duplicates
		Set<String> set = new HashSet<String>();
		for (Text val : values) {
			set.add(val.toString());
		}
		// emit (word, set of files it appears in)
		context.write(key, new Text(set.toString()));
	}

}

Driver:

package cn.zyj.file;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class fileDriver {

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf, "JobName");
		job.setJarByClass(cn.zyj.file.fileDriver.class);
		// set the Mapper class
		job.setMapperClass(fileMapper.class);
		// set the Reducer class
		job.setReducerClass(fileReduce.class);

		// set the output key/value types
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		// the input path is a directory, so every file under it is read;
		// the output directory must not exist yet
		FileInputFormat.setInputPaths(job, new Path("hdfs://10.42.3.8:9000/txt/invert"));
		FileOutputFormat.setOutputPath(job, new Path("hdfs://10.42.3.8:9000/result/invert1"));

		if (!job.waitForCompletion(true))
			return;
	}

}
//a.txt

hello nio
hi concurrent
hello zookeeper
hello thrift
hi avro

//b.txt

hi hadoop
hello hdfs
hi mapreduce
hi yarn

//c.txt

hadoop hdfs
netty nio
serial avro
hadoop mapreduce
serial thrift

//d.txt

nio mina
proto serial
avro serial
observer zookeeper
ozone hadoop
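
For reference, assuming the four files above are uploaded to the invert directory, each output line pairs a word with the set of files it appears in. A HashSet does not guarantee ordering, so the file names inside the brackets may appear in any order; the result looks roughly like this:

avro	[a.txt, c.txt, d.txt]
concurrent	[a.txt]
hadoop	[b.txt, c.txt, d.txt]
hdfs	[b.txt, c.txt]
hello	[a.txt, b.txt]
hi	[a.txt, b.txt]
mapreduce	[b.txt, c.txt]
mina	[d.txt]
netty	[c.txt]
nio	[a.txt, c.txt, d.txt]
observer	[d.txt]
ozone	[d.txt]
proto	[d.txt]
serial	[c.txt, d.txt]
thrift	[a.txt, c.txt]
yarn	[b.txt]
zookeeper	[a.txt, d.txt]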

 
