HDPCD Java Review Notes (8) - Lab

Java lab booklet


Adding a Combiner

Adding a combiner greatly decreases the number of key/value pairs transferred over the network between the mappers and the reducers.

Add the following line to the main() method of WordCount:


job.setCombinerClass(IntSumReducer.class);  
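
For context, here is a minimal sketch of where that line sits in a typical WordCount driver (a sketch only: it assumes the lab's existing IntSumReducer class and a mapper named TokenizerMapper, and omits imports). Because IntSumReducer simply sums IntWritable counts, it can be reused unchanged as the combiner:

public static void main(String[] args) throws Exception {
	Configuration conf = new Configuration();
	Job job = Job.getInstance(conf, "wordcount");
	job.setJarByClass(WordCount.class);
	job.setMapperClass(TokenizerMapper.class);      // assumed mapper name from the lab's WordCount
	job.setCombinerClass(IntSumReducer.class);      // the new line: combine partial counts map-side
	job.setReducerClass(IntSumReducer.class);
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(IntWritable.class);
	FileInputFormat.addInputPath(job, new Path(args[0]));
	FileOutputFormat.setOutputPath(job, new Path(args[1]));
	System.exit(job.waitForCompletion(true) ? 0 : 1);
}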


Computing the Average of a Collection of Numbers

The MapReduce job computes and outputs the average median household income in the year 2000 for each of the 50 states and the District of Columbia.


Abbeville, SC,45001,6581,7471,6787,195278,302280,29673,40460,3042,3294
Acadia, LA,22001,13658,15450,16308,338561,618949,24788,40061,5686,5975
Accomack, VA,51001,9401,11507,10857,238824,444818,25404,38656,4720,5319


Notice that the first value in each row is a county name, followed by the state. The third value is a unique ID for the county. The remaining values are median incomes from various years. For example, the 10th value in each row (index 9 in the zero-based array the mapper produces when it splits the line) is the median household income for that county in the year 2000. This is the column whose average you are going to compute for each state.

package average;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class AverageJob extends Configured implements Tool {
	public static class AveragePartitioner extends Partitioner<Text, Text> {

		@Override
		public int getPartition(Text key, Text value, int numPartitions) {
			if (numPartitions == 1) {
				return 0;
			}
			
			return (key.hashCode()&Integer.MAX_VALUE) % numPartitions;
		}

	}


	public enum Counters{MAP, COMBINE, REDUCE}

	public static class AverageMapper extends Mapper<LongWritable, Text, Text, Text> {
        private Text outputKey = new Text();
        private Text outputValue = new Text();
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
		    String[] words = StringUtils.split(value.toString(), '\\', ',');
		    for (int i = 0; i < words.length; i++) {
		    	//State column.
				if (i == 1) {
					outputKey.set(words[i].trim());
				}
				//Household income column.
				if (i == 9) {
					outputValue.set(words[i].trim() + ",1");
				}
			}
		    context.getCounter(Counters.MAP).increment(1);
		    context.write(outputKey, outputValue);
		    
		}

		@Override
		protected void cleanup(Context context)
				throws IOException, InterruptedException {

		}


	}

	public static class AverageCombiner extends Reducer<Text, Text, Text, Text> {
		private Text outputValue = new Text();
		private long sum = 0;
		private int count = 0;
		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			String[] tmp = new String[2];
			for (Text value : values) {
				tmp = StringUtils.split(value.toString(), ',');
				sum += Long.parseLong(tmp[0]);
				count += Integer.parseInt(tmp[1]);
			}
			outputValue.set(sum + "," + count);
			context.getCounter(Counters.COMBINE).increment(1);
			context.write(key, outputValue);
			sum = 0;
			count = 0;
		}		

		@Override
		protected void cleanup(Context context)
				throws IOException, InterruptedException {

		}
	}

	public static class AverageReducer extends Reducer<Text, Text, Text, DoubleWritable> {
		private DoubleWritable outputValue = new DoubleWritable();
		private double sum = 0;
		private int count = 0;
		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			String[] tmp = new String[2];
			for (Text value : values) {
				tmp = StringUtils.split(value.toString(), ',');
				sum += Long.parseLong(tmp[0]);
				count += Integer.parseInt(tmp[1]);
			}
			outputValue.set(sum/count);
			context.getCounter(Counters.REDUCE).increment(1);
			context.write(key, outputValue);
			sum = 0;
			count = 0;
		}

		@Override
		protected void cleanup(Context context)
				throws IOException, InterruptedException {

		}
	}

	@Override
	public int run(String[] arg0) throws Exception {
		Configuration conf = super.getConf();
		Job job = Job.getInstance(conf, "AverageJob");
		job.setJarByClass(AverageJob.class);

		Path out = new Path("counties/output");
		FileInputFormat.setInputPaths(job, "counties");
		FileOutputFormat.setOutputPath(job, out);
		out.getFileSystem(conf).delete(out, true);

		job.setMapperClass(AverageMapper.class);
		job.setReducerClass(AverageReducer.class);
		job.setCombinerClass(AverageCombiner.class);
		job.setPartitionerClass(AveragePartitioner.class);
		job.setInputFormatClass(TextInputFormat.class);
		job.setOutputFormatClass(TextOutputFormat.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		job.setNumReduceTasks(5);


		return job.waitForCompletion(true)?0:1;

	}


	public static void main(String[] args) {
		int result = 0;
		try {
			result = ToolRunner.run(new Configuration(),  new AverageJob(), args);
		} catch (Exception e) {
			e.printStackTrace();
		}
		System.exit(result);

	}

}
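
A note on why the mapper emits "income,1" and the combiner emits "sum,count" rather than a partial average: averages are not associative, but sums and counts are, so any number of partial pairs can be merged and the division happens only once, in the reducer. Keeping the combiner's output type identical to the mapper's (Text) also means the job produces the same result whether or not the combiner actually runs. A hypothetical illustration with made-up numbers:

// Hypothetical partial results for the key "SC" (numbers are made up):
//   combiner on mapper 1 emits  SC -> "80000,2"    (two counties)
//   combiner on mapper 2 emits  SC -> "120000,3"   (three counties)
// The reducer merges the pairs and divides once at the end:
long sum = 80000 + 120000;              // 200000
int count = 2 + 3;                      // 5
double average = (double) sum / count;  // 40000.0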


Writing a Custom Partitioner

The Average MapReduce job executes with five reducers, each of which receives an evenly distributed share of the key/value pairs.

public static class AveragePartitioner extends Partitioner<Text, Text> {
	@Override
	public int getPartition(Text key, Text value, int numPartitions) {
		if (numPartitions == 1) {
			return 0;
		}
		return (key.hashCode()&Integer.MAX_VALUE) % numPartitions;
	}

}
job.setNumReduceTasks(5);
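
The bitwise AND with Integer.MAX_VALUE clears the sign bit, so a negative hashCode() can never produce a negative partition number (which would cause the job to fail). A small sketch of the idea, using a made-up hash value:

int hash = -1894574123;                      // hypothetical negative hashCode()
int bad = hash % 5;                          // -3: not a valid partition number
int good = (hash & Integer.MAX_VALUE) % 5;   // always in the range 0..4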


Writing a Custom Output Format

A MapReduce job that outputs the growth (or loss) of stock dividends.

exchange,stock_symbol,date,dividends
NYSE,AIT,2009-11-12,0.15
NYSE,AIT,2009-08-12,0.15
NYSE,AIT,2009-05-13,0.15
NYSE,AIT,2009-02-11,0.15

package customsort;

import java.io.IOException;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class DividendOutputFormat extends FileOutputFormat<NullWritable, DividendChange> {

	@Override
	public RecordWriter<NullWritable, DividendChange> getRecordWriter(
			TaskAttemptContext job) throws IOException, InterruptedException {
		int partition = job.getTaskAttemptID().getTaskID().getId();
		Path outputDir = FileOutputFormat.getOutputPath(job);
		Path filename = new Path(outputDir.getName() + Path.SEPARATOR + job.getJobName() + "_" + partition);
		FileSystem fs = filename.getFileSystem(job.getConfiguration());
		FSDataOutputStream dos = fs.create(filename);
		return new DividendRecordWriter(dos);
	}

}
package customsort;

import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public class DividendRecordWriter extends RecordWriter<NullWritable, DividendChange> {
    public final String SEPARATOR = ",";
    private DataOutputStream out;
    
    public DividendRecordWriter(DataOutputStream out) {
		this.out = out;
	}
    
	@Override
	public void write(NullWritable key, DividendChange value)
			throws IOException, InterruptedException {
		StringBuilder result = new StringBuilder();
		result.append(value.getSymbol());
		result.append(SEPARATOR);
		result.append(value.getDate());
		result.append(SEPARATOR);
		result.append(value.getChange());
		result.append("\n");
		out.write(result.toString().getBytes());
	}

	@Override
	public void close(TaskAttemptContext context) throws IOException,
			InterruptedException {
		out.close();
	}

}
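
To have a job use this output format directly, you would register it in the driver's run() method (the DividendJob driver below keeps this line commented out because it uses MultipleOutputs instead). A sketch:

job.setOutputFormatClass(DividendOutputFormat.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(DividendChange.class);
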
package customsort;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


public class DividendJob extends Configured implements Tool {

	
	public static class DividendGrowthMapper extends Mapper<LongWritable, Text, Stock, DoubleWritable> {
		private Stock outputKey = new Stock();
		private DoubleWritable outputValue = new DoubleWritable();
		private final String EXCHANGE = "exchange";
		
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String [] words = StringUtils.split(value.toString(),'\\',',');
			if(EXCHANGE.equals(words[0])) {
				return;
			}
			
			outputKey.setSymbol(words[1]);
			outputKey.setDate(words[2]);
			outputValue.set(Double.parseDouble(words[3]));
			context.write(outputKey, outputValue);
		}
	}

	public static class StockPartitioner extends Partitioner<Stock, DoubleWritable> {

		@Override
		public int getPartition(Stock key, DoubleWritable value, int numReduceTasks) {
			char firstLetter = key.getSymbol().trim().charAt(0);
			
			return (firstLetter - 'A') % numReduceTasks;
		}		
	}

	
	public static class DividendGrowthReducer extends Reducer<Stock, DoubleWritable, NullWritable, DividendChange> {
		private NullWritable outputKey = NullWritable.get();
		private DividendChange outputValue = new DividendChange();
		
		private MultipleOutputs < NullWritable , DividendChange > mos;
		
		@Override
		protected void reduce(Stock key, Iterable<DoubleWritable> values, Context context)
				throws IOException, InterruptedException {
			double previousDividend = 0.0;
			for(DoubleWritable dividend : values) {
				double currentDividend = dividend.get();
				double growth = currentDividend - previousDividend;
				if(Math.abs(growth) > 0.000001) {
					outputValue.setSymbol(key.getSymbol());
					outputValue.setDate(key.getDate());
					outputValue.setChange(growth);
					if (growth > 0) {
						mos.write("positive", outputKey, outputValue, "pos");
					}else {
						mos.write("negative", outputKey, outputValue, "neg");
					}
					
					//context.write(outputKey, outputValue);
					previousDividend = currentDividend;
				}
				
			}
		}
		
		@Override
		protected void setup(
				Reducer<Stock, DoubleWritable, NullWritable, DividendChange>.Context context)
				throws IOException, InterruptedException {
			mos = new MultipleOutputs<NullWritable, DividendChange>(context);
			super.setup(context);
		}
		
		@Override
		protected void cleanup(
				Reducer<Stock, DoubleWritable, NullWritable, DividendChange>.Context context)
				throws IOException, InterruptedException {
			mos.close();
			super.cleanup(context);
		}
		
	}

	@Override
	public int run(String[] args) throws Exception {
		Configuration conf = super.getConf();
		Job job = Job.getInstance(conf, "DividendJob");
		job.setJarByClass(DividendJob.class);
		
		Path out = new Path("growth");
		FileInputFormat.setInputPaths(job, new Path("dividends"));
		FileOutputFormat.setOutputPath(job, out);
		out.getFileSystem(conf).delete(out, true);
		
		job.setMapperClass(DividendGrowthMapper.class);
		job.setReducerClass(DividendGrowthReducer.class);
		job.setPartitionerClass(StockPartitioner.class);
		job.setGroupingComparatorClass(StockGroupComparator.class);
		job.setInputFormatClass(TextInputFormat.class);
		//job.setOutputFormatClass(DividendOutputFormat.class);
		
		MultipleOutputs.addNamedOutput(job, "positive", TextOutputFormat.class, NullWritable.class, DividendChange.class);
		MultipleOutputs.addNamedOutput(job, "negative", TextOutputFormat.class, NullWritable.class, DividendChange.class);
		
		//Create output files lazily so the unused default output does not leave empty part files.
		LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
		
		job.setOutputKeyClass(NullWritable.class);
		job.setOutputValueClass(DividendChange.class);
		job.setMapOutputKeyClass(Stock.class);
		job.setMapOutputValueClass(DoubleWritable.class);
				
		job.setNumReduceTasks(3);

		return job.waitForCompletion(true)?0:1;

	}


	public static void main(String[] args) {
		int result = 0;
		try {
			result = ToolRunner.run(new Configuration(),  new DividendJob(), args);
		} catch (Exception e) {
			e.printStackTrace();
		}
		System.exit(result);

	}

}
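
With the MultipleOutputs configuration above, each of the three reducers writes every named output to a file whose name begins with the base path passed to mos.write() (for example, files named like pos-r-00000 for this job's positive output) in the growth directory. Because nothing is written through context.write(), LazyOutputFormat keeps the job from also creating the otherwise-empty default part-r-nnnnn files.
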
package customsort;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class DividendChange implements Writable {
	private String symbol;
	private String date;
	private double change;

	public String getSymbol() {
		return symbol;
	}

	public void setSymbol(String symbol) {
		this.symbol = symbol;
	}

	public String getDate() {
		return date;
	}

	public void setDate(String date) {
		this.date = date;
	}

	public double getChange() {
		return change;
	}

	public void setChange(double change) {
		this.change = change;
	}
	@Override
	public String toString() {
		
		return symbol + "\t" + date + "\t" + change;
	}

	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(symbol);
		out.writeUTF(date);
		out.writeDouble(change);
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		symbol = in.readUTF();
		date = in.readUTF();
		change = in.readDouble();

	}

}
package customsort;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class Stock implements WritableComparable<Stock> {
	private String symbol;
	private String date;

	public String getSymbol() {
		return symbol;
	}

	public void setSymbol(String symbol) {
		this.symbol = symbol;
	}

	public String getDate() {
		return date;
	}

	public void setDate(String date) {
		this.date = date;
	}

	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(symbol);
		out.writeUTF(date);
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		symbol = in.readUTF();
		date = in.readUTF();
	}

	@Override
	public int compareTo(Stock stock) {
		int response = this.symbol.compareTo(stock.symbol);
		if (response != 0) {
			return response;
		}else {
			response = this.date.compareTo(stock.date);
			return response;
		}
	}

}
package customsort;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class StockGroupComparator extends WritableComparator {
	
	protected StockGroupComparator(){
		super(Stock.class, true);
	}

	@SuppressWarnings("rawtypes")
	@Override
	public int compare(WritableComparable a, WritableComparable b) {
		Stock lhs = (Stock)a;
		Stock rhs = (Stock)b;
		return lhs.getSymbol().compareTo(rhs.getSymbol());
	}
	
}
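
A note on how the sort and the grouping work together: Stock.compareTo() orders the intermediate keys by symbol and then by date, while StockGroupComparator groups keys by symbol only. This is the secondary-sort pattern: each call to DividendGrowthReducer.reduce() receives all of one symbol's dividends, and because the dates are in yyyy-MM-dd form their lexicographic order is chronological, so the values arrive oldest-first and the reducer can compute each change from the previous dividend.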


