HDPCD-Java Review Notes (9) - Lab

Java lab booklet


Sorting Using a Composite Key

For the complete code, see the previous review note.

Define a Custom Key Class -- Stock. See the previous review note.

Write a Custom Partitioner -- StockPartitioner -- the core of its getPartition method:


char firstLetter = key.getSymbol().trim().charAt(0);
return (firstLetter - 'A') % numReduceTasks;
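
Wrapped in a full Partitioner, that logic might look like the following (a sketch only; the package name and the DoubleWritable map output value type are assumptions, and Stock is the key class from the previous note):

package customsort;   // hypothetical package

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class StockPartitioner extends Partitioner<Stock, DoubleWritable> {

	@Override
	public int getPartition(Stock key, DoubleWritable value, int numReduceTasks) {
		// Send all symbols that start with the same letter to the same reducer
		char firstLetter = key.getSymbol().trim().charAt(0);
		return (firstLetter - 'A') % numReduceTasks;
	}
}

The partitioner is registered on the job with job.setPartitionerClass(StockPartitioner.class).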

Define a Custom Value Class -- DividendChange (the Reducer is going to output a custom value type that you define)

Add a toString method to DividendChange that looks like the following:

@Override
public String toString() {
	return symbol + "\t" + date + "\t" + change;
}
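
A complete DividendChange consistent with that toString might look like the following (a sketch; the field types, setters, and package name are assumptions based on the symbol/date/change output above):

package customsort;   // hypothetical package

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class DividendChange implements Writable {
	private String symbol;
	private String date;
	private double change;

	@Override
	public void readFields(DataInput in) throws IOException {
		symbol = in.readUTF();
		date = in.readUTF();
		change = in.readDouble();
	}

	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(symbol);
		out.writeUTF(date);
		out.writeDouble(change);
	}

	public void setSymbol(String symbol) { this.symbol = symbol; }

	public void setDate(String date) { this.date = date; }

	public void setChange(double change) { this.change = change; }

	@Override
	public String toString() {
		return symbol + "\t" + date + "\t" + change;
	}
}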


By default, no grouping occurs during the shuffle/sort phase, because every composite key (symbol plus date) is distinct. In this step, you define a group comparator so that stocks with the same symbol are grouped into the same reduce call.

Write a Group Comparator -- StockGroupComparator, which extends WritableComparator.
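
A sketch of that comparator, grouping on the symbol only (it assumes the Stock key class from the previous note and a hypothetical package name):

package customsort;   // hypothetical package

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class StockGroupComparator extends WritableComparator {

	protected StockGroupComparator() {
		// 'true' makes WritableComparator instantiate the keys so that
		// compare() receives deserialized Stock objects
		super(Stock.class, true);
	}

	@SuppressWarnings("rawtypes")
	@Override
	public int compare(WritableComparable a, WritableComparable b) {
		// Compare only the symbol, ignoring the date, so that all dates for a
		// given stock arrive in a single reduce() call
		return ((Stock) a).getSymbol().compareTo(((Stock) b).getSymbol());
	}
}

Register it on the job with job.setGroupingComparatorClass(StockGroupComparator.class).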


How CombineFileInputFormat Works

The WordCountMapper and WordCountReducer used here are from the WordCount program; refer to that program for the originals.
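
For reference, minimal versions of those two classes might look like this (a sketch of the standard word-count logic only; the actual WordCount program may differ in details):

package wordcount;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
	private static final IntWritable ONE = new IntWritable(1);
	private Text word = new Text();

	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		// Emit (word, 1) for every whitespace-delimited token in the line
		for (String token : value.toString().split("\\s+")) {
			if (!token.isEmpty()) {
				word.set(token);
				context.write(word, ONE);
			}
		}
	}
}

package wordcount;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
	private IntWritable total = new IntWritable();

	@Override
	protected void reduce(Text key, Iterable<IntWritable> values, Context context)
			throws IOException, InterruptedException {
		// Sum the counts emitted for each word
		int sum = 0;
		for (IntWritable value : values) {
			sum += value.get();
		}
		total.set(sum);
		context.write(key, total);
	}
}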

package wordcount;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

public class MyCombinedFilesInputFormat extends CombineFileInputFormat<LongWritable, Text> {

	@SuppressWarnings({ "unchecked", "rawtypes" })
	@Override
	public RecordReader<LongWritable, Text> createRecordReader(
			InputSplit split, TaskAttemptContext context) throws IOException {
		// CombineFileRecordReader creates one MyCombinedFilesRecordReader per
		// file chunk in the combined split and iterates through them in order
		return new CombineFileRecordReader(
					(CombineFileSplit) split,
					context,
					MyCombinedFilesRecordReader.class
					);
	}

	public static class MyCombinedFilesRecordReader extends RecordReader<LongWritable, Text> {
		private int index;
		private LineRecordReader reader;
		
		public MyCombinedFilesRecordReader(CombineFileSplit split, TaskAttemptContext context, Integer index) {
			// index identifies which file within the CombineFileSplit this reader handles
			this.index = index;
			reader = new LineRecordReader();
		}
		
		@Override
		public void initialize(InputSplit split, TaskAttemptContext context)
				throws IOException, InterruptedException {
			// Build a FileSplit for this reader's single file within the combined
			// split and delegate all actual reading to a standard LineRecordReader
			CombineFileSplit cfsplit = (CombineFileSplit) split;
			FileSplit fileSplit = new FileSplit(cfsplit.getPath(index),
					cfsplit.getOffset(index),
					cfsplit.getLength(index),
					cfsplit.getLocations());
			reader.initialize(fileSplit, context);
		}

		@Override
		public boolean nextKeyValue() throws IOException, InterruptedException {
			return reader.nextKeyValue();
		}

		@Override
		public LongWritable getCurrentKey() throws IOException,
				InterruptedException {
			return reader.getCurrentKey();
		}

		@Override
		public Text getCurrentValue() throws IOException, InterruptedException {
			return reader.getCurrentValue();
		}

		@Override
		public float getProgress() throws IOException, InterruptedException {
			return reader.getProgress();
		}

		@Override
		public void close() throws IOException {
			reader.close();
		}
		
	}
}

package wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCountJob extends Configured implements Tool {

	@Override
	public int run(String[] args) throws Exception {
		Job job = Job.getInstance(getConf(), "WordCountJob");
		Configuration conf = job.getConfiguration();
		job.setJarByClass(getClass());
		
		Path in = new Path(args[0]);
		Path out = new Path(args[1]);
		out.getFileSystem(conf).delete(out, true);
		FileInputFormat.setInputPaths(job, in);
		FileOutputFormat.setOutputPath(job, out);
		
		job.setMapperClass(WordCountMapper.class);
		job.setReducerClass(WordCountReducer.class);
		
		//job.setInputFormatClass(TextInputFormat.class);
		job.setInputFormatClass(MyCombinedFilesInputFormat.class);
		// Cap each combined split at 50,000 bytes (mapreduce.input.fileinputformat.split.maxsize)
		conf.set(FileInputFormat.SPLIT_MAXSIZE, "50000");
		job.setOutputFormatClass(TextOutputFormat.class);
		
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		return job.waitForCompletion(true)?0:1;
	}

	public static void main(String[] args) {
		int result = 0;
		try {
			result = ToolRunner.run(new Configuration(), 
							new WordCountJob(),
							args);
		} catch (Exception e) {
			e.printStackTrace();
		}
		System.exit(result);
	}

}
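
To exercise the combined input format, package the classes and run the driver like any other Tool (the jar name and HDFS paths below are only placeholders):

hadoop jar wordcount.jar wordcount.WordCountJob input output

With mapreduce.input.fileinputformat.split.maxsize set to 50000, many small files under the input path are packed into combined splits of at most roughly 50 KB each, so the job launches far fewer map tasks than TextInputFormat would for the same directory.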

Demo: Processing Multiple Inputs


ids_states.txt

1,CA
4,SD
1,NY
6,CO

names_ids.txt

4 Rich
5 Barry
12 George
1 Ulf
2 Danielle
9 Tom
3 Manish
6 Mark

package demo;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MultipleInputFiles extends Configured implements Tool {

	public static class NamesMapper extends Mapper<LongWritable, Text, Text, Text> {
		private Text outputValue = new Text();
		private Text outputKey = new Text();
		
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String currentLine = value.toString();
			String [] words = StringUtils.split(currentLine, '\\', '\t');
			outputKey.set(words[0]);
			outputValue.set(words[1]);
			context.write(outputKey, outputValue);
		}
	}
	
	public static class StatesMapper extends Mapper<LongWritable, Text, Text, Text> {
		private Text outputValue = new Text();
		private Text outputKey = new Text();
		
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String currentLine = value.toString();
			String [] words = StringUtils.split(currentLine, '\\', ',');
			outputKey.set(words[0]);
			outputValue.set(words[1]);
			context.write(outputKey, outputValue);
		}
	}

	public static class MultiInputReducer extends Reducer<Text, Text, Text, Text> {
		private Text outputValue = new Text();

		@Override
		protected void reduce(Text key, Iterable<Text> values,
				Context context)
				throws IOException, InterruptedException {
			StringBuilder output = new StringBuilder();
			for(Text value : values) {
				output.append(value.toString() + ",");
			}
			outputValue.set(output.toString());
			context.write(key, outputValue);
		}
	}
	
	@Override
	public int run(String[] args) throws Exception {
		Job job = Job.getInstance(getConf(), "MultipleInputFilesJob");
		Configuration conf = job.getConfiguration();
		job.setJarByClass(getClass());
		
		Path names = new Path("multiinputs/names_ids.txt");
		Path states = new Path("multiinputs/ids_states.txt");
		MultipleInputs.addInputPath(job, names, TextInputFormat.class, NamesMapper.class);
		MultipleInputs.addInputPath(job, states, TextInputFormat.class, StatesMapper.class);
		
		Path out = new Path("multiinputs/output");
		out.getFileSystem(conf).delete(out, true);
		FileOutputFormat.setOutputPath(job, out);
		
		job.setReducerClass(MultiInputReducer.class);
		job.setNumReduceTasks(1);
		job.setOutputFormatClass(TextOutputFormat.class);
		
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		
		return job.waitForCompletion(true)?0:1;
	}

	public static void main(String[] args) {
		int result = 0;
		try {
			result = ToolRunner.run(new Configuration(), 
							new MultipleInputFiles(),
							args);
		} catch (Exception e) {
			e.printStackTrace();
		}
		System.exit(result);
	}

}
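
With the two input files above, the reducer output should contain one line per id with all of that id's values concatenated. The keys are Text, so they sort lexicographically ("12" before "2"), and the order of the values within a key is not guaranteed; the result should look roughly like:

1	CA,NY,Ulf,
12	George,
2	Danielle,
3	Manish,
4	SD,Rich,
5	Barry,
6	CO,Mark,
9	Tom,

The trailing comma on each line comes from the reducer appending "," after every value.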

Lab: Writing a Custom InputFormat


exchange,stock_symbol,date,stock_price_open,stock_price_high,stock_price_low,stock_price_close,stock_volume,stock_price_adj_close
NYSE,JEF,2010-02-08,25.40,25.49,24.78,24.82,1134300,24.82
NYSE,JEF,2010-02-05,24.91,25.19,24.08,25.01,1765200,25.01
NYSE,JEF,2010-02-04,26.01,26.20,24.85,24.85,1414400,24.85


package average;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MovingAveragePreprocessor extends Configured implements Tool {

	public static class PreprocessorMapper extends Mapper<Stock, StockPrices, Stock, DoubleWritable> {
		private DoubleWritable outputValue = new DoubleWritable();
		
		@Override
		protected void map(Stock key, StockPrices value, Context context)
				throws IOException, InterruptedException {
			outputValue.set(value.getClose());
			context.write(key, outputValue);
		}
		
		
	}
	
	

	@Override
	public int run(String[] arg0) throws Exception {
		Configuration conf = super.getConf();
		Job job = Job.getInstance(conf, "MovingAveragePreprocessor");
		job.setJarByClass(MovingAveragePreprocessor.class);

		Path out = new Path("closingprices");
		FileInputFormat.setInputPaths(job, "stocks");
		FileOutputFormat.setOutputPath(job, out);
		out.getFileSystem(conf).delete(out, true);

		job.setMapperClass(PreprocessorMapper.class);
		job.setReducerClass(Reducer.class);
		job.setInputFormatClass(StockInputFormat.class);		
		job.setOutputFormatClass(SequenceFileOutputFormat.class);
		job.setOutputKeyClass(Stock.class);
		job.setOutputValueClass(DoubleWritable.class);
		job.setMapOutputKeyClass(Stock.class);
		job.setMapOutputValueClass(DoubleWritable.class);

		job.setNumReduceTasks(1);
		
		return job.waitForCompletion(true) ? 0 : 1;
	}

	public static void main(String[] args) {
		int result = 0;
		try {
			result = ToolRunner.run(new Configuration(), new MovingAveragePreprocessor(), args);
		}
		catch (Exception e) {
			e.printStackTrace();
		}
		System.exit(result);

	}

}

package average;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.StringUtils;

public class StockInputFormat extends FileInputFormat<Stock, StockPrices> {
	
	public static class StockReader extends RecordReader<Stock, StockPrices> {
        private Stock key = new Stock();
        private StockPrices value = new StockPrices();
        private LineReader in;
        private long start;
        private long end;
        private long currentPos;
        private Text line = new Text();
        
        
        
		@Override
		public void initialize(InputSplit split, TaskAttemptContext context)
				throws IOException, InterruptedException {
			FileSplit fileSplit = (FileSplit)split;
			Configuration configuration = context.getConfiguration();
			Path path = fileSplit.getPath();
			FSDataInputStream is = path.getFileSystem(configuration).open(path);
			in = new LineReader(is, configuration);
			start = fileSplit.getStart();
			end = start + fileSplit.getLength();
			is.seek(start);
			if (start != 0) {
				// Not at the start of the file: skip the partial first line, since the
				// reader for the previous split will consume it
				start += in.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
			}
			currentPos = start;
		}

		@Override
		public boolean nextKeyValue() throws IOException, InterruptedException {
			if (currentPos > end) {
				return false;
			}
			currentPos += in.readLine(line);
			if (line.getLength() == 0) {
				return false;
			}
			if (line.toString().startsWith("exchange")) {
				// Skip the CSV header line and read the next record instead
				currentPos += in.readLine(line);
			}
			String[] values = StringUtils.split(line.toString(), ',');
			key.setSymbol(values[1]);
			key.setDate(values[2]);
			value.setOpen(Double.parseDouble(values[3]));
			value.setHigh(Double.parseDouble(values[4]));
			value.setLow(Double.parseDouble(values[5]));
			value.setClose(Double.parseDouble(values[6]));
			value.setVolume(Integer.parseInt(values[7]));
			value.setAdjustedClose(Double.parseDouble(values[8]));
			return true;
		}

		@Override
		public Stock getCurrentKey() throws IOException, InterruptedException {
			return key;
		}

		@Override
		public StockPrices getCurrentValue() throws IOException,
				InterruptedException {
			return value;
		}

		@Override
		public float getProgress() throws IOException, InterruptedException {
			// Report the fraction of this split consumed; use float arithmetic, since
			// the original long division always truncated the result to 0
			if (end == start) {
				return 1.0f;
			}
			return Math.min(1.0f, (currentPos - start) / (float) (end - start));
		}

		@Override
		public void close() throws IOException {
			in.close();
		}

	}

	@Override
	public RecordReader<Stock, StockPrices> createRecordReader(
			InputSplit split, TaskAttemptContext context) throws IOException,
			InterruptedException {
		
		
		return new StockReader();
	}

}

package average;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class Stock implements WritableComparable<Stock> {
	private String symbol;
	private String date;

	@Override
	public void readFields(DataInput in) throws IOException {
		symbol = in.readUTF();
		date = in.readUTF();
	}

	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(symbol);
		out.writeUTF(date);
	}

	public String getSymbol() {
		return symbol;
	}

	public void setSymbol(String symbol) {
		this.symbol = symbol;
	}

	public String getDate() {
		return date;
	}

	public void setDate(String date) {
		this.date = date;
	}

	@Override
	public int compareTo(Stock arg0) {
		int response = symbol.compareTo(arg0.symbol);
		if(response == 0) {
			response = date.compareTo(arg0.date);
		}
		return response;
	}

	public String toString() {
		return symbol + "\t" + date;
	}
}

package average;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class StockPrices implements Writable {
	private double open, high, low, close, adjustedClose;
	private int volume;
	
	@Override
	public void readFields(DataInput in) throws IOException {
		open = in.readDouble();
		high = in.readDouble();
		low = in.readDouble();
		close = in.readDouble();
		adjustedClose = in.readDouble();
		volume = in.readInt();
	}

	@Override
	public void write(DataOutput out) throws IOException {
		out.writeDouble(open);
		out.writeDouble(high);
		out.writeDouble(low);
		out.writeDouble(close);
		out.writeDouble(adjustedClose);
		out.writeInt(volume);
	}

	public double getOpen() {
		return open;
	}

	public void setOpen(double open) {
		this.open = open;
	}

	public double getHigh() {
		return high;
	}

	public void setHigh(double high) {
		this.high = high;
	}

	public double getLow() {
		return low;
	}

	public void setLow(double low) {
		this.low = low;
	}

	public double getClose() {
		return close;
	}

	public void setClose(double close) {
		this.close = close;
	}

	public double getAdjustedClose() {
		return adjustedClose;
	}

	public void setAdjustedClose(double adjustedClose) {
		this.adjustedClose = adjustedClose;
	}

	public int getVolume() {
		return volume;
	}

	public void setVolume(int volume) {
		this.volume = volume;
	}
}






