MapReduce Exam Code Summary

1. Numerical summarization: median and standard deviation of the comments dataset

package com.hdfsclient;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.GenericOptionsParser;
public class MedianStdDevTuple implements Writable{

    private float median = 0;
    private float stdDev=0;
    public float getMedian() {
        return median;
    }

    public void setMedian(float median) {
        this.median = median;
    }

    public float getStdDev() {
        return stdDev;
    }

    public void setStdDev(float stdDev) {
        this.stdDev = stdDev;
    }



    public void readFields(DataInput in) throws IOException{
        median=in.readFloat();
        stdDev=in.readFloat();
    }

    public void write(DataOutput out) throws IOException {
        out.writeFloat(median);
        out.writeFloat(stdDev);
    }

    public String toString() { // override toString() to define the output format
        return median + "\t" + stdDev;
    }


    public static Map<String, String> transformXmlToMap(String xml) {
        Map<String, String> map = new HashMap<String, String>();
        try {
            String[] tokens = xml.trim().substring(5, xml.trim().length() - 3).split("\"");
            for (int i = 0; i < tokens.length - 1; i += 2) {
                String key = tokens[i].trim();
                String val = tokens[i + 1];
                map.put(key.substring(0, key.length() - 1), val);
            }
        } catch (StringIndexOutOfBoundsException e) {
            System.err.println(xml);
        }
        return map;
    }

    public static class MedianStdDevMapper extends Mapper <Object, Text, IntWritable, IntWritable> {
        private IntWritable outHour = new IntWritable();
        //private MedianStdDevTuple outCountAverage = new MedianStdDevTuple();
        private IntWritable outCommentLength = new IntWritable();
        private final static SimpleDateFormat frmt = new SimpleDateFormat ("yyyy-MM-dd'T'HH:mm:ss.SSS");

        public void map (Object key, Text value, Context context)
                throws IOException, InterruptedException {

            Map <String, String> parsed = transformXmlToMap (value.toString());

            String strDate = parsed.get("CreationDate");
            String text = parsed.get("Text");
            if (strDate == null || text == null) {
                return; // skip records that are missing the fields we need
            }

            Date creationDate = null;
            try {
                creationDate = frmt.parse(strDate);
            } catch (ParseException e) {
                e.printStackTrace();
                return; // skip records whose CreationDate cannot be parsed
            }

            outHour.set(creationDate.getHours()); // hour of day the comment was created

            outCommentLength.set(text.length());

            context.write(outHour, outCommentLength);
        }
    }

    public static class MedianStdDevReducer
            extends Reducer <IntWritable, IntWritable,IntWritable, MedianStdDevTuple> {
        private MedianStdDevTuple result = new MedianStdDevTuple();
        private ArrayList<Float> commentLengths = new ArrayList<Float>();
        public void reduce (IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            float sum = 0;
            float count = 0;
            commentLengths.clear();
            result.setStdDev(0);
            for (IntWritable val : values) {
                commentLengths.add((float)val.get());
                sum += val.get();
                count++;
            }
            Collections.sort(commentLengths);
            if(count%2==0){
                result.setMedian((commentLengths.get((int)count/2-1)+commentLengths.get((int)count/2))/2.0f);
            }else{
                result.setMedian(commentLengths.get((int)count/2));
            }
            float mean = sum/count;
            float sumOfSquares = 0.0f;
            for(Float f : commentLengths){
                sumOfSquares+=(f-mean)*(f-mean);
            }
            result.setStdDev((float)Math.sqrt(sumOfSquares/(count-1)));
            context.write(key,result);
        }
    }

    /*============================================================================================================*/
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        //FileUtil.fullyDelete(new File("output7"));
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

        if (otherArgs.length != 2) {
            System.err.println("Usage: MedianStdDevTuple <in> <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, "MedianStdDev");
        job.setJarByClass(MedianStdDevTuple.class);
        job.setMapperClass(MedianStdDevMapper.class);
        job.setReducerClass(MedianStdDevReducer.class);
        job.setMapOutputValueClass(IntWritable.class); // the mapper emits IntWritable values, the reducer emits MedianStdDevTuple
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(MedianStdDevTuple.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
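
For reference, here is a minimal standalone check of transformXmlToMap (not part of the original exam code). The sample row is hypothetical, but it follows the <row attr="value" ... /> layout the parser slices up, with CreationDate in the yyyy-MM-dd'T'HH:mm:ss.SSS format the mapper expects:

package com.hdfsclient;

import java.util.Map;

public class TransformXmlToMapDemo {
    public static void main(String[] args) {
        // hypothetical comment row in the attribute="value" layout expected by transformXmlToMap
        String row = "<row Id=\"1\" PostId=\"2\" Text=\"Nice answer\" "
                + "CreationDate=\"2012-01-01T12:30:15.123\" UserId=\"3\" />";
        Map<String, String> parsed = MedianStdDevTuple.transformXmlToMap(row);
        System.out.println(parsed.get("CreationDate")); // 2012-01-01T12:30:15.123
        System.out.println(parsed.get("Text"));         // Nice answer
    }
}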

2. Reduce-side join and partitioning

Mapper implementation
package four;


import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;


public class JoinMapper extends Mapper<LongWritable, Text, IntWritable, Text>{
	 
	// file name tags used to tell the two input files apart
	private static final String LEFT_FILENAME = "product_info.txt";
	private static final String RIGHT_FILENAME = "product_quantity.txt";
	
	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		
		// get the file path from the input split
		// FileSplit is the concrete subclass of InputSplit that records which file slice this mapper is reading
		String filePath = ((FileSplit) context.getInputSplit()).getPath().toString();
		// tag identifying the source file
		String fileFlag = null;
		// output key (product id)
		String outKey = null;
		// output value (selected columns of the record)
		String outValue = null;
		// fields of the current line
		String[] infos = value.toString().split(",");
		
		// decide which file this record came from
		if (filePath.contains(LEFT_FILENAME)) {
			if(infos.length == 22){
				String temp = null;
				fileFlag = LEFT_FILENAME;
				outKey = infos[0];
				temp = infos[1]+"  "+infos[2]+"   "+infos[3];
				outValue = temp;
			}
		} 
		else if (filePath.contains(RIGHT_FILENAME)) {
			if(infos.length == 9 ){
				String temp = null;
				temp = infos[1]+"  "+infos[2]+"   "+infos[3];
				fileFlag = RIGHT_FILENAME;
				outKey = infos[0];
				outValue = temp;
			}
		}
		
		// skip lines that matched neither schema, otherwise outKey/outValue would still be null
		if (outKey == null || outValue == null) {
			return;
		}
		// emit the key/value pair, tagging the value with its source file name
		context.write(new IntWritable(Integer.parseInt(outKey)), new Text(outValue + "\t" + fileFlag));
	}
	

	
}

Reducer implementation
package four;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;


public class JoinReducer extends Reducer<IntWritable, Text, Text, Text>{
	
	
	
	// file name tags used to tell the two input files apart
	private static final String LEFT_FILENAME = "product_info.txt";
	private static final String RIGHT_FILENAME = "product_quantity.txt";
	
        private static int num = 0;
 
	@Override
	protected void reduce(IntWritable key, Iterable<Text> values, Context context)
			throws IOException, InterruptedException {
		
		// count reduce() invocations and print the key (handy for checking the partitioning)
		num++;
		System.out.println(num + " " + key);
 
		// product description from the left table (product_info.txt)
		String productInfo = null;
		// quantity records from the right table (product_quantity.txt)
		List<String> quantityRecords = new ArrayList<String>();

		// use the file-name tag to separate the two sides of the join
		for (Text value : values) {
			String[] infos = value.toString().split("\t");
			if (LEFT_FILENAME.equals(infos[1])) {
				productInfo = infos[0];
			}
			else if (RIGHT_FILENAME.equals(infos[1])) {
				quantityRecords.add(infos[0]);
			}
		}
		// drop keys that cannot form an inner join (missing on one side)
		if (productInfo == null || quantityRecords.size() == 0) {
			return;
		}

		// emit one (product info, quantity record) pair per match
		for (int i = 0; i < quantityRecords.size(); i++) {
			context.write(new Text(productInfo), new Text(quantityRecords.get(i)));
		}
	}
 
}

Partitioner implementation:
package four;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class MyPartitioneryy extends Partitioner<IntWritable, Text> {  
    @Override  
    public int getPartition(IntWritable key, Text value, int numPartitions) {  
        int result = 0;  
        /*********************************************************************/
        /*** key.toString().equals("long")  must use toString()!!!!           ***/
        /*** At first I didn't, so everything went to one partition and the   ***/
        /*** results all ended up in a single reduce output file.             ***/
        /*********************************************************************/
        if (key.get() <= 1000) {
            result = 0;
        } else if (key.get() <= 2000) {
            result = 1;
        } else if (key.get() <= 3000) {
            result = 2;
        } else {
            result = 3;
        }

        return result;
    }  
} 
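
A quick sanity check of the key ranges above (a minimal standalone sketch, not part of the original code; it simply calls getPartition directly):

package four;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

public class MyPartitioneryyDemo {
    public static void main(String[] args) {
        MyPartitioneryy partitioner = new MyPartitioneryy();
        Text dummy = new Text("");
        // keys <= 1000 go to partition 0, <= 2000 to 1, <= 3000 to 2, everything else to 3
        System.out.println(partitioner.getPartition(new IntWritable(500), dummy, 4));  // 0
        System.out.println(partitioner.getPartition(new IntWritable(1500), dummy, 4)); // 1
        System.out.println(partitioner.getPartition(new IntWritable(2500), dummy, 4)); // 2
        System.out.println(partitioner.getPartition(new IntWritable(9999), dummy, 4)); // 3
    }
}

The driver below sets job.setNumReduceTasks(4) so that each of the four key ranges gets its own reducer and its own output file.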
Driver
package four;



import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;


 
/*
 * Driver class for the join job.
 */
public class MR_Join {
 
	public static void main(String[] args) throws Exception {  
        Configuration conf = new Configuration();  
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();  
        if (otherArgs.length != 2) {  
            System.err.println("Usage: MR_Join <in> <out>");
            System.exit(2);  
        }
        conf.set("mapred.jar", "mp1.jar");
        Job job = Job.getInstance(conf, "MR_Join");
        job.setNumReduceTasks(4); 

        job.setJarByClass(MR_Join.class); 
        job.setMapperClass(JoinMapper.class); 
        job.setMapOutputKeyClass(IntWritable.class);  
        job.setMapOutputValueClass(Text.class);  
          
        job.setPartitionerClass(MyPartitioneryy.class);
        job.setReducerClass(JoinReducer.class);  
          
        job.setOutputKeyClass(Text.class);  
        job.setOutputValueClass(Text.class);  
          
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));  
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));  
        System.exit(job.waitForCompletion(true) ? 0 : 1);  
    }  
}


3. Binning

package binning;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class BinningByTags {
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "binningByTag");

        job.setJarByClass(BinningByTags.class);
        job.setMapperClass(BinningByTagsMapper.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        MultipleOutputs.addNamedOutput(job, "bins", TextOutputFormat.class, Text.class, NullWritable.class);
        MultipleOutputs.setCountersEnabled(job, true);
        job.setNumReduceTasks(0);
        job.waitForCompletion(true);
    }

    public static class BinningByTagsMapper extends Mapper<Object, Text, Text, NullWritable> {

        private MultipleOutputs<Text, NullWritable> mos = null;

        protected void setup(Context context) {
            // Create a new MultipleOutputs using the context object
            mos = new MultipleOutputs<Text, NullWritable>(context);
        }

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {

            String datas[] = value.toString().split("\t");
            if (datas.length < 3) {
                return; // skip malformed lines that have no tag column
            }

            String tag = datas[2];
            if (tag.equalsIgnoreCase("hadoop")) {
                mos.write("bins", value, NullWritable.get(), "hadoop-tag");
            } else if (tag.equalsIgnoreCase("hive")) {
                mos.write("bins", value, NullWritable.get(), "hive-tag");
            } else if (tag.equalsIgnoreCase("pig")) {
                mos.write("bins", value, NullWritable.get(), "pig-tag");
            } else if (tag.equalsIgnoreCase("hbase")) {
                mos.write("bins", value, NullWritable.get(), "hbase-tag");
            } else {
                mos.write("bins", value, NullWritable.get(), "other");
            }
        }

        protected void cleanup(Context context) throws IOException, InterruptedException {
            // Close MultipleOutputs, otherwise nothing written through it gets flushed
            mos.close();
        }
    }

}
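
The fourth argument of mos.write above is the base output path, so each record lands in a file whose name starts with its tag (e.g. hadoop-tag-m-00000) inside the job output directory. As a small design note, the if/else chain could also be written as a lookup table; a standalone sketch (not in the original) using the same bin names:

import java.util.HashMap;
import java.util.Map;

public class BinNameLookupDemo {
    // maps a lower-cased tag to the baseOutputPath used by BinningByTagsMapper
    private static final Map<String, String> BIN_NAMES = new HashMap<String, String>();
    static {
        BIN_NAMES.put("hadoop", "hadoop-tag");
        BIN_NAMES.put("hive", "hive-tag");
        BIN_NAMES.put("pig", "pig-tag");
        BIN_NAMES.put("hbase", "hbase-tag");
    }

    public static String binFor(String tag) {
        String bin = BIN_NAMES.get(tag.toLowerCase());
        return bin != null ? bin : "other"; // unknown tags fall back to the "other" bin
    }

    public static void main(String[] args) {
        System.out.println(binFor("Hadoop")); // hadoop-tag
        System.out.println(binFor("spark"));  // other
    }
}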

4. Inverted index

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class InvertedIndexMapper extends Mapper<LongWritable, Text, Text, Text> {
	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		// default RecordReader: LineRecordReader;  key: line offset;  value: line string
		FileSplit fileSplit = (FileSplit) context.getInputSplit();
		String fileName = fileSplit.getPath().getName();
		Text word = new Text();
		Text fileName_lineOffset = new Text(fileName + "@" + key.toString());
		StringTokenizer itr = new StringTokenizer(value.toString());
		while (itr.hasMoreTokens()) {
			word.set(itr.nextToken());
			context.write(word, fileName_lineOffset);
		}
	}
}

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class InvertedIndexReducer extends Reducer<Text, Text, Text, Text> {
	@Override
	protected void reduce(Text key, Iterable<Text> values, Context context)
			throws IOException, InterruptedException {
		// concatenate all "fileName@offset" postings for this word, separated by ";"
		Iterator<Text> it = values.iterator();
		StringBuilder all = new StringBuilder();
		if (it.hasNext()) {
			all.append(it.next().toString());
		}
		while (it.hasNext()) {
			all.append(";");
			all.append(it.next().toString());
		}
		context.write(key, new Text(all.toString()));
	}
}
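
The original listing stops at the mapper and reducer; here is a minimal driver sketch to go with them (not part of the original, class and job names are my own; it assumes the default TextInputFormat, so the mapper key is the LongWritable line offset used above):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class InvertedIndexDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "InvertedIndex");
        job.setJarByClass(InvertedIndexDriver.class);
        job.setMapperClass(InvertedIndexMapper.class);
        job.setReducerClass(InvertedIndexReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}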

5. Reduce-side join via Cartesian product

package com.hadoop.reducejoin.test;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/*
 * Two large tables.
 * Implements a reduce-side join via a Cartesian product.
 * Applicable when the join key is not unique in either table (covers one-to-many and many-to-many relations).
 */
public class ReduceJoinByCartesianProduct {
    /**
     * Tags each key/value pair with the table (file) it came from, so records from different
     * sources can be told apart. The join field becomes the key; the remaining columns plus
     * the tag become the value, which is then emitted.
     */
    public static class ReduceJoinByCartesianProductMapper extends Mapper<Object,Text,Text,Text>{
        private Text joinKey=new Text();
        private Text combineValue=new Text();
        
        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String pathName=((FileSplit)context.getInputSplit()).getPath().toString();
            // if the record comes from records.txt, prefix it with the records tag
            if(pathName.endsWith("records.txt")){
                String line = value.toString();
                String[] valueItems = line.split("\\s+");
                // filter out dirty data
                if(valueItems.length!=3){
                    return;
                }
                joinKey.set(valueItems[0]);
                combineValue.set("records.txt" + valueItems[1] + "\t" + valueItems[2]);
            } else if (pathName.endsWith("station.txt")) {
                // if the record comes from station.txt, prefix it with the station tag
                String line = value.toString();
                String[] valueItems = line.split("\\s+");
                // filter out dirty data
                if(valueItems.length!=2){
                    return;
                }
                joinKey.set(valueItems[0]);
                combineValue.set("station.txt" + valueItems[1]);
            }
            context.write(joinKey,combineValue);
        }
    }
    /*
     * The reduce side performs the Cartesian product.
     */
     public static class ReduceJoinByCartesianProductReducer extends Reducer<Text,Text,Text,Text>{
            private List<String> leftTable=new ArrayList<String>();
            private List<String> rightTable=new ArrayList<String>();
            private Text result=new Text();
            @Override
            protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
                // the buffers must be cleared for every new key
                leftTable.clear();
                rightTable.clear();
                // records with the same key are grouped together; separate them by source table, then take the Cartesian product
                for(Text value : values){
                    String val=value.toString();
                    if(val.startsWith("station.txt")){
                        leftTable.add(val.replaceFirst("station.txt",""));
                    }else if(val.startsWith("records.txt")){
                        rightTable.add(val.replaceFirst("records.txt",""));
                    }
                }
                // Cartesian product
                for(String leftPart:leftTable){
                    for(String rightPart:rightTable){
                        result.set(leftPart+"\t"+rightPart);
                        context.write(key, result);
                    }
                }
            }
        }
     
     public static void main(String[] arg0) throws Exception{
            Configuration conf = new Configuration();
            String[] args = {"hdfs://sparks:9000/middle/reduceJoin/records.txt"
                    ,"hdfs://sparks:9000/middle/reduceJoin/station.txt"
                    ,"hdfs://sparks:9000/middle/reduceJoin/JoinByCartesian-out"
            };
            String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
            if (otherArgs.length < 2) {
                System.err.println("Usage: reducejoin <in> [<in>...] <out>");
                System.exit(2);
            }
            
            // output path
            Path mypath = new Path(otherArgs[otherArgs.length - 1]);
            FileSystem hdfs = mypath.getFileSystem(conf); // file system of the output path (deleted below if it already exists)
            if (hdfs.isDirectory(mypath)) {
                hdfs.delete(mypath, true);
            }
            Job job = Job.getInstance(conf, "ReduceJoinByCartesianProduct");
            job.setJarByClass(ReduceJoinByCartesianProduct.class);
            job.setMapperClass(ReduceJoinByCartesianProductMapper.class);
            job.setReducerClass(ReduceJoinByCartesianProductReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            // add input paths (all arguments except the last)
            for (int i = 0; i < otherArgs.length - 1; ++i) {
                FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
            }
            // set the output path (the last argument)
            FileOutputFormat.setOutputPath(job,
                    new Path(otherArgs[otherArgs.length - 1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
}
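
As the split("\\s+") checks above imply, station.txt is expected to have two whitespace-separated columns keyed by station id, and records.txt three. A tiny standalone sketch (hypothetical value, not from the original data) of the tag-and-strip convention the mapper and reducer share:

public class TagRoundTripDemo {
    public static void main(String[] args) {
        // the mapper prefixes each value with its source file name...
        String tagged = "station.txt" + "Beijing";
        // ...and the reducer recognises the prefix and strips it before the Cartesian product
        System.out.println(tagged.startsWith("station.txt"));       // true
        System.out.println(tagged.replaceFirst("station.txt", "")); // Beijing
    }
}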
