現有一批電話通信清單,每行記錄了用戶A撥打用戶B的一次通話,需要做一個倒排索引,列出撥打給每個用戶B的所有用戶A。如
原有的txt 爲:
首先,我們應該把源文件上傳到HDFS上,然後將原始數據的每一行按空格進行分割,將被叫作爲Key,主叫作爲Value,將撥打相同被叫的主叫號碼彙總起來(以"|"分隔)輸出到HDFS。程序如下:
package com.xxs;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;//此處爲導入的包,一般爲固定的。
public class MapTest_2 extends Configured implements Tool{
enum Counter
{
LINESKIP,
}//出錯的行,出錯計數器
public static class Map extends Mapper<LongWritable,Text,Text,Text>
{
public void map(LongWritable key,Text value,Context context)throws IOException,InterruptedException
{
String line=value.toString();
try
{
String [] lineSplit=line.split(" ");
String anum=lineSplit[0];
String bnum=lineSplit[1];
context.write(new Text(bnum),new Text(anum));
}
catch(java.lang.ArrayIndexOutOfBoundsException e)
{
context.getCounter(Counter.LINESKIP).increment(1);
return;
}
}
}
public static class Reduce extends Reducer<Text,Text,Text,Text>
{
public void reduce(Text key,Iterable<Text>values,Context context)throws IOException,InterruptedException
{
String valueString;
String out="";
for(Text value:values)
{
valueString=value.toString();
out+=valueString+"|";
}
context.write(key, new Text(out));
}
}
public int run(String[] args)throws Exception
{
Configuration conf=getConf();
Job job=new Job(conf,"MapTest_2");//任務名
job.setJarByClass(MapTest_2.class);//指定class
FileInputFormat.addInputPath(job, new Path(args[0]));//輸入路徑
FileOutputFormat.setOutputPath(job,new Path(args[1]));//輸出路徑
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.waitForCompletion(true);
return job.isSuccessful()?0:1;
}
public static void main(String[] args)throws Exception
{
int res=ToolRunner.run(new Configuration(),new MapTest_2(),args);
System.exit(res);
}
}
設置run-run configurations的Arguments爲hdfs://192.168.187.128:9000/user/xxs/input/Text_2.txt hdfs://192.168.187.128:9000/user/xxs/out
其中 out 目錄在運行前不能已經存在,否則作業會報錯退出。
得到最終的結果: