hadoop在過濾重複數據的問題中出現了一些問題,沒有將相同的數據去掉,而是排好序都呈現了出來,於是我又寫了一個字符計數的程序,也是這種效果,沒有將同一個key的value放在一起,效果圖如下
這個是原始數據
這個是處理之後的數據
僅僅是將每行的數據進行切分了,沒有將key相同的放在一起。
原始代碼如下
package ccnu.eisr;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
// NOTE(review): this is the article's "before" listing. The mapper itself is
// correct; the bug discussed below lives in the matching reducer.
public class DataDeduplicationMapper extends Mapper<LongWritable, Text, Text, LongWritable>{
// Tokenizes one input line and emits (word, 1) for every space-separated token.
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//One line of input text as read by MapReduce
String line = value.toString();
String[] words = line.split(" ");
for (String word : words) {
//Emit the word as the key and 1 as the count; this pair is routed to the reducer
context.write(new Text(word), new LongWritable(1));
}
}
}
package ccnu.eisr;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
// BUG (the point of this article): the superclass is parameterized as
// Reducer<Text, Text, Text, LongWritable>, so the inherited method signature is
// reduce(Text, Iterable<Text>, Context). The method below declares
// Iterable<LongWritable> instead, so it OVERLOADS rather than OVERRIDES, and
// Hadoop silently runs the default identity reduce (values pass through
// unsummed). An @Override annotation would have made this a compile error.
public class DataDeduplicationReducer extends Reducer<Text, Text, Text, LongWritable>{
protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
long count = 0;
// Sum all the 1s emitted for this key
for (LongWritable num : values) {
count += num.get();
}
context.write(key, new LongWritable(count));
}
}
package ccnu.eisr;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
// Driver: configures and submits the dedup/word-count job.
// NOTE(review): identical in the "before" and "after" listings; only the
// mapper/reducer classes it wires in changed between the two versions.
public class DataDeduplicationRunner {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job wcjob = Job.getInstance(conf,"dedup");
//Ship the jar that contains this class (and the mapper/reducer) to the cluster
wcjob.setJarByClass(DataDeduplicationRunner.class);
//Which mapper class the job uses
wcjob.setMapperClass(DataDeduplicationMapper.class);
//Which reducer class the job uses
wcjob.setReducerClass(DataDeduplicationReducer.class);
// The reducer doubles as a combiner; summing partial counts is associative
wcjob.setCombinerClass(DataDeduplicationReducer.class);
//Key/value types emitted by the mapper
wcjob.setMapOutputKeyClass(Text.class);
wcjob.setMapOutputValueClass(LongWritable.class);
//Key/value types emitted by the reducer (final output)
wcjob.setOutputKeyClass(Text.class);
wcjob.setOutputValueClass(LongWritable.class);
//HDFS path holding the raw input data
FileInputFormat.setInputPaths(wcjob, "hdfs://127.0.0.1:9000/datadedup");
//HDFS path where the results are written (must not already exist)
FileOutputFormat.setOutputPath(wcjob, new Path("hdfs://127.0.0.1:9000/output"));
boolean res = wcjob.waitForCompletion(true);
System.exit(res?0:1);
}
}
嘗試了很多種方法,感覺是不是機器出問題了,最壞的情況下連服務器都重啓了,結果還是不對。經過一天多的折騰,最後靈感出現了:我將reducer的輸出改了一下,發現結果還是沒變,於是想到可能是沒有走我寫的reducer,而是走了默認的reducer。因爲reducer是繼承了org.apache.hadoop.mapreduce.Reducer,可能是繼承出現了問題,於是加上了覆蓋註解@Override,果然就報錯了,說類型不匹配。仔細一看,還真是出問題了,都是自己的馬虎,導致了這個小問題。
正確代碼如下
package ccnu.eisr;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Emits (word, 1) for every space-separated token of each input line.
 * Input key is the byte offset of the line (unused); input value is the line text.
 */
public class DataDeduplicationMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    // Reused output instances: context.write serializes the pair immediately,
    // so one Text per task avoids allocating two objects per emitted word
    // (a standard Hadoop optimization).
    private final Text outKey = new Text();
    private static final LongWritable ONE = new LongWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // One line of input text as read by the InputFormat
        String line = value.toString();
        // NOTE: split(" ") keeps empty tokens for consecutive spaces; kept as-is
        // to preserve the original behavior exactly.
        for (String word : line.split(" ")) {
            outKey.set(word);
            // Emit the word with count 1; routed to the combiner/reducer
            context.write(outKey, ONE);
        }
    }
}
package ccnu.eisr;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Sums the per-word counts emitted by the mapper and writes (word, total).
 * The generic parameters now match the reduce signature, so @Override compiles —
 * this is the fix the article describes. Safe to reuse as a combiner because
 * addition is associative.
 */
public class DataDeduplicationReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long total = 0L;
        // Accumulate every partial count shuffled to this key
        for (LongWritable partial : values) {
            total += partial.get();
        }
        context.write(key, new LongWritable(total));
    }
}
package ccnu.eisr;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver: configures and submits the dedup/word-count job.
 * Usage: DataDeduplicationRunner [inputPath [outputPath]]
 * With no arguments it falls back to the original hard-coded HDFS paths,
 * so existing invocations keep working.
 */
public class DataDeduplicationRunner {

    private static final String DEFAULT_INPUT = "hdfs://127.0.0.1:9000/datadedup";
    private static final String DEFAULT_OUTPUT = "hdfs://127.0.0.1:9000/output";

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job wcjob = Job.getInstance(conf, "dedup");
        // Ship the jar containing this class (and the mapper/reducer) to the cluster
        wcjob.setJarByClass(DataDeduplicationRunner.class);
        wcjob.setMapperClass(DataDeduplicationMapper.class);
        wcjob.setReducerClass(DataDeduplicationReducer.class);
        // The reducer doubles as a combiner; summing partial counts is associative
        wcjob.setCombinerClass(DataDeduplicationReducer.class);
        // Key/value types emitted by the mapper
        wcjob.setMapOutputKeyClass(Text.class);
        wcjob.setMapOutputValueClass(LongWritable.class);
        // Key/value types of the final output
        wcjob.setOutputKeyClass(Text.class);
        wcjob.setOutputValueClass(LongWritable.class);
        // Allow overriding the paths from the command line; default to the
        // original hard-coded locations for backward compatibility.
        String input = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String output = args.length > 1 ? args[1] : DEFAULT_OUTPUT;
        FileInputFormat.setInputPaths(wcjob, input);
        // NOTE: the job fails if the output directory already exists
        FileOutputFormat.setOutputPath(wcjob, new Path(output));
        boolean res = wcjob.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
總結一下:出現沒有執行自己寫的代碼的情況,很有可能是方法名或者方法的參數類型不對,變成了方法重載而不是重寫。這裏一定要重寫父類原來的方法:方法名要一樣,參數類型也要一樣,最好在方法上加上@Override註解,讓編譯器幫忙檢查。