Hadoop: my reduce was never executed, so values with the same key were not grouped together

While using Hadoop to filter duplicate records I hit a problem: the duplicates were not removed, they were only sorted and all written out. I then wrote a word-count program and got exactly the same behavior: values belonging to the same key were not brought together. The effect is shown below.

This is the original data:

This is the data after processing:

The job only split each line into words; it did not bring together the records that share the same key.
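The screenshots are not reproduced here, so here is a hypothetical illustration of the symptom (the actual data was different). Given an input file containing

hello world hello
world hadoop

the expected word-count result would be

hadoop	1
hello	2
world	2

but the actual output listed every occurrence separately, merely sorted by key:

hadoop	1
hello	1
hello	1
world	1
world	1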

The original code is as follows:

// DataDeduplicationMapper.java
package ccnu.eisr;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class DataDeduplicationMapper extends Mapper<LongWritable, Text, Text, LongWritable>{
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //value is one line of text read in by the MapReduce framework
        String line = value.toString();
        String[] words = line.split(" ");
        for (String word : words) {
            //emit the word as the key and a count of 1 as the value; these pairs are sent to reduce
            context.write(new Text(word), new LongWritable(1));
        }
		
	}
	
}

// DataDeduplicationReducer.java
package ccnu.eisr;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class DataDeduplicationReducer extends Reducer<Text, Text, Text, LongWritable>{
	protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
		long count = 0;
        for (LongWritable num : values) {
            count += num.get();
        }
        context.write(key, new LongWritable(count));
	}
}

// DataDeduplicationRunner.java
package ccnu.eisr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


public class DataDeduplicationRunner {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job wcjob = Job.getInstance(conf,"dedup");
		
		//set the jar that contains the classes this job needs
		wcjob.setJarByClass(DataDeduplicationRunner.class);
		//set which Mapper class the job uses
		wcjob.setMapperClass(DataDeduplicationMapper.class);
		//set which Reducer class the job uses
		wcjob.setReducerClass(DataDeduplicationReducer.class);
		
		wcjob.setCombinerClass(DataDeduplicationReducer.class);
		//key/value types output by the mapper
		wcjob.setMapOutputKeyClass(Text.class);
		wcjob.setMapOutputValueClass(LongWritable.class);
		
		//key/value types output by the reducer
		wcjob.setOutputKeyClass(Text.class);
		wcjob.setOutputValueClass(LongWritable.class);
		
		//path of the raw input data to process
		FileInputFormat.setInputPaths(wcjob, "hdfs://127.0.0.1:9000/datadedup");
		//path where the processed results are written
		FileOutputFormat.setOutputPath(wcjob, new Path("hdfs://127.0.0.1:9000/output"));
		
		boolean res = wcjob.waitForCompletion(true);
		System.exit(res?0:1);
	}
}

I tried many things and even began to suspect the machine itself; as a last resort I rebooted the server, but the result was still wrong. After more than a day of struggling the insight finally came: I changed the reducer's output and the result did not change at all, so my reducer clearly was not being executed and the default reducer was running instead. Since my class extends org.apache.hadoop.mapreduce.Reducer, the problem had to be in how the method was inherited, so I added @Override to the reduce method, and sure enough the compiler reported an error: the types did not match. Looking closely, the class was declared as Reducer&lt;Text, Text, Text, LongWritable&gt; while the reduce method takes Iterable&lt;LongWritable&gt;, so my method was merely an overload and never overrode the inherited reduce. A small moment of carelessness caused the whole problem.
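To make the failure mode concrete: when no matching override exists, the framework calls the reduce method inherited from org.apache.hadoop.mapreduce.Reducer, which is essentially an identity pass-through. Paraphrased from the Hadoop source (treat this as a sketch, not the exact code), the inherited default looks roughly like this:

// Default reduce inherited from org.apache.hadoop.mapreduce.Reducer:
// every (key, value) pair is re-emitted unchanged, which is why the output
// looked like the sorted map output instead of aggregated counts.
protected void reduce(KEYIN key, Iterable<VALUEIN> values, Context context)
        throws IOException, InterruptedException {
    for (VALUEIN value : values) {
        context.write((KEYOUT) key, (VALUEOUT) value);
    }
}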

The corrected code is as follows:

// DataDeduplicationMapper.java
package ccnu.eisr;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class DataDeduplicationMapper extends Mapper<LongWritable, Text, Text, LongWritable>{
	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //value is one line of text read in by the MapReduce framework
        String line = value.toString();
        String[] words = line.split(" ");
        for (String word : words) {
            //emit the word as the key and a count of 1 as the value; these pairs are sent to reduce
            context.write(new Text(word), new LongWritable(1));
        }
		
	}
	
}

// DataDeduplicationReducer.java
package ccnu.eisr;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class DataDeduplicationReducer extends Reducer<Text, LongWritable, Text, LongWritable>{
	@Override
	protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
		long count = 0;
        for (LongWritable num : values) {
            count += num.get();
        }
        context.write(key, new LongWritable(count));
	}
}

// DataDeduplicationRunner.java
package ccnu.eisr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


public class DataDeduplicationRunner {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job wcjob = Job.getInstance(conf,"dedup");
		
		//set the jar that contains the classes this job needs
		wcjob.setJarByClass(DataDeduplicationRunner.class);
		//set which Mapper class the job uses
		wcjob.setMapperClass(DataDeduplicationMapper.class);
		//set which Reducer class the job uses
		wcjob.setReducerClass(DataDeduplicationReducer.class);
		
		wcjob.setCombinerClass(DataDeduplicationReducer.class);
		//key/value types output by the mapper
		wcjob.setMapOutputKeyClass(Text.class);
		wcjob.setMapOutputValueClass(LongWritable.class);
		
		//key/value types output by the reducer
		wcjob.setOutputKeyClass(Text.class);
		wcjob.setOutputValueClass(LongWritable.class);
		
		//path of the raw input data to process
		FileInputFormat.setInputPaths(wcjob, "hdfs://127.0.0.1:9000/datadedup");
		//path where the processed results are written
		FileOutputFormat.setOutputPath(wcjob, new Path("hdfs://127.0.0.1:9000/output"));
		
		boolean res = wcjob.waitForCompletion(true);
		System.exit(res?0:1);
	}
}

To sum up: when code you wrote is not being executed, the cause is very likely that the method name or the parameter types do not match the superclass method, so you have accidentally created an overload instead of an override. To override the inherited method, the name and the parameter types must match exactly, and it is best to always add @Override so the compiler catches the mismatch for you.
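As a small stand-alone illustration of the same trap outside Hadoop (the class and method names below are made up for this sketch): because the parameter type does not match the superclass's generic parameter, the subclass method becomes a new overload, the inherited implementation keeps running, and adding @Override turns the silent mistake into a compile error.

// Hypothetical, non-Hadoop sketch of the overload-vs-override trap.
public class OverrideTrapDemo {

	static class Base<K, V> {
		// Plays the role of the inherited Reducer.reduce default behavior.
		protected void handle(K key, Iterable<V> values) {
			System.out.println("base handle called for " + key);
		}
	}

	static class Sub extends Base<String, Long> {
		// Iterable<Integer> does not match Iterable<Long>, so this method is a
		// new overload rather than an override; Base.handle still gets called.
		// Adding @Override here would make the compiler reject it:
		//   error: method does not override or implement a method from a supertype
		protected void handle(String key, Iterable<Integer> values) {
			System.out.println("sub handle called for " + key);
		}
	}

	public static void main(String[] args) {
		Base<String, Long> b = new Sub();
		b.handle("word", java.util.Arrays.asList(1L, 2L)); // prints "base handle called for word"
	}
}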
