運用hadoop計算TF-IDF

這幾天一直在忙着找暑假實習，實在沒精力來寫新的文章。剛好這幾天放假，我把前幾天做了的另一個例子拿出來跟大家分享一下。

這個例子是使用hadoop來實現TF-IDF。

TF-IDF（term frequency–inverse document frequency）是一種用於資訊檢索與資訊探勘的常用加權技術。具體的信息就麻煩大家自己百度一下了。

因爲要實現的細節比較多，所以我直接將代碼放上來。大家可以參看代碼裏面的註釋，我寫的比較詳細。

我採用了兩個MapReduce任務順序執行來實現TF-IDF功能。

public class My_TDIF { //part1------------------------------------------------------------------------ public static class Mapper_Part1 extends Mapper<LongWritable, Text, Text, Text> { String File_name = ""; //保存文件名，根據文件名區分所屬文件 int all = 0; //單詞總數統計 static Text one = new Text("1"); String word; public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { FileSplit split = (FileSplit) context.getInputSplit(); String str = split.getPath().toString(); File_name = str.substring(str.lastIndexOf("/")+1); //獲取文件名 StringTokenizer itr = new StringTokenizer(value.toString()); while (itr.hasMoreTokens()) { word = File_name ; word += " "; word += itr.nextToken(); //將文件名加單詞作爲key es: test1 hello 1 all++; context.write(new Text(word), one); } } public void cleanup(Context context) throws IOException, InterruptedException { //Map的最後，我們將單詞的總數寫入。下面需要用總單詞數來計算。 String str = ""; str += all; context.write(new Text(File_name + " " + "!"), new Text(str)); //主要這裏值使用的 "!"是特別構造的。因爲!的ascii比所有的字母都小。 } } public static class Combiner_Part1 extends Reducer<Text, Text, Text, Text> { float all = 0; public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException { int index = key.toString().indexOf(" "); //因爲!的ascii最小，所以在map階段的排序後，!會出現在第一個 if (key.toString().substring(index + 1, index + 2).equals("!")){ for (Text val : values) { //獲取總的單詞數。 all = Integer.parseInt(val.toString()); } //這個key-value被拋棄 return; } float sum = 0; //統計某個單詞出現的次數 for (Text val : values) { sum += Integer.parseInt(val.toString()); } //跳出循環後，某個單詞數出現的次數就統計完了，所有 TF(詞頻) = sum / all float tmp = sum / all; String value = ""; value += tmp; //記錄詞頻 //將key中單詞和文件名進行互換。es: test1 hello -> hello test1 String p[] = key.toString().split(" "); String key_to = ""; key_to += p[1]; key_to += " "; key_to += p[0]; context.write(new Text(key_to), new Text(value)); } } public static class Reduce_Part1 extends Reducer<Text, Text, Text, Text> { public void 
reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException { for (Text val : values) { context.write(key, val); } } } public static class MyPartitoner extends Partitioner<Text, Text>{ //實現自定義的Partitioner @Override public int getPartition(Text key, Text value, int numPartitions) { //我們將一個文件中計算的結果作爲一個文件保存 //es： test1 test2 String ip1 = key.toString(); ip1 = ip1.substring(0, ip1.indexOf(" ")); Text p1 = new Text(ip1); return Math.abs((p1.hashCode() * 127) % numPartitions); } } //part2----------------------------------------------------- public static class Mapper_Part2 extends Mapper<LongWritable, Text, Text, Text>{ public void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException{ String val = value.toString().replaceAll(" ", " "); //將vlaue中的TAB分割符換成空格 es: Bank test1 0.11764706 -> Bank test1 0.11764706 int index = val.indexOf(" "); String s1 = val.substring(0, index); //獲取單詞作爲key es: hello String s2 = val.substring(index + 1); //其餘部分作爲value es: test1 0.11764706 s2 += " "; s2 += "1"; //統計單詞在所有文章中出現的次數, “1” 表示出現一次。 es: test1 0.11764706 1 context.write(new Text(s1), new Text(s2)); } } public static class Reduce_Part2 extends Reducer<Text, Text, Text, Text>{ int file_count; public void reduce(Text key, Iterable<Text>values, Context context)throws IOException, InterruptedException{ //同一個單詞會被分成同一個group file_count = context.getNumReduceTasks(); //獲取總文件數 float sum = 0; List<String> vals = new ArrayList<String>(); for (Text str : values){ int index = str.toString().lastIndexOf(" "); sum += Integer.parseInt(str.toString().substring(index + 1)); //統計此單詞在所有文件中出現的次數 vals.add(str.toString().substring(0, index)); //保存 } float tmp = sum / file_count; //單詞在所有文件中出現的次數除以總文件數 = DF for (int j = 0;j < vals.size(); j++){ String val = vals.get(j); String end = val.substring(val.lastIndexOf(" ")); float f_end = Float.parseFloat(end); //讀取TF val += " "; val += tmp; val += " "; val += f_end / tmp; // f_end / tmp = 
tf-idf值 context.write(key, new Text(val)); } } } public static void main(String[] args) throws Exception { Path tmp = new Path("tmp"); //設置中間文件臨時存儲目錄 //part1---------------------------------------------------- Configuration conf1 = new Configuration(); //設置文件個數，在計算DF(文件頻率)時會使用 FileSystem hdfs = FileSystem.get(conf1); FileStatus p[] = hdfs.listStatus(new Path(args[1])); //獲取輸入文件夾內文件的個數，然後來設置NumReduceTasks Job job1 = new Job(conf1, "My_tdif_part1"); job1.setJarByClass(My_TDIF.class); job1.setMapperClass(Mapper_Part1.class); job1.setCombinerClass(Combiner_Part1.class); //combiner在本地執行，效率要高點。 job1.setReducerClass(Reduce_Part1.class); job1.setMapOutputKeyClass(Text.class); job1.setMapOutputValueClass(Text.class); job1.setOutputKeyClass(Text.class); job1.setOutputValueClass(Text.class); job1.setNumReduceTasks(p.length); job1.setPartitionerClass(MyPartitoner.class); //使用自定義MyPartitoner FileInputFormat.addInputPath(job1, new Path(args[1])); FileOutputFormat.setOutputPath(job1, tmp); job1.waitForCompletion(true); //part2---------------------------------------- Configuration conf2 = new Configuration(); Job job2 = new Job(conf2, "My_tdif_part2"); job2.setJarByClass(My_TDIF.class); job2.setMapOutputKeyClass(Text.class); job2.setMapOutputValueClass(Text.class); job2.setOutputKeyClass(Text.class); job2.setOutputValueClass(Text.class); job2.setMapperClass(Mapper_Part2.class); job2.setReducerClass(Reduce_Part2.class); //需要提醒下，我這裏沒有使用自定義Partitioner,默認的Partitioner會根據key來劃分，而我們正 //好需要這種方式來將所有文件中同一個單詞化爲同一個組，方便我們統計一個單詞在所以文件中出現的次數。 job2.setNumReduceTasks(p.length); FileInputFormat.setInputPaths(job2, tmp); FileOutputFormat.setOutputPath(job2, new Path(args[2])); job2.waitForCompletion(true); hdfs.delete(tmp, true); } }

最後，我再針對此代碼補充幾點。

1:因爲hadoop處理小文件的效率比較低，如果需要處理很多小文件，我推薦大家將這些小文件合併成一個SequenceFile，用文件名作爲key，內容作爲value，然後處理SequenceFile文件以提高效率。

2: 本代碼中還有很多可以改進的地方。比如：本代碼的最後輸出包含了很多中間數據，大家可以根據自己的需要刪除不需要的數據。

本代碼是根據輸入的文件數來確定ReduceTask數和最後的文件輸出數，大家也可以自己修改。最後文件的輸出內容也是無序的，如果大家需要可以參看本博的另一篇博文（http://blog.csdn.net/jackydai987/archive/2011/03/12/6244725.aspx）進行多文件輸出。

3:本文只是提供一個簡單的思路，如果代碼中有什麼不對的地方，請各位不吝指出，在下感激不盡。

運用hadoop計算TF-IDF

Shellcode 編碼、解碼

搭建Hadoop環境（在windows環境下用虛擬機虛擬兩個ubuntu系統進行搭建）

VC按鈕切換界面

VC下劃分窗口並固定

windows下安裝nutch

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結