Inverted Indexing for Text Retrieval
Inverted Indexing
Baseline Algorithm
Main idea: map 的輸入爲文檔編號和文檔的內容,輸出爲 [詞, (文檔編號, 詞頻)];reduce 將同一個詞的所有文檔編號和詞頻聚集,然後按文檔編號排序,最後輸出的是按文檔編號由小到大排序的項。
Inverted Indexing: Revised Implementation
package InvertedIndexing;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.HashMap;
/**
* Created with IntelliJ IDEA.
* User: ubuntu
* Date: 13-11-22
* Time: 上午10:07
* To change this template use File | Settings | File Templates.
*/
public class RevisedInvertedIndexing extends Configured implements Tool {
/**
 * Configures and runs the inverted-indexing MapReduce job.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
 * @return 0 when the job succeeds, 1 when it fails, 2 when the arguments are missing
 * @throws Exception if the job cannot be set up or submitted
 */
@Override
public int run(String[] args) throws Exception {
    if (args == null || args.length < 2) {
        System.err.println("Usage: RevisedInvertedIndexing <input path> <output path>");
        ToolRunner.printGenericCommandUsage(System.err);
        return 2;
    }

    // Use the configuration injected through Configured/ToolRunner so that generic
    // command-line options (-D, -conf, -fs, ...) are honoured by the job.
    Configuration conf = getConf();
    if (conf == null) {
        conf = new Configuration();
    }

    Job job = new Job(conf, "RevisedInvertedIndexing");
    job.setJarByClass(RevisedInvertedIndexing.class);

    // Input: one record per line, keyed by the name of the file the line came from.
    job.setInputFormatClass(MyInputFormat.class);
    MyInputFormat.addInputPath(job, new Path(args[0]));

    // Output: plain text, one line per term.
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReduce.class);
    // Partition on the term only, so every (term, document) key of a term reaches one reducer.
    job.setPartitionerClass(MyPartitioner.class);

    job.setMapOutputKeyClass(Item.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
/**
 * Command-line entry point: hands the arguments to {@link ToolRunner} and exits the JVM
 * with the status code returned by the tool.
 *
 * @param args command-line arguments forwarded to {@code ToolRunner.run}
 * @throws Exception if the tool fails with an exception
 */
public static void main(String[] args) throws Exception {
    System.exit(ToolRunner.run(new RevisedInvertedIndexing(), args));
}
/**
 * Composite map-output key: a term together with the name of the document it occurs in.
 * <p>
 * Keys order by term first and by document name second. Both fields start out as the
 * empty string so that a freshly constructed key can be serialized, hashed and compared
 * without a {@link NullPointerException}.
 */
public static class Item implements WritableComparable<Item> {
    private String term;
    private String docName;

    /** Creates a key with an empty term and an empty document name. */
    public Item() {
        this.term = "";
        this.docName = "";
    }

    /** @return the term part of the key */
    public String getTerm() {
        return term;
    }

    /** @param term the term part of the key */
    public void setTerm(String term) {
        this.term = term;
    }

    /** @return the document-name part of the key */
    public String getDocName() {
        return docName;
    }

    /** @param docName the document-name part of the key */
    public void setDocName(String docName) {
        this.docName = docName;
    }

    /**
     * Orders by term, breaking ties by document name.
     *
     * @param that the key to compare with
     * @return a negative, zero or positive value as this key sorts before, equal to or after {@code that}
     */
    @Override
    public int compareTo(Item that) {
        int cmp = this.getTerm().compareTo(that.getTerm());
        if (cmp != 0) {
            return cmp;
        }
        return this.getDocName().compareTo(that.getDocName());
    }

    /** Serializes the term followed by the document name, each in modified UTF-8. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(term);
        out.writeUTF(docName);
    }

    /** Reads the fields back in the order {@link #write(DataOutput)} emitted them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.term = in.readUTF();
        this.docName = in.readUTF();
    }

    /** Order-sensitive combination of both fields, consistent with {@link #equals(Object)}. */
    @Override
    public int hashCode() {
        return 31 * term.hashCode() + docName.hashCode();
    }

    /**
     * Two keys are equal when both term and document name are equal.
     * Returns {@code false} for {@code null} and for objects of any other type
     * instead of failing with a {@link ClassCastException}.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof Item)) {
            return false;
        }
        Item that = (Item) obj;
        return this.getTerm().equals(that.getTerm())
                && this.getDocName().equals(that.getDocName());
    }

    /** @return the key rendered as {@code (term,docName)} */
    @Override
    public String toString() {
        return "(" + term + "," + docName + ")";
    }
}
/**
 * Partitions map output on the term only, so that every (term, document) key of the
 * same term is routed to the same reduce task.
 */
public static class MyPartitioner extends Partitioner<Item, LongWritable> {
    /**
     * @param term          the composite key; only its term part is hashed
     * @param i             the term frequency (ignored)
     * @param numPartitions the number of partitions
     * @return a partition index in {@code [0, numPartitions)}
     */
    @Override
    public int getPartition(Item term, LongWritable i, int numPartitions) {
        // String.hashCode() may be negative and Java's % keeps the sign of the dividend,
        // so clear the sign bit before taking the remainder. No extra multiplier is applied:
        // scaling the hash by a constant that shares a factor with numPartitions would
        // collapse all keys onto a few partitions.
        return (term.getTerm().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}
/**
 * Mapper that counts term occurrences with in-mapper combining: counts are accumulated
 * in memory for the whole task and emitted once, in {@link #cleanup}, as
 * ((term, document name), count) pairs.
 * <p>
 * The document name is taken from the key of the first record seen, so all records of
 * a task are attributed to that one document.
 */
public static class MyMapper extends Mapper<Text, Text, Item, LongWritable> {
    private final Item outKey = new Item();
    private final LongWritable outValue = new LongWritable();
    /** Term -> number of occurrences seen by this task so far. */
    private HashMap<String, Long> hashMap;
    private String docName;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        hashMap = new HashMap<String, Long>();
    }

    /**
     * Splits one line into space-separated words, strips surrounding punctuation and
     * counts every non-empty result.
     *
     * @param key     the document name
     * @param value   one line of the document
     * @param context unused here; output is deferred to {@link #cleanup}
     */
    @Override
    protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
        if (docName == null) {
            docName = key.toString();
        }
        for (String word : value.toString().split(" ")) {
            String term = normalize(word);
            // Consecutive spaces and punctuation-only tokens normalize to "", which is not a term.
            if (!term.isEmpty()) {
                addItem(term);
            }
        }
    }

    /** Removes one leading double quote and any trailing quotes, periods, commas or spaces. */
    private static String normalize(String word) {
        if (word.startsWith("\"")) {
            word = word.substring(1);
        }
        while (word.endsWith("\"") || word.endsWith(".") || word.endsWith(",") || word.endsWith(" ")) {
            word = word.substring(0, word.length() - 1);
        }
        return word;
    }

    /** Increments the in-memory count of {@code word}. */
    private void addItem(String word) {
        Long count = hashMap.get(word);
        hashMap.put(word, count == null ? 1L : count + 1);
    }

    /** Emits one ((term, document name), count) pair per distinct term counted by this task. */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        for (java.util.Map.Entry<String, Long> entry : hashMap.entrySet()) {
            outKey.setTerm(entry.getKey());
            outKey.setDocName(docName);
            outValue.set(entry.getValue());
            context.write(outKey, outValue);
        }
    }
}
/**
 * Reducer that assembles the postings of each term into a single output line of the
 * form {@code term <TAB> doc1-count1 doc2-count2 ...}.
 * <p>
 * Each call to {@link #reduce} handles one (term, document) key. The postings of a term
 * are buffered across calls and written out when the term changes, or in
 * {@link #cleanup} for the last term. This relies on all keys of a term arriving
 * consecutively at the same reducer.
 */
public static class MyReduce extends Reducer<Item, LongWritable, Text, Text> {
    /** Term whose postings are currently buffered; {@code null} until the first key arrives. */
    private String preTerm = null;
    private final StringBuilder outValueString = new StringBuilder();
    private final Text outKey = new Text();
    private final Text outValue = new Text();

    /**
     * Sums the partial counts of one (term, document) key and appends the posting to the buffer.
     *
     * @param key     the (term, document name) pair
     * @param values  partial term frequencies for that pair
     * @param context used to emit the previous term's postings when the term changes
     */
    @Override
    protected void reduce(Item key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        String term = key.getTerm();
        if (preTerm != null && !term.equals(preTerm)) {
            flush(context);
        }
        preTerm = term;

        long sum = 0;
        for (LongWritable value : values) {
            sum += value.get();
        }

        if (outValueString.length() > 0) {
            outValueString.append(' ');
        }
        outValueString.append(key.getDocName()).append('-').append(sum);
    }

    /** Writes the postings of the last term, if this reducer received any key at all. */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // A reducer that was assigned no keys has nothing buffered; writing would pass
        // a null term to Text.set and fail the task.
        if (preTerm != null) {
            flush(context);
        }
    }

    /** Emits the buffered postings under {@code preTerm} and clears the buffer. */
    private void flush(Context context) throws IOException, InterruptedException {
        outKey.set(preTerm);
        outValue.set(outValueString.toString());
        context.write(outKey, outValue);
        outValueString.setLength(0);
    }
}
/**
 * File input format whose records are produced by {@link MyRecordReader}.
 */
public static class MyInputFormat extends FileInputFormat<Text, Text> {
    /**
     * Supplies a new, not yet initialized {@link MyRecordReader}; the arguments are not used here.
     *
     * @param split   the split to be read
     * @param context the task attempt context
     * @return a fresh record reader
     */
    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        final RecordReader<Text, Text> reader = new MyRecordReader();
        return reader;
    }
}
/**
 * Record reader that wraps a {@link LineRecordReader}: the value of each record is the
 * line returned by the wrapped reader, and the key is the name of the file the split
 * belongs to (set once in {@link #initialize}).
 */
public static class MyRecordReader extends RecordReader<Text, Text> {
    private LineRecordReader lineRecordReader;
    private FileSplit fileSplit;
    private Text key;
    private Text value;

    public MyRecordReader() {
        key = new Text();
        value = new Text();
    }

    /**
     * Stores the split's file name as the key and initializes the wrapped line reader.
     *
     * @param split   the split to read; must be a {@link FileSplit}
     * @param context the task attempt context, passed on to the wrapped reader
     */
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        fileSplit = (FileSplit) split;
        key.set(fileSplit.getPath().getName());
        lineRecordReader = new LineRecordReader();
        lineRecordReader.initialize(split, context);
    }

    /** Advances the wrapped reader and picks up its current line as the value. */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        boolean hasNext = lineRecordReader.nextKeyValue();
        value = lineRecordReader.getCurrentValue();
        return hasNext;
    }

    /** @return the file name of the split; the same for every record */
    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    /** @return the line most recently read by the wrapped reader */
    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    /** @return the progress reported by the wrapped reader */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return lineRecordReader.getProgress();
    }

    /** Closes the wrapped line reader so the underlying input stream is released. */
    @Override
    public void close() throws IOException {
        // The wrapped reader is null when initialize was never called.
        if (lineRecordReader != null) {
            lineRecordReader.close();
        }
    }
}
}
注意:自定義的 RecordReader 採用了裝飾模式:它持有一個 LineRecordReader,只用 LineRecordReader 獲取文本內容作爲 value,而 key 則是通過 split 獲取的文件名(這裏沒有使用文檔編號,而是直接採用了文件名)。Mapper 中使用了 in-mapper combining。