SemiJoin,也叫半連接,是從分佈式數據庫中借鑑過來的方法。它的產生動機是:對於reduce side join,跨機器的數據傳輸量非常大,這成了join操作的一個瓶頸,如果能夠在map端過濾掉不會參加join操作的數據,則可以大大節省網絡IO。實現方法很簡單:選取一個小表,假設是File1,將其參與join的key抽取出來,保存到文件File3中,File3文件一般很小,可以放到內存中。在map階段,使用DistributedCache將File3複製到各個TaskTracker上,然後將File2中不在File3中的key對應的記錄過濾掉,剩下的reduce階段的工作與reduce side join相同。此實例中,還是採用第一個實例中的數據,假如我們只保留sex爲1的user,並將這些user的key存於user_id文件中(注意:每行的數據一定要帶上雙引號啊),如下:
"ID"
"1"
"2"
"3"
"5"
"6"
"8"
"9"
完整代碼如下,此實例中我們採用新的API來寫:
/**
 * Semi-join MapReduce job.
 *
 * <p>A small file of join keys is shipped to every map task through the
 * {@link DistributedCache}. The mapper keeps only the input records whose
 * first comma-separated field appears in that key file, and the reducer
 * concatenates all surviving values of a key into one comma-separated line.
 *
 * <p>Command line: {@code <key file> <input path> <output path>}.
 */
public class SemiJoin extends Configured implements Tool
{
    /**
     * Mapper that filters input records against the cached key set.
     *
     * <p>Matching is an exact string comparison between a line of the key file
     * and the first field of the record, so both must be formatted identically
     * (including any surrounding quotes).
     */
    public static class MapClass extends Mapper<LongWritable, Text, Text, Text>
    {
        // Substring used to recognise the key file among the localized cache files.
        private static final String USER_ID_FILE_MARKER = "user_id";

        // Every line read from the cached key file(s).
        private final Set<String> userIds = new HashSet<String>();
        // Reused output holders, to avoid allocating a Text per record.
        private final Text outKey = new Text();
        private final Text outValue = new Text();

        /**
         * Loads the cached key file(s) into memory before any record is mapped.
         *
         * @param context task context used to locate the localized cache files
         * @throws IOException if no cache files are available or a key file
         *         cannot be read; failing here is deliberate, because an empty
         *         key set would silently filter out every record
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException
        {
            Path[] paths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
            if (paths == null)
            {
                throw new IOException(
                        "No localized cache files available; cannot load the user_id key file");
            }
            for (Path path : paths)
            {
                if (path.toString().contains(USER_ID_FILE_MARKER))
                {
                    loadUserIds(path.toString());
                }
            }
        }

        /**
         * Reads one local key file line by line into {@code userIds}.
         * The reader is closed before returning, so no handle leaks when
         * several cache files match.
         */
        private void loadUserIds(String localFile) throws IOException
        {
            BufferedReader in = new BufferedReader(new FileReader(localFile));
            try
            {
                String userId;
                while ((userId = in.readLine()) != null)
                {
                    userIds.add(userId);
                }
            }
            finally
            {
                in.close();
            }
        }

        /**
         * Emits {@code (field0, field1)} for records whose first field is in
         * the cached key set; all other records are dropped.
         *
         * @param key     input key (unused)
         * @param value   one input line, fields separated by {@code ","}
         * @param context sink for the surviving records
         */
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException
        {
            String[] fields = value.toString().split(",");
            // A line without at least two fields has no value to emit; skip it
            // instead of failing the whole task on an index error.
            if (fields.length < 2)
            {
                return;
            }
            if (userIds.contains(fields[0]))
            {
                outKey.set(fields[0]);
                outValue.set(fields[1]);
                context.write(outKey, outValue);
            }
        }
    }

    /**
     * Reducer that joins all values of a key into one comma-separated string.
     */
    public static class Reduce extends Reducer<Text, Text, Text, Text>
    {
        // Reused output holder.
        private final Text joined = new Text();

        /**
         * Writes {@code key -> v1,v2,...,vn} in iteration order of {@code values}.
         */
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException
        {
            StringBuilder sb = new StringBuilder();
            for (Text val : values)
            {
                // Put the separator before every element except the first, so
                // there is no trailing comma to strip afterwards.
                if (sb.length() > 0)
                {
                    sb.append(",");
                }
                sb.append(val.toString());
            }
            joined.set(sb.toString());
            context.write(key, joined);
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param args {@code <key file> <input path> <output path>}, optionally
     *             preceded by generic Hadoop options
     * @return 0 on success, 1 if the job failed, 2 if too few arguments were given
     */
    @Override
    public int run(String[] args) throws Exception
    {
        Job job = new Job(getConf(), "SemiJoin");
        job.setJarByClass(SemiJoin.class);
        job.setMapperClass(MapClass.class);
        job.setReducerClass(Reduce.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        String[] otherArgs = new GenericOptionsParser(job.getConfiguration(), args).getRemainingArgs();
        if (otherArgs.length < 3)
        {
            System.err.println("Usage: SemiJoin <user_id file> <input path> <output path>");
            return 2;
        }

        // The first argument is the key file to distribute to every map task.
        DistributedCache.addCacheFile(new Path(otherArgs[0]).toUri(), job.getConfiguration());
        FileInputFormat.addInputPath(job, new Path(otherArgs[1]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Command-line entry point; exits with the status returned by {@link #run}.
     */
    public static void main(String[] args) throws Exception
    {
        int res = ToolRunner.run(new Configuration(), new SemiJoin(), args);
        System.exit(res);
    }
}