同樣:
package org.apache.nutch.crawl;
Job
先從它的Job開始入“眼”:
// Create the job; its name is "generate: " followed by the configured batch id.
currentJob = new NutchJob(getConf(), "generate: " + getConf().get(BATCH_ID));
// WebPage fields handed to the mapper job initialization below.
Collection<WebPage.Field> fields = getFields(currentJob);
/**
* Signature of the helper called below, reproduced for reference:
*
* public static <K, V> void initMapperJob(
* Job job,
* Collection<WebPage.Field> fields,
* Class<K> outKeyClass,
* Class<V> outValueClass,
* Class<? extends GoraMapper<String, WebPage, K, V>> mapperClass,
* Class<? extends Partitioner<K, V>> partitionerClass,
* boolean reuseObjects)
*/
// Map side: output key SelectorEntry, output value WebPage, mapped by
// GeneratorMapper and partitioned by SelectorEntryPartitioner; reuseObjects is true.
StorageUtils.initMapperJob(
currentJob,
fields,
SelectorEntry.class,
WebPage.class,
GeneratorMapper.class,
SelectorEntryPartitioner.class,
true);
// Reduce side: GeneratorReducer.
StorageUtils.initReducerJob(currentJob, GeneratorReducer.class);
// NOTE(review): presumably Hadoop's Job.waitForCompletion(verbose), i.e. submit and
// block until the job finishes - confirm against NutchJob.
currentJob.waitForCompletion(true);
可以看出來,它的Map輸出爲<SelectorEntry, WebPage>對。WebPage大家已經瞭解,就是對一個頁面本身的信息以及其他信息(比如抓取時間)的封裝。那麼SelectorEntry這個Class長什麼樣呢?
SelectorEntry
/**
 * Composite key pairing a URL with the score it is sorted by.
 *
 * <p>NOTE(review): this listing shows only the ordering and equality members.
 * The serialization methods that {@code WritableComparable} requires are not
 * shown here - confirm against the full class.
 */
public static class SelectorEntry implements
    WritableComparable<SelectorEntry> {

  String url;
  float score;

  /**
   * Orders entries by score, highest first; entries whose scores are equal
   * are ordered by the natural (lexicographic) order of their URLs.
   *
   * <p>Scores are compared with {@link Float#compare(float, float)} so that
   * the ordering is total even for NaN scores and agrees with
   * {@link #equals(Object)}, which compares scores bit-wise.
   *
   * @param se the entry to compare against
   * @return a negative value, zero, or a positive value as this entry sorts
   *         before, together with, or after {@code se}
   */
  public int compareTo(SelectorEntry se) {
    // Arguments are swapped on purpose: a higher score must sort first.
    int byScore = Float.compare(se.score, score);
    if (byScore != 0) {
      return byScore;
    }
    return url.compareTo(se.url);
  }

  /**
   * Hash code derived from the URL and the bit pattern of the score,
   * consistent with {@link #equals(Object)}.
   */
  @Override
  public int hashCode() {
    final int prime = 31;
    int result = 1;
    result = prime * result + url.hashCode();
    result = prime * result + Float.floatToIntBits(score);
    return result;
  }

  /**
   * Two entries are equal when both the URL and the bit pattern of the score
   * match.
   *
   * @param obj the object to compare with; may be {@code null} or of another
   *            type, in which case the result is {@code false}
   * @return {@code true} if {@code obj} is an equal {@code SelectorEntry}
   */
  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    // Rejects null as well as foreign types instead of throwing.
    if (!(obj instanceof SelectorEntry)) {
      return false;
    }
    SelectorEntry other = (SelectorEntry) obj;
    return url.equals(other.url)
        && Float.floatToIntBits(score) == Float.floatToIntBits(other.score);
  }
}
首先,由於SelectorEntry類是對WritableComparable接口的一個實現,它必須體現出Writable和Comparable這兩大特性。關於Writable,自然就是readFields和write這兩個方法(上面的代碼片段中省略了);而Comparable,自然必須得有compareTo這個方法。又由於SelectorEntry是作爲鍵(key)出現的複合鍵,它要能夠讓reduce進行去重,這個功能是靠覆寫(override)hashCode和equals方法來實現的。這個不懂的同學可以去看看Hadoop MapReduce怎麼定製其中的Partitioner、SortComparator和GroupingComparator類,其實看MapReduce自帶的SecondarySort這個實例就明白了。
SelectorEntryPartitioner
接着我們來看它的SelectorEntryPartitioner類:
/**
 * Partitioner for {@code <SelectorEntry, WebPage>} pairs that decides the
 * partition from the entry's URL alone.
 *
 * <p>NOTE(review): the class declares {@code Configurable}, but the
 * corresponding accessor methods are not shown in this listing.
 */
public static class SelectorEntryPartitioner extends
    Partitioner<SelectorEntry, WebPage> implements Configurable {

  /** URL-based partitioner that performs the actual computation. */
  private URLPartitioner partitioner = new URLPartitioner();

  /**
   * Delegates to the wrapped {@code URLPartitioner}, passing only the URL of
   * the key; the page value is not consulted.
   *
   * @param selectorEntry the map-output key
   * @param page the map-output value (unused)
   * @param numReduces the number of reduce tasks
   * @return the partition index computed by the wrapped partitioner
   */
  @Override
  public int getPartition(SelectorEntry selectorEntry, WebPage page,
      int numReduces) {
    final String entryUrl = selectorEntry.url;
    return partitioner.getPartition(entryUrl, numReduces);
  }
}
/**
 * Computes the reduce partition for a URL.
 *
 * <p>The URL is normalized first. The partition key is then the hash of the
 * host, of the domain name, or of the resolved IP address, depending on the
 * partition mode. If the URL cannot be parsed, or the IP lookup fails, the
 * hash of the URL string is used instead.
 *
 * <p>NOTE(review): presumably this is {@code URLPartitioner.getPartition}, the
 * method {@code SelectorEntryPartitioner} delegates to - confirm.
 *
 * @param urlString the URL to partition
 * @param numReduceTasks the number of reduce tasks
 * @return a partition index in {@code [0, numReduceTasks - 1]}
 */
public int getPartition(String urlString, int numReduceTasks) {
  if (numReduceTasks == 1) {
    // this check can be removed when we use Hadoop with MAPREDUCE-1287
    return 0;
  }

  int hash;
  URL parsed = null;
  try {
    urlString = normalizers.normalize(urlString,
        URLNormalizers.SCOPE_PARTITION);
    // Default key: hash of the normalized URL string.
    hash = urlString.hashCode();
    parsed = new URL(urlString);
  } catch (MalformedURLException e) {
    LOG.warn("Malformed URL: '" + urlString + "'");
    hash = urlString.hashCode();
  }

  if (parsed != null) {
    if (mode.equals(PARTITION_MODE_HOST)) {
      // Host mode: key on the URL's host.
      hash = parsed.getHost().hashCode();
    } else if (mode.equals(PARTITION_MODE_DOMAIN)) {
      // Domain mode: key on the URL's domain name.
      hash = URLUtil.getDomainName(parsed).hashCode();
    } else {
      // Any other mode is treated as IP mode: key on the resolved address.
      try {
        InetAddress resolved = InetAddress.getByName(parsed.getHost());
        hash = resolved.getHostAddress().hashCode();
      } catch (UnknownHostException e) {
        // Lookup failed: keep the URL-string hash computed above.
        GeneratorJob.LOG.info("Couldn't find IP for host: " + parsed.getHost());
      }
    }
  }

  // make hosts wind up in different partitions on different runs
  hash ^= seed;

  // Clear the sign bit so the remainder is never negative.
  return (hash & Integer.MAX_VALUE) % numReduceTasks;
}
這個Partitioner按照不同的分區模式(host、domain或ip)將url分到不同的reduce任務中。假如分區模式是host,它就會將同一host的url分到同一個task中去。
GeneratorMapper
/**
 * Map phase of the generate job: reads {@code <reversedUrl, WebPage>} rows and
 * emits {@code <SelectorEntry, WebPage>} for every page that passes the
 * checks in {@link #map}.
 *
 * <p>NOTE(review): the initialization of the fields below is not part of this
 * listing.
 */
public class GeneratorMapper extends
    GoraMapper<String, WebPage, SelectorEntry, WebPage> {

  /** URL filter plugins. */
  private URLFilters filters;
  /** URL normalizer plugins. */
  private URLNormalizers normalizers;
  /** Whether URLs are run through {@link #filters}. */
  private boolean filter;
  /** Whether URLs are run through {@link #normalizers}. */
  private boolean normalise;
  /** Fetch schedule plugin that decides whether a page is due. */
  private FetchSchedule schedule;
  /** Scoring plugins that supply the sort value. */
  private ScoringFilters scoringFilters;
  /** Generation time handed to the fetch schedule. */
  private long curTime;
  /** Output key instance, reused for every emitted record. */
  private SelectorEntry entry = new SelectorEntry();
  /** Largest accepted distance marker value; -1 means unlimited. */
  private int maxDistance;

  /**
   * Emits the page keyed by (url, sort value) unless it is rejected.
   *
   * <p>A page is dropped when it already carries the generate mark, when its
   * distance marker exceeds {@link #maxDistance}, when normalization or
   * filtering rejects the URL or throws, or when the fetch schedule says it
   * should not be fetched.
   *
   * @param reversedUrl the row key (URL in reversed form)
   * @param page the stored page
   * @param context the map context the output is written to
   */
  @Override
  public void map(String reversedUrl, WebPage page, Context context)
      throws IOException, InterruptedException {
    String url = TableUtil.unreverseUrl(reversedUrl);

    // Skip pages that were already generated or that lie too far away.
    final boolean alreadyGenerated = Mark.GENERATE_MARK.checkMark(page) != null;
    if (alreadyGenerated || isBeyondMaxDistance(page)) {
      return;
    }

    // If filtering is on don't generate URLs that don't pass URLFilters
    boolean rejected;
    try {
      if (normalise) {
        url = normalizers.normalize(url,
            URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
      }
      rejected = filter && filters.filter(url) == null;
    } catch (URLFilterException e) {
      rejected = true;
    } catch (MalformedURLException e) {
      rejected = true;
    }
    if (rejected) {
      return;
    }

    // check fetch schedule
    // (the original note states DefaultFetchSchedule is used by default)
    if (!schedule.shouldFetch(url, page, curTime)) {
      return;
    }

    // The page's own score is the sort value unless a scoring filter
    // supplies a different one.
    float sortValue = page.getScore();
    try {
      sortValue = scoringFilters.generatorSortValue(url, page, sortValue);
    } catch (ScoringFilterException e) {
      // ignore
    }

    entry.set(url, sortValue);
    context.write(entry, page);
  }

  /**
   * Tells whether the page's distance marker exceeds {@link #maxDistance}.
   * Pages without a distance marker are never considered too far.
   */
  private boolean isBeyondMaxDistance(WebPage page) {
    if (maxDistance <= -1) {
      // -1 if unlimited.
      return false;
    }
    CharSequence marker = page.getMarkers().get(DbUpdaterJob.DISTANCE);
    if (marker == null) {
      return false;
    }
    return Integer.parseInt(marker.toString()) > maxDistance;
  }
}
該Mapper類將Injector步驟中注入的Urls作爲輸入,做簡單的篩選後輸出<SelectorEntry, WebPage>對,作爲Reducer的輸入。
我們可以看到,Job中的StorageUtils.initMapperJob()又調用了GoraMapper.initMapperJob()。這個方法中有一句關鍵的話:
GoraMapper
// Set the job's input via GoraInputFormat, passing the query, the data store
// and the reuseObjects flag.
GoraInputFormat.setInput(job, query, dataStore, reuseObjects);
即設置input是從GoraInputFormat中來的。
GeneratorReducer
接下來我們看一看GeneratorReducer類:
/**
 * Reduce phase of the generate job: marks the selected pages with the batch
 * id and writes them out keyed by reversed URL, subject to a per-reducer
 * limit and an optional per-host/per-domain cap.
 */
public class GeneratorReducer extends
    GoraReducer<SelectorEntry, WebPage, String, WebPage> {

  /** Maximum number of pages this reducer emits. */
  private long limit;
  /** Per-host/domain cap; only enforced when greater than zero. */
  private long maxCount;
  /** Pages emitted so far; static, so shared by all instances in one JVM. */
  protected static long count = 0;
  /** Count by domain name when true, by host otherwise. */
  private boolean byDomain = false;
  /** Pages admitted so far for each host or domain. */
  private Map<String, Integer> hostCountMap = new HashMap<String, Integer>();
  /** Batch id stamped on every emitted page. */
  private Utf8 batchId;

  /**
   * Emits the pages of one key until the overall limit or the cap of the
   * key's host/domain is reached; both conditions end the call.
   *
   * @param key the (url, score) entry
   * @param values the pages stored under that key
   * @param context the reduce context the output is written to
   */
  @Override
  protected void reduce(SelectorEntry key, Iterable<WebPage> values,
      Context context) throws IOException, InterruptedException {
    for (WebPage candidate : values) {
      if (count >= limit) {
        return;
      }
      if (maxCount > 0 && !claimHostSlot(key.url)) {
        return;
      }

      // Mark the page as generated for this batch.
      Mark.GENERATE_MARK.putMark(candidate, batchId);
      candidate.setBatchId(batchId);

      try {
        String rowKey = TableUtil.reverseUrl(key.url);
        context.write(rowKey, candidate);
      } catch (MalformedURLException e) {
        context.getCounter("Generator", "MALFORMED_URL").increment(1);
        continue;
      }

      context.getCounter("Generator", "GENERATE_MARK").increment(1);
      count++;
    }
  }

  /**
   * Reserves one slot for the URL's host (or domain, when counting by
   * domain).
   *
   * @return {@code false} if that host/domain has already reached
   *         {@link #maxCount}; otherwise records the slot and returns
   *         {@code true}
   */
  private boolean claimHostSlot(String url) throws IOException {
    final String bucket = byDomain
        ? URLUtil.getDomainName(url)
        : URLUtil.getHost(url);
    final Integer recorded = hostCountMap.get(bucket);
    final int used = (recorded == null) ? 0 : recorded.intValue();
    if (used >= maxCount) {
      return false;
    }
    hostCountMap.put(bucket, used + 1);
    return true;
  }

  /**
   * Reads the limit, the per-host cap, the batch id and the count mode from
   * the job configuration.
   */
  @Override
  protected void setup(Context context) throws IOException,
      InterruptedException {
    final Configuration conf = context.getConfiguration();

    // The overall top-N is split evenly across the reduce tasks; an unset
    // top-N means no limit.
    final long topN = conf.getLong(GeneratorJob.GENERATOR_TOP_N, Long.MAX_VALUE);
    limit = (topN == Long.MAX_VALUE)
        ? Long.MAX_VALUE
        : topN / context.getNumReduceTasks();

    maxCount = conf.getLong(GeneratorJob.GENERATOR_MAX_COUNT, -2);
    batchId = new Utf8(conf.get(GeneratorJob.BATCH_ID));

    final String countMode = conf.get(GeneratorJob.GENERATOR_COUNT_MODE,
        GeneratorJob.GENERATOR_COUNT_VALUE_HOST);
    if (countMode.equals(GeneratorJob.GENERATOR_COUNT_VALUE_DOMAIN)) {
      byDomain = true;
    }
  }
}
可以看出來,每個Reduce任務只輸出前limit個<String(reversedUrl), WebPage(page)>對。這樣分配Reduce的任務並不一定完全均勻,但是也已經可以了。
GoraReducer
/**
* Initializes the Reducer, and sets output parameters for the job.
* @param dataStore the datastore as the output /**the most important one.*/
*/
public static <K1, V1, K2, V2 extends Persistent>
void initReducerJob(
Job job,
DataStore<K2,V2> dataStore,
Class<? extends GoraReducer<K1, V1, K2, V2>> reducerClass,
boolean reuseObjects) {
GoraOutputFormat.setOutput(job, dataStore, reuseObjects);
job.setReducerClass(reducerClass);
}
總結一下,GeneratorJob就是從數據庫中產生要抓取的Top N頁面放到抓取隊列(fetchlist)中去。
References
1. Nutch 2.0 之 抓取流程簡單分析
2. Nutch2 之 GeneratorJob