源碼分析
package org.apache.nutch.crawl;
首先呢,InjectorJob類存在於包org.apache.nutch.crawl內。
public class InjectorJob extends NutchTool implements Tool
它擴展了NutchTool類並實現了Tool接口,即實現了NutchTool類的run(Map&lt;String, Object&gt;)方法和Tool接口的run(String[])方法;Tool是Hadoop util包中的接口。沒啥好說的。
接着呢,我們來看一看它的Mapper類和reducer類。
public static class UrlMapper extends
Mapper<LongWritable, Text, String, WebPage> {
private URLNormalizers urlNormalizers;
private int interval;
private float scoreInjected;
private URLFilters filters;
private ScoringFilters scfilters;
private long curTime;
@Override
protected void setup(Context context) throws IOException,
InterruptedException {
/**
* 一些準備工作。很多默認配置都在nutch根目錄下conf文件夾中的nutch-default.xml中。
*/
urlNormalizers = new URLNormalizers(context.getConfiguration(),
URLNormalizers.SCOPE_INJECT); /**規範化Url*/
interval = context.getConfiguration().getInt("db.fetch.interval.default",
2592000); /**兩次抓取同一個頁面之間的默認時間間隔,30天。*/
filters = new URLFilters(context.getConfiguration()); /**過濾不合法的Url*/
scfilters = new ScoringFilters(context.getConfiguration()); /**一個計算分值的類*/
scoreInjected = context.getConfiguration().getFloat("db.score.injected",
1.0f); /**被injector增加的新頁面的分值(score)*/
curTime = context.getConfiguration().getLong("injector.current.time",
System.currentTimeMillis()); /**注入(inject)的時間*/
}
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String url = value.toString().trim(); // value is line of text
if (url != null && (url.length() == 0 || url.startsWith("#"))) {
/* Ignore line that start with # */
return;
}
// if tabs : metadata that could be stored
// must be name=value and separated by \t
float customScore = -1f;
int customInterval = interval;
Map<String, String> metadata = new TreeMap<String, String>(); /**用來映射元數據的name和value值*/
if (url.indexOf("\t") != -1) {
String[] splits = url.split("\t");
url = splits[0];
for (int s = 1; s < splits.length; s++) { /**對於每一個被split的name value值*/
// find separation between name and value
int indexEquals = splits[s].indexOf("=");
if (indexEquals == -1) {
// skip anything without a =
continue;
}
String metaname = splits[s].substring(0, indexEquals);
String metavalue = splits[s].substring(indexEquals + 1);
/**
* 對於一個特定Url,用戶自定義的分值
* public static String nutchScoreMDName = "nutch.score";
*/
if (metaname.equals(nutchScoreMDName)) {
try {
customScore = Float.parseFloat(metavalue);
} catch (NumberFormatException nfe) {
}
}
/**
* 對於一個特定Url,用戶自定義的抓取間隔
* public static String nutchFetchIntervalMDName = "nutch.fetchInterval";
*/
else if (metaname.equals(nutchFetchIntervalMDName)) {
try {
customInterval = Integer.parseInt(metavalue);
} catch (NumberFormatException nfe) {
}
} else
metadata.put(metaname, metavalue);
}
}
try {
/**
* 這裏就是核心的規範化和過濾Url的過程了
*/
url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
url = filters.filter(url); // filter the url
} catch (Exception e) {
url = null;
}
if (url == null) {
context.getCounter("injector", "urls_filtered").increment(1); /**被過濾掉的Url數目加一*/
return;
} else { // if it passes
/**
* 如果Url通過,則將從該Url中所獲取的信息連同其他信息一同封裝於WebPage類中用於保存。
*/
String reversedUrl = TableUtil.reverseUrl(url); // collect it
WebPage row = WebPage.newBuilder().build();
row.setFetchTime(curTime);
row.setFetchInterval(customInterval);
// now add the metadata
Iterator<String> keysIter = metadata.keySet().iterator();
while (keysIter.hasNext()) {
String keymd = keysIter.next();
String valuemd = metadata.get(keymd);
row.getMetadata().put(new Utf8(keymd),
ByteBuffer.wrap(valuemd.getBytes()));
}
if (customScore != -1)
row.setScore(customScore);
else
row.setScore(scoreInjected);
try {
/**
* 當注入新頁面的時候,計算出一個新的初始值
*/
scfilters.injectedScore(url, row);
} catch (ScoringFilterException e) {
}
}
context.getCounter("injector", "urls_injected").increment(1); /**已注入Url數目加一*/
row.getMarkers()
.put(DbUpdaterJob.DISTANCE, new Utf8(String.valueOf(0)));
Mark.INJECT_MARK.putMark(row, YES_STRING);
/**
* Mapper的輸出:<String(reversedUrl), WebPage(row)>
*/
context.write(reversedUrl, row);
}
}
}
首先呢,細心的同學已經發現在存儲的時候它已經將Url通過調用
String reversedUrl = TableUtil.reverseUrl(url);
變成reversedUrl了。
這是爲了反轉一個Url的域名,從而方便於在HBase中的存儲。因爲在同一域名內的掃描能更快一些。舉個自帶的例子:
"http://bar.foo.com:8983/to/index.html?a=b" 變成了
"com.foo.bar:8983:http/to/index.html?a=b"
然後呢,在成員變量裏還出現了三個比較陌生的類:URLNormalizers, URLFilters, ScoringFilters.
這三個類呢,其實都是Nucth插件的接口。稍後我們將會另起一篇文章,利用URLNormalizers或者URLFilters類專門來說一說Nutch的插件機制。
這裏先簡單說一說ScoringFilters類吧,因爲大家可能被我註釋中頁面的“分值(score)”這個說法弄暈了。比如當你沒有設置conf/nutch-default.xml中的scoring.filter.order屬性值,那麼程序就會默認調用src/plugin中自帶的scoring-opic插件,用其中的org.apache.nutch.scoring.opic.OPICScoringFilter類作爲ScoringFilter類的實現。這其實就是Nutch內部的頁面評分機制OPIC算法的調用,即Online Page Importance Computation算法。它實現的參考文獻是這一篇論文:Adaptive On-Line Page Importance Computation.
那它是個Filter也就不難理解了——它計算出的分值會在後續的generate等環節用於排序與篩選頁面,評分過低的頁面便可能被過濾掉。
URLNormalizers類默認調用插件urlnormalizer-(pass|regex|basic).而什麼是將Url規範化呢?舉個例子:
urlnormalizer-basic就是用來——
remove dot segments in path: /./ or /../
remove default ports, e.g. 80 for protocol http://
URLFilters類默認調用插件urlfilter-regex. 而Nutch自帶五種過濾插件,分別爲:DomainURLFilter, RegexURLFilter, AutomatonURLFilter , PrefixURLFilter, SuffixURLFilter. 這5種過濾器的配置過濾規則的文件分別爲:domain-urlfilter.txt、regex-urlfilter.txt、automaton-urlfilter.txt、prefix-urlfilter.txt、suffix-urlfilter.txt。過濾器和過濾規則文件之間的關係同樣是通過nutch-default.xml來定義的。屬性urlfilter.order則定義了過濾器的應用順序,所有過濾器之間是“與”的關係:一個Url必須通過全部過濾器才會被保留。
接下來,我們看看它的Job。
// Build and run the inject MapReduce job (excerpt from InjectorJob.run()).
currentJob = new NutchJob(getConf(), "inject " + input); // NutchJob extends Hadoop's Job
FileInputFormat.addInputPath(currentJob, input); // seed-URL file(s) as input
currentJob.setMapperClass(UrlMapper.class);
currentJob.setMapOutputKeyClass(String.class); // key: reversed URL
currentJob.setMapOutputValueClass(WebPage.class); // value: the WebPage row
currentJob.setOutputFormatClass(GoraOutputFormat.class); // persist rows through Gora
DataStore<String, WebPage> store = StorageUtils.createWebStore(
currentJob.getConfiguration(), String.class, WebPage.class);
GoraOutputFormat.setOutput(currentJob, store, true); // bind the job to the Gora data store
currentJob.setReducerClass(Reducer.class); // identity Reducer — never used, see next line
currentJob.setNumReduceTasks(0); // zero reduce tasks: map output goes straight to the store
currentJob.waitForCompletion(true); // run the job and block until it finishes
由於此Job沒有reduce階段,結合上面的代碼,顯然可以看出Mapper的輸出被直接寫入到了Gora的dataStore中。
總結一下,InjectJob就是從input中讀入種子Urls,然後對其進行規範化,過濾,再進行評分。最後進行存儲。
實戰演練
http://nutch.apache.org/ nutch.score=0.172 nutch.fetchInterval=3600
result:
http://nutch.apache.org/ key: org.apache.nutch:http/
baseUrl: null
status: 0 (null)
fetchTime: 1440072184529
prevFetchTime: 0
fetchInterval: 3600
retriesSinceFetch: 0
modifiedTime: 0
prevModifiedTime: 0
protocolStatus: (null)
parseStatus: (null)
title: null
score: 0.172
marker _injmrk_ : y
marker dist : 0
reprUrl: null
metadata _csh_ : >0 �
參考文獻:
Nutch 1.0 源代碼分析[1] Injector
源碼:injectedScore()初讀
Nutch 1.3 學習筆記 11-1 頁面評分機制 OPIC