同樣的:
package org.apache.nutch.fetcher;
Job
先看它的Job是個什麼情況:
// Create the job named "fetch" from the current configuration.
currentJob = new NutchJob(getConf(), "fetch");
// for politeness, don't permit parallel execution of a single task
/**
* Turn off speculative execution for the reduce phase. With speculative
* execution, when a task runs slowly the JobTracker starts a duplicate task to
* do the same work, and whichever of the two finishes first gets the other one
* killed. For politeness a single task must not run concurrently, i.e. the
* same page must not be fetched by several workers at the same time.
*/
currentJob.setReduceSpeculativeExecution(false);
Collection<WebPage.Field> fields = getFields(currentJob);
MapFieldValueFilter<String, WebPage> batchIdFilter = getBatchIdFilter(batchId);
// Register FetcherMapper together with its IntWritable/FetchEntry types and
// FetchEntryPartitioner (exact argument meaning is defined by
// StorageUtils.initMapperJob, which is not shown here).
StorageUtils.initMapperJob(currentJob, fields, IntWritable.class,
FetchEntry.class, FetcherMapper.class, FetchEntryPartitioner.class,
batchIdFilter, false);
StorageUtils.initReducerJob(currentJob, FetcherReducer.class);
if (numTasks == null || numTasks < 1) { /** no usable reducer count was supplied: use "mapred.map.tasks", or keep the job's current reducer count if that key is unset */
currentJob.setNumReduceTasks(currentJob.getConfiguration().getInt(
"mapred.map.tasks", currentJob.getNumReduceTasks()));
} else {
currentJob.setNumReduceTasks(numTasks);
}
currentJob.waitForCompletion(true); /** run the job */
FetchEntry
/**
 * Pairs a key with its WebPage; this is the value type emitted by
 * FetcherMapper and consumed by the reducer side. Only the fields are shown
 * in this excerpt (the Writable methods are omitted).
 */
public class FetchEntry extends Configured implements Writable {
// The mapper's input key; callers pass it to TableUtil.unreverseUrl, so it is
// presumably the reversed URL of the page.
private String key;
// The page record that belongs to the key.
private WebPage page;
}
其中的key就是reversedUrl,沒啥說的。
FetchEntryPartitioner
/**
 * Chooses the reduce partition for a {@link FetchEntry} from the URL it
 * carries rather than from the map output key: the entry's key is turned back
 * into a plain URL and handed to a {@link URLPartitioner}.
 */
public static class FetchEntryPartitioner extends
    Partitioner<IntWritable, FetchEntry> implements Configurable {

  /** Does the actual URL-to-partition mapping. */
  private URLPartitioner partitioner = new URLPartitioner();

  /**
   * @param intWritable the map output key; not used for partitioning
   * @param fetchEntry  the entry whose key is un-reversed into a URL
   * @param numReduces  number of reduce partitions
   * @return the partition chosen by the URL partitioner for the entry's URL
   */
  @Override
  public int getPartition(IntWritable intWritable, FetchEntry fetchEntry,
      int numReduces) {
    final String plainUrl = TableUtil.unreverseUrl(fetchEntry.getKey());
    return partitioner.getPartition(plainUrl, numReduces);
  }
}
getPartition同樣調用的是GeneratorJob中GeneratorMapper的getPartition方法。
FetcherMapper
下面我們看一看它的Mapper類:
/**
 * Map side of the fetch job. Selects the pages that should be fetched and
 * emits each of them under a random integer key.
 */
public static class FetcherMapper extends
    GoraMapper<String, WebPage, IntWritable, FetchEntry> {

  /** Value of RESUME_KEY: when true, pages that already carry a fetch mark are skipped. */
  private boolean shouldContinue;
  /** Batch id read from the configuration (all batches when unset). */
  private Utf8 batchId;
  /** Source of the random output keys. */
  private Random random = new Random();

  /** Reads the batch id and the resume flag from the job configuration. */
  @Override
  protected void setup(Context context) {
    final Configuration jobConf = context.getConfiguration();
    batchId = new Utf8(
        jobConf.get(GeneratorJob.BATCH_ID, Nutch.ALL_BATCH_ID_STR));
    shouldContinue = jobConf.getBoolean(RESUME_KEY, false);
  }

  /**
   * Emits the page unless it has no generate mark, or the job is resuming and
   * the page already has a fetch mark.
   */
  @Override
  protected void map(String key, WebPage page, Context context)
      throws IOException, InterruptedException {
    if (Mark.GENERATE_MARK.checkMark(page) != null) {
      // only consulted for pages that passed the generate-mark check
      final boolean alreadyFetched = shouldContinue
          && Mark.FETCH_MARK.checkMark(page) != null;
      if (!alreadyFetched) {
        // random key in [0, 65535]
        final IntWritable randomKey = new IntWritable(random.nextInt(65536));
        final FetchEntry entry = new FetchEntry(context.getConfiguration(),
            key, page);
        context.write(randomKey, entry);
      }
    }
  }
}
FetcherReducer
這個Job中最重要的大概就屬它的Reducer類了,其中用到了生產者/消費者模型。一個生產者對應於多個消費者。對於該模型,這裏是其理論部分,這裏是其實踐部分。
好了,大致瞭解了該模型之後,我們來看看該Reducer類中對應的生產者和消費者究竟是誰。
首先呢,該Reducer類中一共含有五個其他內部類,分別是FetchItem, FetchItemQueue, FetchItemQueues, FetcherThread和QueueFeeder。其實呢,FetchItem就是要被抓取的對象,FetchItemQueue裏裝的就是來自於同一個host Id(可能是一個proto-hostname或者proto-domainname或者一個proto-IP對)的FetchItem。FetchItemQueues就是將不同的host Id來的、裝進不同FetchItemQueue的那些FetchItem進行統一管理。所以將來FetchItemQueues的對象就是臨界區。QueueFeeder,顧名思義,是生產者,而FetcherThread則就是其中的消費者。
下面我們來具體看一看。
FetchItem
/**
 * A single URL scheduled for fetching, together with its page record and the
 * ID of the queue it belongs to.
 */
private static class FetchItem {
  WebPage page;
  String queueID;
  String url;
  /** The same address as {@link #url}, parsed into a URL object. */
  URL u;

  public FetchItem(String url, WebPage page, URL u, String queueID) {
    this.page = page;
    this.url = url;
    this.u = u;
    this.queueID = queueID;
  }

  /**
   * Create an item. Queue id will be created based on <code>queueMode</code>
   * argument, either as a protocol + hostname pair, protocol + IP address
   * pair or protocol+domain pair.
   *
   * @param url       the URL to fetch
   * @param page      the page record for the URL
   * @param queueMode one of the FetchItemQueues queue modes; anything that is
   *                  neither the IP nor the domain mode is treated as "by host"
   * @return the new item, or <code>null</code> when the URL cannot be parsed
   *         or, in IP mode, when the host cannot be resolved
   */
  public static FetchItem create(String url, WebPage page, String queueMode) {
    String queueID;
    URL u = null;
    try {
      u = new URL(url);
    } catch (final Exception e) {
      // unparsable URL: there is nothing to queue
      return null;
    }
    // Locale.ROOT keeps the lower-casing independent of the JVM's default
    // locale (e.g. under a Turkish locale "I" would otherwise become a
    // dotless "ı" and change the queue ID).
    final String proto = u.getProtocol().toLowerCase(java.util.Locale.ROOT);
    String host;
    if (FetchItemQueues.QUEUE_MODE_IP.equalsIgnoreCase(queueMode)) {
      // by IP: resolve the host name to its address
      try {
        final InetAddress addr = InetAddress.getByName(u.getHost());
        host = addr.getHostAddress();
      } catch (final UnknownHostException e) {
        return null;
      }
    } else if (FetchItemQueues.QUEUE_MODE_DOMAIN.equalsIgnoreCase(queueMode)) {
      // by domain
      host = URLUtil.getDomainName(u);
      if (host == null) {
        host = u.toExternalForm();
      }
    } else {
      // by host
      host = u.getHost();
      if (host == null) {
        host = u.toExternalForm();
      }
    }
    // the queue ID is the protocol + host identifier pair
    queueID = proto + "://" + host.toLowerCase(java.util.Locale.ROOT);
    return new FetchItem(url, page, u, queueID);
  }
}
用url和WebPage對象,再知道queueMode,就可以創建一個FetchItem對象了。
FetchItemQueue
/**
 * This class handles FetchItems which come from the same host ID (be it a
 * proto/hostname or proto/IP pair). It also keeps track of requests in
 * progress and elapsed time between requests.
 */
private static class FetchItemQueue {
  /** Items sharing this queue's ID that are waiting to be fetched. */
  List<FetchItem> queue = Collections
      .synchronizedList(new LinkedList<FetchItem>());
  /** Items handed out by {@link #getFetchItem()} and not yet finished. */
  Set<FetchItem> inProgress = Collections
      .synchronizedSet(new HashSet<FetchItem>());
  /** Earliest time (epoch millis) at which the next item may be handed out. */
  AtomicLong nextFetchTime = new AtomicLong();
  /** Delay between fetches; applied when maxThreads is 1 or less. */
  long crawlDelay;
  /** Delay between fetches; applied when maxThreads is greater than 1. */
  long minCrawlDelay;
  /** Maximum number of items of this queue that may be in progress at once. */
  int maxThreads;

  /**
   * @param conf          not used by this constructor
   * @param maxThreads    limit on concurrently in-progress items
   * @param crawlDelay    delay used when maxThreads is not greater than 1
   * @param minCrawlDelay delay used when maxThreads is greater than 1
   */
  public FetchItemQueue(Configuration conf, int maxThreads, long crawlDelay,
      long minCrawlDelay) {
    this.maxThreads = maxThreads;
    this.crawlDelay = crawlDelay;
    this.minCrawlDelay = minCrawlDelay;
    // ready to start
    setEndTime(System.currentTimeMillis() - crawlDelay);
  }

  public int getQueueSize() {
    return queue.size();
  }

  public int getInProgressSize() {
    return inProgress.size();
  }

  /**
   * Marks an item as finished: removes it from the in-progress set and
   * records the end time. A <code>null</code> item is ignored.
   *
   * @param asap when true the next fetch may start immediately, without the
   *             crawl delay
   */
  public void finishFetchItem(FetchItem it, boolean asap) {
    if (it != null) {
      inProgress.remove(it);
      setEndTime(System.currentTimeMillis(), asap);
    }
  }

  /** Appends an item to the waiting queue; <code>null</code> is ignored. */
  public void addFetchItem(FetchItem it) {
    if (it == null) {
      return;
    }
    queue.add(it);
  }

  /** Adds an item directly to the in-progress set; <code>null</code> is ignored. */
  @SuppressWarnings("unused")
  public void addInProgressFetchItem(FetchItem it) {
    if (it == null) {
      return;
    }
    inProgress.add(it);
  }

  /**
   * Hands out the next waiting item and moves it to the in-progress set.
   *
   * @return the item, or <code>null</code> when the in-progress limit is
   *         reached, the next fetch time has not come yet, or nothing is
   *         waiting
   */
  public FetchItem getFetchItem() {
    if (inProgress.size() >= maxThreads) {
      return null;
    }
    final long now = System.currentTimeMillis();
    if (nextFetchTime.get() > now) {
      return null;
    }
    FetchItem it = null;
    if (queue.size() == 0) {
      return null;
    }
    try {
      it = queue.remove(0);
      inProgress.add(it);
    } catch (final Exception e) {
      // Best effort: e.g. the queue was emptied between the size check and
      // the removal. Keep returning whatever was obtained, but leave a trace
      // instead of hiding the failure completely.
      LOG.debug("Could not take the next item from the fetch queue", e);
    }
    return it;
  }

  /** Writes the URL of every waiting item to the log. */
  public synchronized void dump() {
    // Lock the synchronized list itself so that no item can be added or
    // removed between queue.size() and queue.get(i).
    synchronized (queue) {
      for (int i = 0; i < queue.size(); i++) {
        final FetchItem it = queue.get(i);
        LOG.info(" " + i + ". " + it.url);
      }
    }
  }

  /** Records an end time using the normal (non-asap) delay. */
  private void setEndTime(long endTime) {
    setEndTime(endTime, false);
  }

  /**
   * Computes the next allowed fetch time from the end time of the last
   * request: end time plus delay, or just the end time when asap is set.
   */
  private void setEndTime(long endTime, boolean asap) {
    if (!asap) {
      nextFetchTime.set(endTime
          + (maxThreads > 1 ? minCrawlDelay : crawlDelay));
    } else {
      nextFetchTime.set(endTime);
    }
  }

  /**
   * Removes all waiting items.
   *
   * @return the number of items that were waiting before the queue was cleared
   */
  public synchronized int emptyQueue() {
    // size() and clear() must observe the same list state
    synchronized (queue) {
      int presize = queue.size();
      queue.clear();
      return presize;
    }
  }
}
FetchItemQueues
/**
 * Groups the individual FetchItemQueue instances and the settings shared by
 * them. Only the fields are shown in this excerpt; the methods used elsewhere
 * (e.g. getFetchItem, addFetchItem, getTotalSize) are omitted.
 */
private static class FetchItemQueues {
// FetchItemQueue instances by string key — presumably the queue ID, since
// callers look queues up with getFetchItemQueue(fit.queueID); confirm in the
// omitted methods.
Map<String, FetchItemQueue> queues = new HashMap<String, FetchItemQueue>();
// NOTE(review): presumably the number of items over all queues (callers read
// getTotalSize()) — confirm in the omitted methods.
AtomicInteger totalSize = new AtomicInteger(0);
// The following mirror the FetchItemQueue constructor arguments; presumably
// they are passed on when a new queue is created — confirm.
int maxThreads;
String queueMode;
long crawlDelay;
long minCrawlDelay;
Configuration conf;
// -1 until set otherwise.
long timelimit = -1;
}
FetchItemQueues主要就是對FetchItemQueue的一個封裝:用一個Map把各個FetchItemQueue統一管理起來。
FetcherThread
/**
* This class picks items from queues and fetches the pages.
*/
private class FetcherThread extends Thread {
private final URLFilters urlFilters;
private final URLNormalizers normalizers;
private final ProtocolFactory protocolFactory; /**插件Protocol*/
private final long maxCrawlDelay; /**如果robots.txt中的Crawl-Delay比此值大,則跳過此頁面;若設置爲-1,則不論robots中延遲爲多少都一直等*/
@SuppressWarnings("unused")
private final boolean byIP; /**是否通過byIP方式*/
private String reprUrl;
private final Context context;
private final boolean ignoreExternalLinks;
public FetcherThread(Context context, int num) {
this.setDaemon(true); // don't hang JVM on exit
this.setName("FetcherThread" + num); // use an informative name
this.context = context;
Configuration conf = context.getConfiguration();
this.urlFilters = new URLFilters(conf);
this.protocolFactory = new ProtocolFactory(conf); /**默認使用protocol-http,在nutch-default中的Plugin.includes中設置*/
this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000; /**默認30s*/
// backward-compatible default setting
this.byIP = conf.getBoolean("fetcher.threads.per.host.by.ip", true); /**默認是byIP*/
/**
* If true, outlinks leading from a page to external hosts
* will be ignored. This is an effective way to limit the crawl to include
* only initially injected hosts, without creating complex URLFilters.
*/
this.ignoreExternalLinks = conf.getBoolean("db.ignore.external.links",
false);
}
@Override
@SuppressWarnings("fallthrough")
public void run() {
activeThreads.incrementAndGet(); // count threads
FetchItem fit = null;
try {
while (true) {
fit = fetchQueues.getFetchItem();
if (fit == null) { /**如果沒有獲取到FetchItem,則*/
if (feeder.isAlive() || fetchQueues.getTotalSize() > 0) { /**如果不是線程非alive並且不是因爲隊列爲空了,那麼就是該旋轉等待(spin-wait)*/
}
// spin-wait.
spinWaiting.incrementAndGet();
try {
Thread.sleep(500);
} catch (final Exception e) {
}
spinWaiting.decrementAndGet();
continue;
} else { /**要不然就是全都做完了*/
// all done, finish this thread
return;
}
}
lastRequestStart.set(System.currentTimeMillis()); /**設置上次請求開始時間*/
if (fit.page.getReprUrl() == null) {
reprUrl = fit.url;
} else {
reprUrl = TableUtil.toString(fit.page.getReprUrl());
}
try {
// fetch the page
final Protocol protocol = this.protocolFactory.getProtocol(fit.url);
final BaseRobotRules rules = protocol.getRobotRules(fit.url,
fit.page);
if (!rules.isAllowed(fit.u.toString())) { /**訪問被拒*/
// unblock
fetchQueues.finishFetchItem(fit, true);
if (LOG.isDebugEnabled()) {
LOG.debug("Denied by robots.txt: " + fit.url);
}
output(fit, null, ProtocolStatusUtils.STATUS_ROBOTS_DENIED,
CrawlStatus.STATUS_GONE);
continue;
}
if (rules.getCrawlDelay() > 0) {
if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) { /**等待時間太久,跳過*/
// unblock
fetchQueues.finishFetchItem(fit, true);
LOG.debug("Crawl-Delay for " + fit.url + " too long ("
+ rules.getCrawlDelay() + "), skipping");
output(fit, null, ProtocolStatusUtils.STATUS_ROBOTS_DENIED,
CrawlStatus.STATUS_GONE);/**記錄*/
continue;
} else { /**重新設置抓取延遲*/
final FetchItemQueue fiq = fetchQueues
.getFetchItemQueue(fit.queueID);
fiq.crawlDelay = rules.getCrawlDelay();
if (LOG.isDebugEnabled()) {
LOG.info("Crawl delay for queue: " + fit.queueID
+ " is set to " + fiq.crawlDelay
+ " as per robots.txt. url: " + fit.url);
}
}
}
final ProtocolOutput output = protocol.getProtocolOutput(fit.url,
fit.page);
final ProtocolStatus status = output.getStatus();
final Content content = output.getContent();
// unblock queue
fetchQueues.finishFetchItem(fit);
context.getCounter("FetcherStatus",
ProtocolStatusUtils.getName(status.getCode())).increment(1);
int length = 0;
if (content != null && content.getContent() != null)
length = content.getContent().length;
updateStatus(length);
/************/
switch (status.getCode()) {
case ProtocolStatusCodes.WOULDBLOCK:
// retry ?
fetchQueues.addFetchItem(fit);
break;
case ProtocolStatusCodes.SUCCESS: // got a page
output(fit, content, status, CrawlStatus.STATUS_FETCHED);
break;
case ProtocolStatusCodes.MOVED: // redirect
case ProtocolStatusCodes.TEMP_MOVED:
byte code;
boolean temp;
if (status.getCode() == ProtocolStatusCodes.MOVED) {
code = CrawlStatus.STATUS_REDIR_PERM;
temp = false;
} else {
code = CrawlStatus.STATUS_REDIR_TEMP;
temp = true;
}
final String newUrl = ProtocolStatusUtils.getMessage(status);
handleRedirect(fit.url, newUrl, temp, FetcherJob.PROTOCOL_REDIR,
fit.page);
output(fit, content, status, code);
break;
case ProtocolStatusCodes.EXCEPTION:
logFetchFailure(fit.url, ProtocolStatusUtils.getMessage(status));
/* FALLTHROUGH */
case ProtocolStatusCodes.RETRY: // retry
case ProtocolStatusCodes.BLOCKED:
output(fit, null, status, CrawlStatus.STATUS_RETRY);
break;
case ProtocolStatusCodes.GONE: // gone
case ProtocolStatusCodes.NOTFOUND:
case ProtocolStatusCodes.ACCESS_DENIED:
case ProtocolStatusCodes.ROBOTS_DENIED:
output(fit, null, status, CrawlStatus.STATUS_GONE);
break;
case ProtocolStatusCodes.NOTMODIFIED:
output(fit, null, status, CrawlStatus.STATUS_NOTMODIFIED);
break;
default:
if (LOG.isWarnEnabled()) {
LOG.warn("Unknown ProtocolStatus: " + status.getCode());
}
output(fit, null, status, CrawlStatus.STATUS_RETRY);
}
/************/
} catch (final Throwable t) { // unexpected exception
// unblock
fetchQueues.finishFetchItem(fit);
LOG.error("Unexpected error for " + fit.url, t);
output(fit, null, ProtocolStatusUtils.STATUS_FAILED,
CrawlStatus.STATUS_RETRY);
}
}/**end of while(true)*/
} catch (final Throwable e) {
LOG.error("fetcher throwable caught", e);
} finally { /**整個進程結束*/
if (fit != null)
fetchQueues.finishFetchItem(fit);
activeThreads.decrementAndGet(); // count threads
LOG.info("-finishing thread " + getName() + ", activeThreads="
+ activeThreads);
}
}
private void output(FetchItem fit, Content content, ProtocolStatus pstatus,
byte status) throws IOException, InterruptedException {
fit.page.setStatus((int) status);
final long prevFetchTime = fit.page.getFetchTime();
fit.page.setPrevFetchTime(prevFetchTime);
fit.page.setFetchTime(System.currentTimeMillis());
if (pstatus != null) {
fit.page.setProtocolStatus(pstatus);
}
if (content != null) {
fit.page.setContent(ByteBuffer.wrap(content.getContent()));
fit.page.setContentType(new Utf8(content.getContentType()));
fit.page.setBaseUrl(new Utf8(content.getBaseUrl()));
}
Mark.FETCH_MARK.putMark(fit.page, Mark.GENERATE_MARK.checkMark(fit.page));
String key = TableUtil.reverseUrl(fit.url);
if (parse) {
if (!skipTruncated
|| (skipTruncated && !ParserJob.isTruncated(fit.url, fit.page))) {
parseUtil.process(key, fit.page);
}
}
// remove content if storingContent is false. Content is added to fit.page
// above
// for ParseUtil be able to parse it.
if (content != null && !storingContent) {
fit.page.setContent(ByteBuffer.wrap(new byte[0]));
}
context.write(key, fit.page);/**整個Reducer的輸出*/
}
}
具體有關程序中 switch (status.getCode())部分各種status標識的含義參見這裏。
QueueFeeder
/**
* This class feeds the queues with input items, and re-fills them as items
* are consumed by FetcherThread-s.
*/
private static class QueueFeeder extends Thread {
private final Context context;
private final FetchItemQueues queues;
private final int size;
private Iterator<FetchEntry> currentIter;
boolean hasMore;
private long timelimit = -1;
public QueueFeeder(Context context, FetchItemQueues queues, int size)
throws IOException, InterruptedException {
this.context = context;
this.queues = queues;
this.size = size;
this.setDaemon(true);
this.setName("QueueFeeder");
hasMore = context.nextKey();
if (hasMore) {
currentIter = context.getValues().iterator();
}
// the value of the time limit is either -1 or the time where it should
// finish
timelimit = context.getConfiguration().getLong("fetcher.timelimit", -1);
}
@Override
public void run() {
int cnt = 0;
int timelimitcount = 0;
try {
while (hasMore) {
if (System.currentTimeMillis() >= timelimit && timelimit != -1) {/**到了時間限制,跳過未處理的*/
// enough .. lets' simply
// read all the entries from the input without processing them
while (currentIter.hasNext()) {
currentIter.next();
timelimitcount++;
}
hasMore = context.nextKey();
if (hasMore) {
currentIter = context.getValues().iterator();
}
continue;
}
int feed = size - queues.getTotalSize();
if (feed <= 0) {
// queues are full - spin-wait until they have some free space
try {
Thread.sleep(1000);
} catch (final Exception e) {
}
;
continue;
}
if (LOG.isDebugEnabled()) {
LOG.debug("-feeding " + feed + " input urls ...");
}
while (feed > 0 && currentIter.hasNext()) {
FetchEntry entry = currentIter.next();
final String url = TableUtil.unreverseUrl(entry.getKey());
queues.addFetchItem(url, entry.getWebPage());
feed--;
cnt++;
}
if (currentIter.hasNext()) { /**一個list處理完處理另一個list*/
continue; // finish items in current list before reading next key
}
hasMore = context.nextKey();
if (hasMore) {
currentIter = context.getValues().iterator();
}
}
} catch (Exception e) {
return;
}
}
}
FetcherReducer中的run方法
/**
 * Reducer entry point: starts the QueueFeeder and the FetcherThreads, then
 * loops until no fetcher thread is active any more. The body of the waiting
 * loop is elided in this excerpt.
 */
@Override
public void run(Context context) throws IOException, InterruptedException {
// multiplier read from "fetcher.queue.depth.multiplier"; 50 when the key is not set
int maxFeedPerThread = conf.getInt("fetcher.queue.depth.multiplier", 50);
// the feeder's size argument is threadCount * maxFeedPerThread
feeder = new QueueFeeder(context, fetchQueues, threadCount
* maxFeedPerThread);
feeder.start();
for (int i = 0; i < threadCount; i++) { // spawn threads
FetcherThread ft = new FetcherThread(context, i);
fetcherThreads.add(ft);
ft.start();
}
do { // wait for threads to exit
...
} while (activeThreads.get() > 0);
}
總之,FetcherJob是nutch中相對來說最核心的一個部分,要想完全喫透它需要大量的實踐經驗方可。紙上談兵要不得。
References
Nutch 1.3 學習筆記 5 Fetcher流程
Nutch 1.3 學習筆記 5-1 FetchThread
Nutch 2.0 之 抓取流程簡單分析