[Nutch] GeneratorJob

As before, the class lives in:

package org.apache.nutch.crawl;

Job

Let's start by looking at its Job setup:

    currentJob = new NutchJob(getConf(), "generate: " + getConf().get(BATCH_ID));
    Collection<WebPage.Field> fields = getFields(currentJob);
    /**
     * public static <K, V> void initMapperJob(
     * Job job,
     * Collection<WebPage.Field> fields, 
     * Class<K> outKeyClass,
     * Class<V> outValueClass,
     * Class<? extends GoraMapper<String, WebPage, K, V>> mapperClass,
     * Class<? extends Partitioner<K, V>> partitionerClass,
     * boolean reuseObjects)
     */
    StorageUtils.initMapperJob(
        currentJob,
        fields,
        SelectorEntry.class,
        WebPage.class,
        GeneratorMapper.class,
        SelectorEntryPartitioner.class,
        true);
    StorageUtils.initReducerJob(currentJob, GeneratorReducer.class);
    currentJob.waitForCompletion(true);

From this we can see that the map output is <SelectorEntry, WebPage> pairs. WebPage we have already met: it encapsulates a page itself plus associated metadata (such as the fetch time). So what does the SelectorEntry class look like?

SelectorEntry

  public static class SelectorEntry implements
      WritableComparable<SelectorEntry> {

    String url;
    float score;

    /**
     * Sort by page score first, higher scores up front; break ties by the
     * lexicographic order of the URLs.
     */
    public int compareTo(SelectorEntry se) {
      if (se.score > score)
        return 1;
      else if (se.score == score)
        return url.compareTo(se.url);
      return -1;
    }
    /**
     * hashCode and equals decide whether two entries are the same;
     * this is used for deduplication.
     */
    @Override
    public int hashCode() {
      final int prime = 31;
      int result = 1;
      result = prime * result + url.hashCode();
      result = prime * result + Float.floatToIntBits(score);
      return result;
    }

    @Override
    public boolean equals(Object obj) {
      SelectorEntry other = (SelectorEntry) obj;
      if (!url.equals(other.url))
        return false;
      if (Float.floatToIntBits(score) != Float.floatToIntBits(other.score))
        return false;
      return true;
    }
  }

First of all, since SelectorEntry is an implementation of WritableComparable, it has to exhibit both the Writable and the Comparable traits. For Writable that means the readFields and write methods (not shown in the excerpt above); for Comparable it means compareTo. Moreover, because SelectorEntry appears as the key, and a composite key at that, the reducer must be able to deduplicate entries, which is what overriding hashCode and equals accomplishes. If this is unfamiliar, look at how Hadoop MapReduce lets you customize the Partitioner, SortComparator, and GroupingComparator classes; the SecondarySort example that ships with MapReduce makes it clear.
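
Based on the two fields, the omitted serialization methods look something like this minimal sketch (the actual Nutch source may differ in detail):

    // Inside SelectorEntry; needs java.io.DataInput/DataOutput/IOException
    // and org.apache.hadoop.io.Text.
    public void readFields(DataInput in) throws IOException {
      url = Text.readString(in);  // read the URL as a Hadoop-encoded string
      score = in.readFloat();     // then the score, in the same order write() uses
    }

    public void write(DataOutput out) throws IOException {
      Text.writeString(out, url);
      out.writeFloat(score);
    }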

SelectorEntryPartitioner

Next, let's look at its SelectorEntryPartitioner class:

public static class SelectorEntryPartitioner extends
      Partitioner<SelectorEntry, WebPage> implements Configurable {
    private URLPartitioner partitioner = new URLPartitioner();

    @Override
    public int getPartition(SelectorEntry selectorEntry, WebPage page,
        int numReduces) {
      return partitioner.getPartition(selectorEntry.url, numReduces);
    }
}

The real work is delegated to URLPartitioner.getPartition():

  public int getPartition(String urlString, int numReduceTasks) {
    if (numReduceTasks == 1) {
      // this check can be removed when we use Hadoop with MAPREDUCE-1287
      return 0;
    }

    int hashCode;
    URL url = null;
    try {
      urlString = normalizers.normalize(urlString,
          URLNormalizers.SCOPE_PARTITION);
      hashCode = urlString.hashCode();          /** by default, hash the normalized URL string */
      url = new URL(urlString);
    } catch (MalformedURLException e) {
      LOG.warn("Malformed URL: '" + urlString + "'");
      hashCode = urlString.hashCode();
    }

    if (url != null) {
      if (mode.equals(PARTITION_MODE_HOST)) {
        hashCode = url.getHost().hashCode();    /** in host mode, hash the URL's host */
      } else if (mode.equals(PARTITION_MODE_DOMAIN)) {
        hashCode = URLUtil.getDomainName(url).hashCode();   /** in domain mode, hash the URL's domain */
      } else { // MODE IP
        try {
          InetAddress address = InetAddress.getByName(url.getHost());
          hashCode = address.getHostAddress().hashCode();   /** in IP mode, hash the resolved host address */
        } catch (UnknownHostException e) {
          GeneratorJob.LOG.info("Couldn't find IP for host: " + url.getHost());
        }
      }
    }

    // make hosts wind up in different partitions on different runs
    hashCode ^= seed;   /** bitwise XOR, a standard hashing trick */
    return (hashCode & Integer.MAX_VALUE) % numReduceTasks; /** assign the entry to a reduce task in [0..numReduceTasks-1] */
  }

This Partitioner distributes URLs across reduce tasks according to the configured count mode. If the mode is host, for example, all URLs with the same host end up in the same task.
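
To make the arithmetic concrete, here is a small standalone sketch of host-mode partitioning (hypothetical URLs; normalization and the random seed are omitted):

    import java.net.URL;

    public class PartitionDemo {
      public static void main(String[] args) throws Exception {
        int numReduceTasks = 4;
        String[] urls = { "http://example.com/a", "http://example.com/b",
            "http://nutch.apache.org/docs" };
        for (String u : urls) {
          // host mode: the hash depends only on the host, so both
          // example.com URLs land in the same partition
          int hashCode = new URL(u).getHost().hashCode();
          int partition = (hashCode & Integer.MAX_VALUE) % numReduceTasks;
          System.out.println(u + " -> partition " + partition);
        }
      }
    }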

GeneratorMapper

public class GeneratorMapper extends
    GoraMapper<String, WebPage, SelectorEntry, WebPage> {

  private URLFilters filters;           /** plugin */
  private URLNormalizers normalizers;   /** plugin */
  private boolean filter;               /** whether to filter */
  private boolean normalise;            /** whether to normalize */
  private FetchSchedule schedule;       /** another plugin */
  private ScoringFilters scoringFilters;/** yet another plugin */
  private long curTime;                 /** time of this generate run */
  private SelectorEntry entry = new SelectorEntry();
  private int maxDistance;              /** max allowed shortest-path distance from a seed URL */

  @Override
  public void map(String reversedUrl, WebPage page, Context context)
      throws IOException, InterruptedException {
    String url = TableUtil.unreverseUrl(reversedUrl);

    if (Mark.GENERATE_MARK.checkMark(page) != null) {  /** already generated */
      return;
    }

    // filter on distance
    if (maxDistance > -1) {                         /**-1 if unlimited.*/
      CharSequence distanceUtf8 = page.getMarkers().get(DbUpdaterJob.DISTANCE);
      if (distanceUtf8 != null) {
        int distance = Integer.parseInt(distanceUtf8.toString());
        if (distance > maxDistance) {               /** too far from the seed pages (i.e. not relevant enough) */
          return;
        }
      }
    }

    // If filtering is on don't generate URLs that don't pass URLFilters
    try {
      if (normalise) {
        url = normalizers.normalize(url,
            URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
      }
      if (filter && filters.filter(url) == null)
        return;
    } catch (URLFilterException e) {
      return;
    } catch (MalformedURLException e) {
      return;
    }

    // check fetch schedule
    if (!schedule.shouldFetch(url, page, curTime)) {    /** defaults to org.apache.nutch.crawl.DefaultFetchSchedule */
      return;
    }
    float score = page.getScore();
    try {
        /**
         * Computes a sort value used to rank pages and select the Top-N.
         */
      score = scoringFilters.generatorSortValue(url, page, score);
    } catch (ScoringFilterException e) {
      // ignore
    }
    entry.set(url, score);
    context.write(entry, page);     /** map output: <SelectorEntry(entry), WebPage(page)> */
  }
}

This Mapper takes the URLs injected in the Inject step as input, applies some simple screening, and emits <SelectorEntry, WebPage> pairs as the Reducer's input.
Looking back at the Job, StorageUtils.initMapperJob() in turn calls GoraMapper.initMapperJob(). That method contains one key line:

GoraMapper

    //set the input via GoraInputFormat
    GoraInputFormat.setInput(job, query, dataStore, reuseObjects);

That is, the job's input is wired up through GoraInputFormat.
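
For context, the surrounding method is shaped roughly like the condensed sketch below (not the verbatim Gora source; check GoraMapper in your Gora version):

    // Condensed sketch of GoraMapper.initMapperJob(); details may differ.
    public static <K1, V1 extends Persistent, K2, V2> void initMapperJob(
        Job job, Query<K1, V1> query, DataStore<K1, V1> dataStore,
        Class<K2> outKeyClass, Class<V2> outValueClass,
        Class<? extends GoraMapper<K1, V1, K2, V2>> mapperClass,
        Class<? extends Partitioner<K2, V2>> partitionerClass,
        boolean reuseObjects) throws IOException {
      // set the input via GoraInputFormat
      GoraInputFormat.setInput(job, query, dataStore, reuseObjects);
      job.setMapperClass(mapperClass);
      job.setMapOutputKeyClass(outKeyClass);
      job.setMapOutputValueClass(outValueClass);
      if (partitionerClass != null) {
        job.setPartitionerClass(partitionerClass);
      }
    }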

GeneratorReducer

Next, let's take a look at the GeneratorReducer class:

public class GeneratorReducer extends
    GoraReducer<SelectorEntry, WebPage, String, WebPage> {

  private long limit;
  private long maxCount;
  protected static long count = 0;
  private boolean byDomain = false;
  private Map<String, Integer> hostCountMap = new HashMap<String, Integer>();
  private Utf8 batchId;

  @Override
  protected void reduce(SelectorEntry key, Iterable<WebPage> values,
      Context context) throws IOException, InterruptedException {
    for (WebPage page : values) {
      if (count >= limit) {     /** stop once this reducer's limit is reached */
        return;
      }
      if (maxCount > 0) {       /** a per-host/domain maximum is configured */
        String hostordomain;
        if (byDomain) {
          hostordomain = URLUtil.getDomainName(key.url);
        } else {
          hostordomain = URLUtil.getHost(key.url);
        }

        Integer hostCount = hostCountMap.get(hostordomain);
        if (hostCount == null) {
          hostCountMap.put(hostordomain, 0);
          hostCount = 0;
        }
        if (hostCount >= maxCount) {    /** bail out once this host/domain has hit the maximum */
          return;
        }
        hostCountMap.put(hostordomain, hostCount + 1);
      }

      Mark.GENERATE_MARK.putMark(page, batchId);    /** mark the page as already generated */
      page.setBatchId(batchId);
      try {
        context.write(TableUtil.reverseUrl(key.url), page); /** write <String(reversedUrl), WebPage(page)> to the dataStore, e.g. HBase */
      } catch (MalformedURLException e) {
        context.getCounter("Generator", "MALFORMED_URL").increment(1);
        continue;
      }
      context.getCounter("Generator", "GENERATE_MARK").increment(1);
      count++;
    }
  }

  @Override
  protected void setup(Context context) throws IOException,
      InterruptedException {
    Configuration conf = context.getConfiguration();
    long totalLimit = conf
        .getLong(GeneratorJob.GENERATOR_TOP_N, Long.MAX_VALUE);         /** total number of Top-N URLs to generate */
    if (totalLimit == Long.MAX_VALUE) {
      limit = Long.MAX_VALUE;
    } else {
      limit = totalLimit / context.getNumReduceTasks();                 /** split the total evenly across reduce tasks */
    }
    maxCount = conf.getLong(GeneratorJob.GENERATOR_MAX_COUNT, -2);      /** max number of URLs on one fetchlist */
    batchId = new Utf8(conf.get(GeneratorJob.BATCH_ID));
    String countMode = conf.get(GeneratorJob.GENERATOR_COUNT_MODE,
        GeneratorJob.GENERATOR_COUNT_VALUE_HOST);
    if (countMode.equals(GeneratorJob.GENERATOR_COUNT_VALUE_DOMAIN)) {  /** whether we are in byDomain mode */
      byDomain = true;
    }
  }
}

As you can see, each reduce task emits at most the top limit <String(reversedUrl), WebPage(page)> pairs; for example, a total topN of 10000 spread over 4 reduce tasks gives each reducer a quota of 2500. Dividing the work among reducers this way is not perfectly even, but it is good enough.

GoraReducer

  /**
   * Initializes the Reducer, and sets output parameters for the job.
   * @param dataStore the datastore used as the output (the most important parameter)
   */
  public static <K1, V1, K2, V2 extends Persistent>
  void initReducerJob(
      Job job, 
      DataStore<K2,V2> dataStore,
      Class<? extends GoraReducer<K1, V1, K2, V2>> reducerClass, 
      boolean reuseObjects) {

    GoraOutputFormat.setOutput(job, dataStore, reuseObjects);
    job.setReducerClass(reducerClass);
  }
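
In GeneratorJob this is reached through StorageUtils.initReducerJob(currentJob, GeneratorReducer.class), which presumably creates the WebPage datastore and delegates here. A hedged sketch of that helper (names and exceptions approximate, not the verbatim source):

    // Hedged sketch of StorageUtils.initReducerJob(); consult the Nutch source.
    public static <K, V> void initReducerJob(Job job,
        Class<? extends GoraReducer<K, V, String, WebPage>> reducerClass)
        throws ClassNotFoundException, GoraException {
      // create the datastore holding WebPage rows keyed by reversed URL
      DataStore<String, WebPage> store = createWebStore(
          job.getConfiguration(), String.class, WebPage.class);
      // wire it in as the job's output, reusing objects for efficiency
      GoraReducer.initReducerJob(job, store, reducerClass, true);
    }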

GoraOutputFormat.setOutput() is the output-side counterpart of GoraInputFormat.setInput(): it wires the reducer's output to the datastore. To sum up, GeneratorJob selects the Top-N pages to fetch from the database and puts them on the fetchlist.
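
For reference, a typical invocation from the command line looks something like the following (flag names hedged; check the usage message of bin/nutch generate for your version):

    bin/nutch generate -topN 1000 -crawlId my_crawl

Here -topN feeds the GENERATOR_TOP_N limit seen in setup() above.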


