Nutch 插件實例

package com.sg123.nutch.plugin.parse.html;

import java.util.Enumeration;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.protocol.Content;
import org.w3c.dom.DocumentFragment;
/**
 * HTML parse filter that extracts the "content" value of the
 * {@code <meta name="description">} tag and stores it in the parse metadata
 * of the current page under the key {@code "description"}.
 *
 * @author lvshow
 */
public class Description implements HtmlParseFilter { // plugs into the HtmlParseFilter extension point

	/** Meta tag name looked up, and also the parse-metadata key written. */
	private static final String DESCRIPTION = "description";

	/** Configuration handed in through {@link #setConf(Configuration)}. */
	private Configuration conf;

	/**
	 * Looks for a "description" meta tag among the general meta tags and, when
	 * one is present, records its value in the parse metadata of the page.
	 *
	 * @param content     the fetched content; its URL is the key into {@code parseResult}
	 * @param parseResult the parse result to enrich
	 * @param metaTags    the meta tags collected from the page
	 * @param doc         the parsed DOM fragment (not used by this filter)
	 * @return the same {@code parseResult} instance, possibly with the
	 *         "description" metadata entry added
	 */
	@Override
	public ParseResult filter(Content content, ParseResult parseResult,
			HTMLMetaTags metaTags, DocumentFragment doc) {

		Properties generalMetaTags = metaTags.getGeneralTags();

		// Prefer the exact lower-case key; fall back to a case-insensitive scan
		// so that variants such as "Description" or "DESCRIPTION" are honoured.
		String description = generalMetaTags.getProperty(DESCRIPTION);
		if (description == null) {
			for (String tagName : generalMetaTags.stringPropertyNames()) {
				if (tagName.equalsIgnoreCase(DESCRIPTION)) {
					// Read the value through the key that actually matched.
					description = generalMetaTags.getProperty(tagName);
					break;
				}
			}
		}

		if (description == null) {
			System.out.println("沒有description標籤");
		} else {
			System.out.println("找到了 " + description);
			System.out.println("添加description! " + description);
			// The ParseResult is keyed by URL; content.getUrl() yields the URL
			// of the page currently being filtered.
			parseResult.get(content.getUrl()).getData().getParseMeta().set(
					DESCRIPTION, description);
		}

		return parseResult;
	}

	/**
	 * @return the configuration previously supplied via
	 *         {@link #setConf(Configuration)}, or {@code null} if none was set
	 */
	@Override
	public Configuration getConf() {
		return conf;
	}

	/**
	 * Stores the configuration so that {@link #getConf()} can return it.
	 *
	 * @param conf the configuration to retain
	 */
	@Override
	public void setConf(Configuration conf) {
		this.conf = conf;
	}

}

以上爲插件類

plugin.xml如下:

<?xml version="1.0" encoding="UTF-8"?>
<!--
  Plugin descriptor for the "description" plugin: declares the jar that
  carries the plugin code and registers the Description class as an
  implementation of the HtmlParseFilter extension point.
-->
<plugin
 id="description"
 name="description Parser/Filter"
 version="1.0.0"
 provider-name="nutch.org">
    
    <!-- Library shipped with the plugin; all of its contents are exported. -->
    <runtime>
        <library name="description.jar">
            <export name="*"/>
        </library>
    </runtime>
    
    
    <!-- Binds the implementation class to the org.apache.nutch.parse.HtmlParseFilter point. -->
    <extension id="com.sg123.nutch.plugin.parse.html.descriptionfilter"
     name="description Parser"
     point="org.apache.nutch.parse.HtmlParseFilter">
        <implementation id="Description"
         class="com.sg123.nutch.plugin.parse.html.Description">
        </implementation>
    </extension>
    
</plugin>


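把插件類編譯並打包爲 description.jar

然後連同 plugin.xml 一起放入 plugins/description 目錄,並在 nutch-site.xml 的 plugin.includes 屬性中加入 description,插件纔會被加載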


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章