package com.sg123.nutch.plugin.parse.html;
import java.util.Enumeration;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.protocol.Content;
import org.w3c.dom.DocumentFragment;
/**
 * Nutch HTML parse filter that extracts the {@code content} value of the
 * {@code name="description"} meta tag and stores it in the parse metadata
 * of the current page under the key {@code "description"}.
 *
 * @author lvshow
 */
public class Description implements HtmlParseFilter {

    /** Meta tag name that is searched for, and the parse-metadata key the value is stored under. */
    private static final String DESCRIPTION = "description";

    /** Configuration received through {@link #setConf(Configuration)}; {@code null} until it is set. */
    private Configuration conf;

    /**
     * Copies the description meta tag, when present, into the parse metadata
     * of the entry that belongs to the URL of {@code content}.
     *
     * @param content     the fetched content; only its URL is read here
     * @param parseResult the parse result to enrich; it is keyed by URL
     * @param metaTags    the meta tags collected from the page
     * @param doc         the parsed DOM fragment (not used by this filter)
     * @return the same {@code parseResult} instance that was passed in
     */
    @Override
    public ParseResult filter(Content content, ParseResult parseResult,
            HTMLMetaTags metaTags, DocumentFragment doc) {
        String description = findDescription(metaTags.getGeneralTags());
        if (description == null) {
            System.out.println("沒有description標籤");
            return parseResult;
        }
        System.out.println("添加description! " + description);
        // The map inside parseResult is keyed by URL; content.getUrl() yields the current one.
        parseResult.get(content.getUrl()).getData().getParseMeta().set(
                DESCRIPTION, description);
        return parseResult;
    }

    /**
     * Looks up the description value regardless of how the tag name is cased.
     * The exact lowercase key wins when it exists; otherwise the first
     * case-insensitive match that carries a value is used.
     *
     * @param generalMetaTags the general meta tags of the page
     * @return the description text, or {@code null} when no such tag has a value
     */
    private static String findDescription(Properties generalMetaTags) {
        String value = generalMetaTags.getProperty(DESCRIPTION);
        if (value == null) {
            for (Enumeration<?> tagNames = generalMetaTags.propertyNames();
                    tagNames.hasMoreElements();) {
                String tagName = tagNames.nextElement().toString();
                if (tagName.equalsIgnoreCase(DESCRIPTION)) {
                    // Read the value under the name that actually matched, so that
                    // casings such as "DESCRIPTION" are not lost.
                    value = generalMetaTags.getProperty(tagName);
                    if (value != null) {
                        break;
                    }
                }
            }
        }
        if (value != null) {
            System.out.println("找到了 " + value);
        }
        return value;
    }

    /**
     * @return the configuration last passed to {@link #setConf(Configuration)},
     *         or {@code null} if none has been set
     */
    @Override
    public Configuration getConf() {
        return conf;
    }

    /**
     * Remembers the configuration so that {@link #getConf()} can return it.
     *
     * @param conf the configuration to keep
     */
    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }
}
以上爲插件類
plugin.xml如下:
<?xml version="1.0" encoding="UTF-8"?>
<!-- Plugin descriptor for the "description" HTML parse filter. -->
<plugin
id="description"
name="description Parser/Filter"
version="1.0.0"
provider-name="nutch.org">
<!-- Code of the plugin: description.jar, with all of its names exported. -->
<runtime>
<library name="description.jar">
<export name="*"/>
</library>
</runtime>
<!-- Registers the Description class at the HtmlParseFilter extension point. -->
<!-- NOTE(review): no <requires> section is declared here; confirm whether this Nutch version needs one. -->
<extension id="com.sg123.nutch.plugin.parse.html.descriptionfilter"
name="description Parser"
point="org.apache.nutch.parse.HtmlParseFilter">
<!-- The class attribute must be the fully qualified name of the filter class. -->
<implementation id="Description"
class="com.sg123.nutch.plugin.parse.html.Description">
</implementation>
</extension>
</plugin>
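把插件類編譯並打包成 description.jar(檔名須與 plugin.xml 中 library 的 name 一致),
然後連同 plugin.xml 一起放入 plugins 目錄下以插件 id 命名的子目錄(即 plugins/description),並在 nutch-site.xml 的 plugin.includes 屬性中加入 description,插件纔會被載入。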