任務1:
抽取頁面中的所有鏈接
// Ask LinkBean for the links of the page and print each returned URL on its own line.
LinkBean linkBean = new LinkBean();
linkBean.setURL("http://sthaboutme.sinaapp.com/");
for (URL pageLink : linkBean.getLinks()) {
    System.out.println(pageLink);
}
任務2:
抽取頁面中滿足既定條件的鏈接
// Print how many link tags have a URL matching a regular expression, then print each URL.
try {
    Parser parser = new Parser("http://sthaboutme.sinaapp.com");
    // The filter argument is a regular expression, not a literal string, so the
    // dots of the host name are escaped; an unescaped "." would match any
    // character and let look-alike hosts through.
    // NOTE(review): the trailing "/?" only makes the slash optional. If links with a
    // literal "?" (query string) were intended, it must be written "/\\?" - confirm.
    String matchPattern = "http://sthaboutme\\.sinaapp\\.com/?";
    NodeFilter filter = new LinkRegexFilter(matchPattern);
    NodeList nlist = parser.extractAllNodesThatMatch(filter);
    System.out.println(nlist.size());
    for (int i = 0; i < nlist.size(); i++) {
        LinkTag link = (LinkTag) nlist.elementAt(i);
        System.out.println(link.getLink());
    }
} catch (ParserException e) {
    // Report the parser failure; this snippet has nothing to recover.
    e.printStackTrace();
}
任務3:
抽取頁面中滿足多條件的鏈接
// Print how many link tags match the regex "http://" and do not contain "#",
// then print the URL of each such link.
try {
    Parser pageParser = new Parser("http://sthaboutme.sinaapp.com");
    String requiredText = "http://";
    String forbiddenText = "#";
    NodeFilter mustMatch = new LinkRegexFilter(requiredText);
    // Anonymous StringFilter subclass: accept() is overridden so that link tags
    // whose URL contains mPattern are rejected.
    // NOTE(review): mPattern is presumably the pattern field inherited from
    // StringFilter, holding forbiddenText - confirm against the library.
    NodeFilter mustNotContain = new StringFilter(forbiddenText) {
        @Override
        public boolean accept(Node node) {
            // Anything that is not a link tag is accepted by this filter.
            if (!LinkTag.class.isAssignableFrom(node.getClass())) {
                return true;
            }
            String href = ((LinkTag) node).getLink();
            return href.indexOf(mPattern) == -1;
        }
    };
    AndFilter bothConditions = new AndFilter(mustMatch, mustNotContain);
    NodeList matches = pageParser.extractAllNodesThatMatch(bothConditions);
    System.out.println(matches.size());
    for (int idx = 0; idx < matches.size(); idx++) {
        LinkTag current = (LinkTag) matches.elementAt(idx);
        System.out.println(current.getLink());
    }
} catch (ParserException e) {
    // Print the parser failure's stack trace.
    e.printStackTrace();
}