使用Maven導入以下兩個包:
<!-- WebMagic core artifact -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.5.2</version>
</dependency>
<!-- WebMagic extension artifact; same version as webmagic-core -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.5.2</version>
</dependency>
這次做了兩個小案例,爬取的都是小說網站。第一個是起點(http://a.qidian.com/)的列表頁:
用firebug我們可以看到:
此時用WebMagic註解方式即可,方便簡單:
package com.zab.webmagic;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.ConsolePageModelPipeline;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl;
/**
 * Annotation-driven page model for the listing page at http://a.qidian.com/.
 *
 * <p>The class-level {@code @ExtractBy} selects every {@code li} under the
 * {@code ul} whose class is {@code "all-img-list cf"} ({@code multi = true});
 * the field-level expressions below are then evaluated to fill one instance
 * per entry.
 *
 * <p>NOTE(review): the class name is a leftover and has nothing to do with
 * GitHub; it is kept unchanged so existing references keep working.
 */
@TargetUrl("http://a.qidian.com/")
@ExtractBy(value = "//ul[@class=\"all-img-list cf\"]/li", multi = true)
public class GithubRepoPageProcessor {

    /** Link text inside the {@code h4} of the {@code book-mid-info} div. */
    @ExtractBy("//div[@class=book-mid-info]/h4/a/text()")
    private String title;

    /** Text of the {@code a.name} link inside the {@code p.author} paragraph. */
    @ExtractBy("//div[@class=book-mid-info]/p[@class=author]/a[@class=name]/text()")
    private String author;

    /** Text of the {@code a.go-sub-type} link inside the {@code p.author} paragraph. */
    @ExtractBy("//div[@class=book-mid-info]/p[@class=author]/a[@class=go-sub-type]/text()")
    private String type;

    /** Text of the {@code span} inside the {@code p.author} paragraph. */
    @ExtractBy("//div[@class=book-mid-info]/p[@class=author]/span/text()")
    private String status;

    /** Text of the {@code p.intro} paragraph. */
    @ExtractBy("//div[@class=book-mid-info]/p[@class=intro]/text()")
    private String intro;

    /**
     * Text of the {@code span} inside the {@code p.update} paragraph.
     * Presumably a word count, judging by the field name; confirm against the page.
     */
    @ExtractBy("//div[@class=book-mid-info]/p[@class=update]/span/text()")
    private String count;

    /**
     * Crawls the listing page once and prints the model returned by the spider.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        OOSpider ooSpider = OOSpider.create(
                Site.me().setSleepTime(100),
                new ConsolePageModelPipeline(),
                GithubRepoPageProcessor.class);
        // NOTE(review): the page yields many entries (multi = true) but get() hands back
        // a single model; presumably the first one, or null if nothing matched; confirm
        // against the OOSpider documentation.
        GithubRepoPageProcessor qidian = ooSpider.get("http://a.qidian.com/");
        // Relies on toString() below; without it this line would print only the
        // class name and an identity hash.
        System.out.println(qidian);
    }

    /**
     * Returns a readable dump of all extracted fields.
     *
     * @return the field values in declaration order
     */
    @Override
    public String toString() {
        return "GithubRepoPageProcessor{"
                + "title=" + title
                + ", author=" + author
                + ", type=" + type
                + ", status=" + status
                + ", intro=" + intro
                + ", count=" + count
                + '}';
    }
}
運行 main 方法後,ConsolePageModelPipeline 會把抽取到的結果輸出到控制檯,此時我們就已經得到結果了:
第二個例子是創世網的列表頁:
同樣的用firebug查看:
代碼爲:
package com.zab.webmagic;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.ConsolePageModelPipeline;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl;
/**
 * Annotation-driven page model for the listing page at http://chuangshi.qq.com/bk/.
 *
 * <p>The class-level {@code @ExtractBy} selects every table row under the
 * {@code div} with class {@code leftlist} ({@code multi = true}); the
 * field-level expressions below are then evaluated to fill one instance per row.
 */
@TargetUrl("http://chuangshi.qq.com/bk/")
@ExtractBy(value = "//div[@class='leftlist']/table/tbody/tr", multi = true)
public class ChuangShi {

    /** Text of the {@code a.green} link in the row. */
    @ExtractBy("//a[@class=green]/text()")
    private String title;

    /** Text of the {@code a.grey3} link in the row. */
    @ExtractBy("//a[@class=grey3]/text()")
    private String author;

    /** Text of the {@code a.grey2} link in the row. */
    @ExtractBy("//a[@class=grey2]/text()")
    private String type;

    /**
     * Crawls the listing page once and prints the model returned by the spider.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        OOSpider ooSpider = OOSpider.create(
                Site.me().setCharset("utf-8"),
                new ConsolePageModelPipeline(),
                ChuangShi.class);
        // NOTE(review): the page yields many rows (multi = true) but get() hands back
        // a single model; presumably the first one, or null if nothing matched; confirm
        // against the OOSpider documentation.
        ChuangShi chuangShi = ooSpider.get("http://chuangshi.qq.com/bk/");
        // Relies on toString() below; without it this line would print only the
        // class name and an identity hash.
        System.out.println(chuangShi);
    }

    /**
     * Returns a readable dump of all extracted fields.
     *
     * @return the field values in declaration order
     */
    @Override
    public String toString() {
        return "ChuangShi{"
                + "title=" + title
                + ", author=" + author
                + ", type=" + type
                + '}';
    }
}
結果顯示: