WebMagic爬蟲案例

使用Maven導入以下兩個包:

        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.5.2</version>
        </dependency>

        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.5.2</version>
        </dependency>


這次做了兩個小案例,爬取的都是小說網站。第一個是起點中文網的列表頁:


用 Firebug 查看頁面結構,我們可以看到:每本書對應 class 爲 "all-img-list cf" 的 ul 下的一個 li,書籍資訊位於其中 class 爲 "book-mid-info" 的 div 內:

此時用WebMagic註解方式即可,方便簡單:

package com.zab.webmagic;


import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.ConsolePageModelPipeline;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl;


/**
 * Page model for a single book entry on the Qidian listing page
 * ({@code http://a.qidian.com/}).
 *
 * <p>The class-level {@code @ExtractBy} selects every {@code li} under the
 * {@code ul} whose class is {@code "all-img-list cf"}; with {@code multi = true}
 * one model instance is expected per matched {@code li}. The field-level
 * expressions are presumably evaluated against each matched fragment
 * (NOTE(review): confirm against the WebMagic annotation documentation).
 *
 * <p>The class name is kept unchanged for compatibility with existing callers;
 * despite the name, this model targets Qidian, not GitHub.
 *
 * <p>All attribute values in the XPath expressions are quoted so that they are
 * string comparisons in standard XPath and consistent with the class-level
 * expression.
 */
@TargetUrl("http://a.qidian.com/")
@ExtractBy(value = "//ul[@class='all-img-list cf']/li", multi = true)
public class GithubRepoPageProcessor {

    /** Text of the link inside the entry's {@code h4} heading. */
    @ExtractBy("//div[@class='book-mid-info']/h4/a/text()")
    private String title;

    /** Text of the {@code a.name} link inside the author paragraph. */
    @ExtractBy("//div[@class='book-mid-info']/p[@class='author']/a[@class='name']/text()")
    private String author;

    /** Text of the {@code a.go-sub-type} link inside the author paragraph. */
    @ExtractBy("//div[@class='book-mid-info']/p[@class='author']/a[@class='go-sub-type']/text()")
    private String type;

    /** Text of the {@code span} inside the author paragraph. */
    @ExtractBy("//div[@class='book-mid-info']/p[@class='author']/span/text()")
    private String status;

    /** Text of the intro paragraph. */
    @ExtractBy("//div[@class='book-mid-info']/p[@class='intro']/text()")
    private String intro;

    /**
     * Text of the {@code span} inside the update paragraph.
     * NOTE(review): presumably the word count, judging by the field name; confirm.
     */
    @ExtractBy("//div[@class='book-mid-info']/p[@class='update']/span/text()")
    private String count;

    /**
     * Returns a readable dump of all extracted fields, so that printing the
     * model shows its content instead of the default {@code Class@hash} form.
     *
     * @return the field names and values of this model
     */
    @Override
    public String toString() {
        return "GithubRepoPageProcessor{"
                + "title='" + title + '\''
                + ", author='" + author + '\''
                + ", type='" + type + '\''
                + ", status='" + status + '\''
                + ", intro='" + intro + '\''
                + ", count='" + count + '\''
                + '}';
    }

    /**
     * Fetches the Qidian listing page once and prints the model returned by
     * {@code OOSpider.get}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        OOSpider ooSpider = OOSpider.create(
                Site.me().setSleepTime(100),
                new ConsolePageModelPipeline(),
                GithubRepoPageProcessor.class);
        try {
            // NOTE(review): get(url) appears to hand back a single model even though
            // the class is declared multi = true; verify whether the remaining
            // entries are only visible through the configured pipeline.
            GithubRepoPageProcessor firstBook = ooSpider.get("http://a.qidian.com/");
            System.out.println(firstBook);
        } finally {
            // Release the spider's resources once the one-off fetch is done.
            ooSpider.close();
        }
    }
}

此時執行 main 方法,我們就已經能得到抓取結果了。



第二個例子是創世網的列表頁:

同樣地用 Firebug 查看頁面結構,可以看到每本書對應 class 爲 "leftlist" 的 div 中表格的一行(tr):

代碼爲:

package com.zab.webmagic;

import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.ConsolePageModelPipeline;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl;


/**
 * Page model for a single book row on the Chuangshi listing page
 * ({@code http://chuangshi.qq.com/bk/}).
 *
 * <p>The class-level {@code @ExtractBy} selects every table row under the
 * {@code div} whose class is {@code leftlist}; with {@code multi = true} one
 * model instance is expected per matched row. The field-level expressions are
 * presumably evaluated against each matched row
 * (NOTE(review): confirm against the WebMagic annotation documentation).
 *
 * <p>All attribute values in the XPath expressions are quoted so that they are
 * string comparisons in standard XPath and consistent with the class-level
 * expression.
 */
@TargetUrl("http://chuangshi.qq.com/bk/")
@ExtractBy(value = "//div[@class='leftlist']/table/tbody/tr", multi = true)
public class ChuangShi {

    /** Text of the {@code a.green} link in the row. */
    @ExtractBy("//a[@class='green']/text()")
    private String title;

    /** Text of the {@code a.grey3} link in the row. */
    @ExtractBy("//a[@class='grey3']/text()")
    private String author;

    /** Text of the {@code a.grey2} link in the row. */
    @ExtractBy("//a[@class='grey2']/text()")
    private String type;

    /**
     * Returns a readable dump of all extracted fields, so that printing the
     * model shows its content instead of the default {@code Class@hash} form.
     *
     * @return the field names and values of this model
     */
    @Override
    public String toString() {
        return "ChuangShi{"
                + "title='" + title + '\''
                + ", author='" + author + '\''
                + ", type='" + type + '\''
                + '}';
    }

    /**
     * Fetches the Chuangshi listing page once, decoding it as UTF-8, and prints
     * the model returned by {@code OOSpider.get}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        OOSpider ooSpider = OOSpider.create(
                Site.me().setCharset("utf-8"),
                new ConsolePageModelPipeline(),
                ChuangShi.class);
        try {
            // NOTE(review): get(url) appears to hand back a single model even though
            // the class is declared multi = true; verify whether the remaining
            // rows are only visible through the configured pipeline.
            ChuangShi firstBook = ooSpider.get("http://chuangshi.qq.com/bk/");
            System.out.println(firstBook);
        } finally {
            // Release the spider's resources once the one-off fetch is done.
            ooSpider.close();
        }
    }
}

結果顯示:



發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章