Jsoup爬蟲的基本使用

什麼是Jsoup?

jsoup 是一款 Java 的 HTML 解析器,可直接解析某個 URL 地址或 HTML 文本內容。它提供了一套非常省力的 API,可通過 DOM、CSS 選擇器以及類似於 jQuery 的操作方法來取出和操作數據,因此常被用來編寫爬蟲。

基本使用

新建一個 Maven 項目,並在 pom.xml 中加入以下依賴:

<!-- Maven dependencies for the jsoup crawler example. -->
<!-- NOTE(review): the example code shown in this article only references jsoup and JUnit;
     the httpcomponents and commons-* artifacts below are presumably optional leftovers.
     Confirm against the rest of the project before removing any of them. -->
<dependencies>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.0.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpcore</artifactId>
        <version>4.0.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpmime</artifactId>
        <version>4.0.1</version>
    </dependency>
    <dependency>
        <groupId>commons-codec</groupId>
        <artifactId>commons-codec</artifactId>
        <version>1.4</version>
    </dependency>
    <dependency>
        <groupId>commons-logging</groupId>
        <artifactId>commons-logging</artifactId>
        <version>1.1.1</version>
    </dependency>
    <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>1.4</version>
    </dependency>
    <!-- jsoup: the HTML parser used by the example (Jsoup.connect, Document, Elements). -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.11.3</version>
    </dependency>
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-lang3</artifactId>
        <version>3.1</version>
    </dependency>
    <!-- JUnit 4: supplies the @Test annotation used by the test method. -->
    <!-- NOTE(review): scope is "compile"; "test" is the usual choice when the test class
         lives under src/test/java. Confirm where the test class is located. -->
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.11</version>
        <scope>compile</scope>
    </dependency>
</dependencies>

測試類

 /**
  * Fetches the blog index page and prints the title, absolute URL and
  * publication time of every post found on it.
  *
  * @throws Exception if the HTTP request or the HTML parsing fails
  */
 @Test
    public void test111() throws Exception {
        // 1. URL of the page to crawl.
        String targetUrl = "https://zhipeng0908.gitee.io";
        // 2. Build the connection; CrawlerUtil.getConnection adds the request headers.
        Connection connect = CrawlerUtil.getConnection(targetUrl);
        // 3. Execute the request as an HTTP GET.
        Connection.Response response = connect.method(Connection.Method.GET).execute();
        // 4. Parse the response into a DOM and take its <body> element.
        Document document = response.parse();
        Element bodyElement = document.body();
        // Every element with the CSS class "post-header" is treated as one blog entry.
        Elements cardElement = bodyElement.select(".post-header");
        // 5. Extract and print the text content of each entry.
        for (Element blog : cardElement) {
            String title = blog.select(".post-title").text();
            String time = blog.select(".post-meta > span.post-time > time").text();
            // "abs:href" lets jsoup resolve the link against the document's base URI,
            // so absolute hrefs and relative hrefs without a leading slash both yield
            // a valid URL (plain string concatenation with targetUrl did not).
            String link = blog.select(".post-title-link").attr("abs:href");
            System.out.println("博客標題:" + title + "\t" + "url:" + link + "\t" + "發佈時間:" + time);
        }
    }

工具類

/**
 * Creates a jsoup {@link Connection} for the given URL with browser-like
 * request headers already applied.
 *
 * @param targetUrl the URL to connect to
 * @return the configured connection; the request has not been executed yet
 */
public static Connection getConnection(String targetUrl) {
    // 3. Forge browser-like request headers.
    return Jsoup.connect(targetUrl)
            .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
            // "br" is deliberately not advertised: a Brotli-compressed response
            // is not expected to be decoded by jsoup and would parse as garbage.
            .header("Accept-Encoding", "gzip, deflate")
            .header("Accept-Language", "zh-CN,zh;q=0.9")
            .header("Cache-Control", "no-cache")
            .header("Connection", "keep-alive")
            .header("Cookie", "_ga=GA1.2.2130438396.1588431092; Hm_lvt_ec661610f14acf2457496da3a87d804d=1588840665,1589378478; Hm_lpvt_ec661610f14acf2457496da3a87d804d=1589378528")
            .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36");
}

結果
(此處原為執行結果的截圖)

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章