使用java爬蟲刷閱讀量

1.編寫一個Tool用來解析url獲取文本
(注:從博客列表點擊某條博客進去時,記得要先設置請求頭,也就是 doGet2() 裏 httpGet.setHeader() 那部分內容,需要根據自己的信息進行填寫。這裏我已經將個人的信息刪除,因爲其中加載的是動態數據)

package 刷博客閱讀量;


import org.apache.http.Consts;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.client.params.HttpClientParams;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;

/**
 * Static helpers that download a page over HTTP and pick anchor elements out of
 * the returned HTML.
 *
 * <p>Every request builds its own HTTP client and closes it, together with the
 * response, before returning, so no connection is left open on either the
 * success or the failure path.
 */
public class Tool {

    /** Browser-like User-Agent sent by {@link #doGet(String)}. */
    private static final String BROWSER_USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36";

    /**
     * Fetches {@code url} with a browser-like User-Agent header.
     *
     * @param url address to request
     * @return the response body decoded as UTF-8, or an empty string when the
     *         request fails with an {@link IOException} or the response has no body
     * @throws IllegalArgumentException if {@code url} is not a syntactically valid URI
     */
    public static String doGet(String url) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("User-Agent", BROWSER_USER_AGENT);
        return execute(httpGet);
    }

    /**
     * Fetches {@code url} with a caller-specific set of request headers.
     *
     * <p>The header values below are blank placeholders; fill them in (including
     * the Cookie) before relying on this method.
     *
     * @param url address to request
     * @return the response body decoded as UTF-8, or an empty string when the
     *         request fails with an {@link IOException} or the response has no body
     * @throws IllegalArgumentException if {@code url} is not a syntactically valid URI
     */
    public static String doGet2(String url) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("Accept", "");
        httpGet.setHeader("Accept-Encoding", "");
        httpGet.setHeader("Accept-Language", "");
        httpGet.setHeader("Cookie", "");
        httpGet.setHeader("User-Agent", "");
        return execute(httpGet);
    }

    /**
     * Executes a prepared GET request and returns its body as a UTF-8 string.
     *
     * @param httpGet fully configured request
     * @return the body text, or an empty string on I/O failure or a missing entity
     */
    private static String execute(HttpGet httpGet) {
        // try-with-resources closes the response first and then the client,
        // even when execute() or toString() throws.
        try (CloseableHttpClient httpClient = HttpClientBuilder.create().build();
             CloseableHttpResponse httpResponse = httpClient.execute(httpGet)) {
            HttpEntity httpEntity = httpResponse.getEntity();
            if (httpEntity == null) {
                // Responses without a body carry no entity; EntityUtils.toString
                // rejects a null entity, so answer with the same empty default.
                return "";
            }
            return EntityUtils.toString(httpEntity, Consts.UTF_8);
        } catch (IOException e) {
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Selects the anchor elements that are direct children of an {@code <h4>}.
     *
     * @param html HTML text to parse
     * @return the elements matching the CSS selector {@code h4>a}
     */
    public static Elements parse(String html) {
        Document doc = Jsoup.parse(html);
        return doc.select("h4>a");
    }
}


2.開啓Springboot的定時調度

package 刷博客閱讀量;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;

/**
 * Spring Boot entry point for the application.
 *
 * <p>{@code @EnableScheduling} switches on Spring's scheduled-task support so
 * that methods annotated with {@code @Scheduled} in this application are run.
 */
@EnableScheduling // turn on scheduled-task processing
@SpringBootApplication
public class DemoApplication {

    /**
     * Boots the Spring application with this class as the primary source.
     *
     * @param args command-line arguments handed through to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication application = new SpringApplication(DemoApplication.class);
        application.run(args);
    }

}


3.編寫定時調度的內容

(記得把url改成要查找的地址!這裏的url是點擊 → “我的博客” 之後頁面的url)

package 刷博客閱讀量;

import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import static 刷博客閱讀量.Tool.doGet;
import static 刷博客閱讀量.Tool.doGet2;
import static 刷博客閱讀量.Tool.parse;

/**
 * Scheduled job that downloads a list page, extracts the links found by
 * {@code Tool.parse}, and requests each of them in turn.
 */
@Component
public class SchedulingTest {
    /** Number of completed runs; used only in the progress message. */
    private int i = 0;

    /**
     * Runs once per minute: fetches the list page, then requests every link on it.
     *
     * <p>A link that cannot be turned into a request is reported and skipped so
     * that the remaining links of the same run are still processed.
     */
    @Scheduled(fixedRate = 60 * 1000) // interval in milliseconds: 60 * 1000 = one minute
    void doSomethingWith() {
        // Placeholder: replace with the address of the list page to scan.
        String url = "個人查找的url地址";
        String body = Tool.doGet(url);
        Elements elements = parse(body);
        for (Element ele : elements) {
            String urlEle = ele.attr("href");
            if (urlEle.trim().isEmpty()) {
                // An anchor without a usable href gives nothing to request.
                continue;
            }
            System.out.println("urlEle:" + urlEle);
            try {
                doGet2(urlEle);
            } catch (IllegalArgumentException e) {
                // The HttpGet constructor rejects syntactically invalid URIs with
                // this exception; do not let one bad link abort the whole run.
                e.printStackTrace();
            }
        }
        i++;
        System.out.println("第" + i + "次訪問");
    }
}


4.編寫pom文件
(emm…這個纔是第一步)

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the project. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Coordinates of this project. -->
    <groupId>com.itcast</groupId>
    <artifactId>algorithm</artifactId>
    <version>1.0-SNAPSHOT</version>

    <!-- Inherit from the Spring Boot starter parent, release 2.1.6. -->
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.1.6.RELEASE</version>
        <!-- A <relativePath/> element is left commented out here; its original note read "lookup parent from repository". -->
    </parent>

<dependencies>
    <!-- jsoup HTML parser: https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.11.3</version>
    </dependency>

    <!-- Spring Boot web starter; no version is given here, presumably it is managed by the parent POM. -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <!-- Apache HttpCore: https://mvnrepository.com/artifact/org.apache.httpcomponents/httpcore -->
    <!-- NOTE(review): the version is pinned explicitly; confirm it is meant to differ from any version the parent manages. -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpcore</artifactId>
        <version>4.4.10</version>
    </dependency>

    <!-- Apache HttpClient: https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
    <!-- NOTE(review): the version is pinned explicitly; confirm it is compatible with the httpcore version above. -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.3</version>
    </dependency>
</dependencies>
</project>
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章