使用java爬虫刷阅读量

1.编写一个Tool用来解析url获取文本
(注:从博客列表点击进入某篇博客时,页面加载的是动态数据,因此需要先设置请求头,也就是doGet2()中调用setHeader()的那部分内容。这些请求头需要根据自己的信息填写,这里我已经把个人信息删除。)

package 刷博客阅读量;


import org.apache.http.Consts;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.client.params.HttpClientParams;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.Closeable;
import java.io.IOException;

/**
 * Step 1: small HTTP/HTML helper. Downloads the body of a page as text and
 * extracts anchor elements from the downloaded HTML.
 */
public class Tool {

    /** Browser-like User-Agent sent by {@link #doGet(String)}. */
    private static final String BROWSER_USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36";

    /**
     * Fetches {@code url} with a browser-like User-Agent header. Used for the
     * blog list page.
     *
     * @param url address to request
     * @return the response body decoded as UTF-8 when the entity declares no
     *         charset, or an empty string when the request fails with an
     *         {@link IOException} or the response carries no entity
     */
    public static String doGet(String url) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("User-Agent", BROWSER_USER_AGENT);
        return fetchBody(httpGet);
    }

    /**
     * Fetches {@code url} while sending the headers that imitate a logged-in
     * browser session. Used for the individual article pages.
     *
     * <p>The header values are intentionally blank placeholders: fill in the
     * Accept, Accept-Encoding, Accept-Language, Cookie and User-Agent values
     * of your own session before use.
     *
     * @param url address to request
     * @return the response body, or an empty string under the same conditions
     *         as {@link #doGet(String)}
     */
    public static String doGet2(String url) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("Accept", "");
        httpGet.setHeader("Accept-Encoding", "");
        httpGet.setHeader("Accept-Language", "");
        httpGet.setHeader("Cookie", "");
        httpGet.setHeader("User-Agent", "");
        return fetchBody(httpGet);
    }

    /**
     * Selects every {@code <a>} element that is a direct child of an
     * {@code <h4>} element.
     *
     * @param html HTML text to parse
     * @return the matching anchor elements (possibly empty)
     */
    public static Elements parse(String html) {
        Document doc = Jsoup.parse(html);
        return doc.select("h4>a");
    }

    /**
     * Executes the prepared request on a fresh client and returns the body.
     * The request and the client are always released, whether or not the
     * call succeeds.
     */
    private static String fetchBody(HttpGet httpGet) {
        HttpClient httpClient = HttpClientBuilder.create().build();
        try {
            HttpResponse httpResponse = httpClient.execute(httpGet);
            HttpEntity httpEntity = httpResponse.getEntity();
            // A response may carry no entity, and EntityUtils.toString rejects null.
            if (httpEntity == null) {
                return "";
            }
            return EntityUtils.toString(httpEntity, Consts.UTF_8);
        } catch (IOException e) {
            e.printStackTrace();
            return "";
        } finally {
            httpGet.releaseConnection();
            closeQuietly(httpClient);
        }
    }

    /**
     * Closes the client when its implementation is closeable. The
     * {@link HttpClient} interface itself declares no {@code close()}.
     */
    private static void closeQuietly(HttpClient httpClient) {
        if (httpClient instanceof Closeable) {
            try {
                ((Closeable) httpClient).close();
            } catch (IOException ignored) {
                // Nothing useful can be done when shutting the client down fails.
            }
        }
    }
}


2.开启Spring Boot的定时调度

package 刷博客阅读量;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;

/**
 * Step 2: Spring Boot entry point.
 *
 * <p>{@code @EnableScheduling} on this class turns on Spring's scheduled-task
 * support, which the {@code @Scheduled} job of this project relies on.
 */
@SpringBootApplication
@EnableScheduling // enable scheduled tasks
public class DemoApplication {

    /**
     * Boots the application context with this class as the primary source.
     *
     * @param args command-line arguments handed on to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication application = new SpringApplication(DemoApplication.class);
        application.run(args);
    }

}


3.编写定时调度的内容

(记得把url改成要查找的地址!这里的url是点击“我的博客”后进入的博客列表页面的url)

package 刷博客阅读量;

import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import static 刷博客阅读量.Tool.doGet;
import static 刷博客阅读量.Tool.doGet2;
import static 刷博客阅读量.Tool.parse;

/**
 * Step 3: scheduled job. On every run it downloads the blog list page,
 * extracts the article links from it and requests each link once.
 */
@Component
public class SchedulingTest {
    /** Number of completed runs; only used for the progress message. */
    private int runCount = 0;

    /**
     * Performs one pass over the blog list. Links with a blank or malformed
     * {@code href} are skipped so that one bad link cannot abort the pass.
     */
    @Scheduled(fixedRate = 60 * 1000) // interval in milliseconds: 60 * 1000 = once per minute
    void doSomethingWith() {
        // Placeholder: replace with the URL of your own blog list page.
        String url = "个人查找的url地址";
        String body = Tool.doGet(url);
        Elements elements = Tool.parse(body);
        for (Element ele : elements) {
            String urlEle = ele.attr("href");
            // attr() yields "" for a missing attribute; there is nothing to request then.
            if (urlEle.trim().isEmpty()) {
                continue;
            }
            System.out.println("urlEle:" + urlEle);
            try {
                Tool.doGet2(urlEle);
            } catch (IllegalArgumentException e) {
                // Thrown when the href is not a syntactically valid URI.
                System.err.println("skipping malformed url: " + urlEle);
            }
        }
        runCount++;
        System.out.println("第" + runCount + "次访问");
    }
}


4.编写pom文件
(emm…这个才是第一步)

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <!-- Maven build descriptor for the scheduled blog-visiting demo. -->
    <modelVersion>4.0.0</modelVersion>

    <!-- Coordinates of this project. -->
    <groupId>com.itcast</groupId>
    <artifactId>algorithm</artifactId>
    <version>1.0-SNAPSHOT</version>

    <!-- Inherit build defaults from the Spring Boot parent POM. -->
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.1.6.RELEASE</version>
        <!-- The relativePath override is left disabled here. -->
    </parent>

<dependencies>
    <!-- HTML parser used by Tool.parse. -->
    <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.11.3</version>
    </dependency>

    <!-- Spring Boot starter; no version is given here, presumably it is inherited from the parent POM. -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <!-- NOTE(review): this version is pinned explicitly; confirm it is meant to differ from whatever the parent POM would manage. -->
    <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpcore -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpcore</artifactId>
        <version>4.4.10</version>
    </dependency>

    <!-- HTTP client used by Tool.doGet and Tool.doGet2. -->
    <!-- NOTE(review): version pinned explicitly as well; confirm it is compatible with the httpcore version above. -->
    <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.3</version>
    </dependency>
</dependencies>
</project>
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章