WebMagic 代理池

package com.example.csdn.bean;

import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.AfterExtractor;
import us.codecraft.webmagic.model.OOSpider;

import java.io.IOException;
import java.util.*;


/**
 * Repeatedly requests randomly chosen article pages of a blog through a pool
 * of scraped HTTP proxies.
 *
 * <p>Each round of {@link #main(String[])} reloads the article list, tops up
 * the proxy pool via a WebMagic {@link OOSpider} (this class is its
 * {@link AfterExtractor} callback), then issues one request per live proxy.
 * Proxies whose request fails are marked dead; once too many have failed the
 * whole pool is discarded and scraped again.
 *
 * <p>All state is kept in static fields and is not synchronized.
 */
public class Main implements AfterExtractor {
    /** Number of failed proxy requests since the pool was last rebuilt. */
    private static int errorCount = 0;
    /** Fraction of {@link #MAX_PROXY_SIZE}; more failures than this trigger a pool rebuild. */
    private static final float RELOAD_PROXY_RATIO = 0.45f;
    /** Maximum number of proxies kept in the pool. */
    private static final int MAX_PROXY_SIZE = 200;
    /**
     * Rows read from each proxy-list page.
     * NOTE(review): 15 is taken from the original page-count arithmetic
     * ({@code maxProxySize / 15}); confirm against the live page layout.
     */
    private static final int PROXIES_PER_PAGE = 15;
    /** Pause before retrying when no article URL could be loaded, in seconds. */
    private static final int EMPTY_BLOG_RETRY_SECONDS = 30;

    /** Proxy pool keyed by proxy address. */
    private static final Map<String, MyProxy> proxyArr = new HashMap<>();

    /** Article URLs keyed by a dense index {@code 0 .. blogUrlSize - 1}. */
    private static final Map<Integer, String> blogUrl = new HashMap<>();

    private static int blogUrlSize = 0;
    private static final String blogHome = "https://blog.csdn.net/qq_36183235";

    private static final Logger logger = Logger.getLogger(Main.class);

    /** Shared generator; avoids allocating a new {@link Random} per call. */
    private static final Random RANDOM = new Random();

    /**
     * Runs the visit loop until the thread is interrupted.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        while (!Thread.currentThread().isInterrupted()) {

            loadBlogUrl();
            if (blogUrlSize == 0) {
                // Without any article URL randomBlogUrl() would throw; wait and retry
                // instead of hammering the blog home page.
                logger.warn("No article URL available from " + blogHome + ", retrying later");
                sleepThread(EMPTY_BLOG_RETRY_SECONDS);
                continue;
            }
            loadProxy();

            for (MyProxy proxy : proxyArr.values()) {
                if (Thread.currentThread().isInterrupted()) {
                    break;
                }
                // Skip dead proxies but keep going: the remaining entries may still be alive.
                if (!Boolean.TRUE.equals(proxy.getState())) {
                    continue;
                }
                visitThroughProxy(proxy);
                sleepThread(RANDOM.nextInt(100) + 30);
            }
        }
    }

    /**
     * Requests one random article through the given proxy and marks the proxy
     * dead (and counts the failure) if the request fails.
     */
    private static void visitThroughProxy(MyProxy proxy) {
        System.setProperty("https.proxyHost", proxy.getAddr());
        System.setProperty("https.proxyPort", proxy.getPort());
        try {
            Jsoup.connect(blogUrl.get(randomBlogUrl()))
                    .userAgent("Mozilla")
                    .cookie("auth", "token")
                    .timeout(5000)
                    .get();
            System.out.println("complete !");
        } catch (IOException e) {
            errorCount++;
            proxy.setState(false);
            logger.warn("Request through proxy " + proxy.getAddr() + ":" + proxy.getPort() + " failed", e);
        } finally {
            // The proxy settings are JVM-wide; clear them so that later requests
            // (e.g. reloading the article list) do not go through this proxy.
            System.clearProperty("https.proxyHost");
            System.clearProperty("https.proxyPort");
        }
    }

    /**
     * Reloads the article URLs from the blog home page into {@link #blogUrl}.
     * On an I/O failure the previously loaded URLs are left untouched.
     */
    private static void loadBlogUrl() {
        try {
            // NOTE(review): the page is fetched with POST as in the original; confirm GET is not intended.
            Document doc = Jsoup.connect(blogHome).post();
            Elements h4 = doc.body().getElementsByClass("article-list").select("h4");
            blogUrl.clear();
            int count = 0;
            for (int i = 0; i < h4.size(); i++) {
                String href = h4.get(i).select("a").attr("href");
                // attr() yields "" when the link is missing; such an entry cannot be requested.
                if (!href.isEmpty()) {
                    blogUrl.put(count++, href);
                }
            }
            blogUrlSize = count;
            System.out.println("blog !");
        } catch (IOException e) {
            logger.error("Failed to load article list from " + blogHome, e);
        }
    }


    /**
     * Tops up the proxy pool by crawling the proxy-list pages from {@link #links()}.
     *
     * <p>If more than {@code MAX_PROXY_SIZE * RELOAD_PROXY_RATIO} requests have
     * failed, the pool is emptied and the failure counter reset first. Nothing is
     * crawled while the pool is already full.
     */
    public static void loadProxy() {
        if (errorCount > MAX_PROXY_SIZE * RELOAD_PROXY_RATIO) {
            proxyArr.clear();
            // Start counting afresh, otherwise every later call would wipe the pool again.
            errorCount = 0;
        }

        if (proxyArr.size() >= MAX_PROXY_SIZE) {
            return;
        }

        OOSpider.create(Site.me().setSleepTime(60 * 1000)
                , Main.class)
                .setIsExtractLinks(false)
                .addUrl(links().toArray(new String[0]))
                .run();
    }

    /**
     * Builds the proxy-list page URLs to crawl.
     *
     * @return URLs for pages {@code 1..ceil(MAX_PROXY_SIZE / PROXIES_PER_PAGE)},
     *         i.e. enough pages to fill the pool
     */
    public static List<String> links() {
        int pageCount = (MAX_PROXY_SIZE + PROXIES_PER_PAGE - 1) / PROXIES_PER_PAGE;
        List<String> urls = new ArrayList<>(pageCount);
        for (int page = 1; page <= pageCount; page++) {
            urls.add(String.format("https://www.kuaidaili.com/free/inha/%s/", page));
        }
        return urls;
    }

    /**
     * Sleeps the current thread.
     *
     * <p>If the sleep is interrupted, the thread's interrupt flag is restored
     * and the method returns early.
     *
     * @param s duration in seconds
     */
    public static void sleepThread(int s) {
        try {
            // Multiply as long so large values cannot overflow int.
            Thread.sleep(s * 1000L);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            logger.warn("Sleep interrupted", e);
        }
    }

    /**
     * Picks a random key of {@link #blogUrl}.
     *
     * @return an index in {@code [0, blogUrlSize)}
     * @throws IllegalArgumentException if no article URL has been loaded
     */
    public static int randomBlogUrl() {
        return RANDOM.nextInt(blogUrlSize);
    }

    /**
     * Spider callback: reads address/port pairs from the table under the
     * element with id {@code list} and adds them to the pool as live proxies,
     * until the pool is full.
     *
     * @param page the crawled proxy-list page
     */
    @Override
    public void afterProcess(Page page) {

        if (proxyArr.size() >= MAX_PROXY_SIZE) {
            return;
        }
        for (int row = 1; row <= PROXIES_PER_PAGE && proxyArr.size() < MAX_PROXY_SIZE; row++) {
            String addr = cellText(page, row, 1);
            String port = cellText(page, row, 2);
            // A missing row/cell yields null; storing it would later break System.setProperty.
            if (addr == null || port == null) {
                continue;
            }
            MyProxy proxy = new MyProxy();
            proxy.setAddr(addr);
            proxy.setPort(port);
            proxy.setState(true);
            proxyArr.put(addr, proxy);
        }
        System.out.println("get proxy! Size : " + proxyArr.size());
    }

    /**
     * Extracts the trimmed text of one table cell of the proxy list.
     *
     * @return the cell text, or {@code null} if the cell is absent or blank
     */
    private static String cellText(Page page, int row, int column) {
        String xpath = String.format("//*[@id=\"list\"]/table/tbody/tr[%s]/td[%s]/text()", row, column);
        String text = page.getHtml().xpath(xpath).get();
        if (text == null) {
            return null;
        }
        text = text.trim();
        return text.isEmpty() ? null : text;
    }

    /** A scraped proxy endpoint together with its liveness flag. */
    static class MyProxy {

        /** Proxy host address. */
        private String addr;
        /** Proxy port, kept as text because it is written to a system property. */
        private String port;
        /** {@code true} while the proxy is considered usable; set to {@code false} after a failed request. */
        private Boolean state;

        public String getAddr() {
            return addr;
        }

        public void setAddr(String addr) {
            this.addr = addr;
        }

        public String getPort() {
            return port;
        }

        public void setPort(String port) {
            this.port = port;
        }

        public Boolean getState() {
            return state;
        }

        public void setState(Boolean state) {
            this.state = state;
        }
    }

}

發表評論
所有評論
還沒有人評論，想成為第一個評論的人麼？請在上方評論欄輸入並且點擊發布。
相關文章