Java 爬蟲採集某保險公司產品數據

1、主方法

package com.sprider.pingan;

import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import static com.sprider.utils.SetGetHead.setPinanHead;

/**
 * Crawler for the property-insurance product pages of baoxian.pingan.com.
 *
 * <p>{@link #main} is the producer: it downloads the product index page, extracts the product
 * links and puts them on a bounded queue. {@code CONSUMER_COUNT} consumer tasks take links off
 * the queue and download the matching detail page, while one extra task prints the queue size
 * once per second. When the producer is done it enqueues one stop marker per consumer and shuts
 * the pool down, so the JVM exits once all queued links have been processed.
 */
public class Caichan {

    /** Number of consumer tasks downloading product detail pages. */
    private static final int CONSUMER_COUNT = 10;

    /** Host that all crawled pages must belong to (the request headers are specific to it). */
    private static final String SITE_HOST = "baoxian.pingan.com";

    /** Prefix prepended to the site-relative links found on the index page. */
    private static final String SITE_ROOT = "http://" + SITE_HOST;

    /** Charset used to decode every response body. */
    private static final Charset UTF_8 = Charset.forName("utf-8");

    /**
     * Stop marker for the consumers. It is a distinct String instance and is recognised by
     * identity ({@code ==}) on purpose, so that no real href can ever be mistaken for it.
     */
    private static final String POISON_PILL = new String("POISON_PILL");

    /** Bounded hand-off queue between the producer (main thread) and the consumers. */
    private final static ArrayBlockingQueue<String> arrayBlockingQueue = new ArrayBlockingQueue<String>(100);

    /** One thread per consumer plus one for the queue-size reporter, so no task is left waiting for a thread. */
    private final static ExecutorService threadPool = Executors.newFixedThreadPool(CONSUMER_COUNT + 1);

    /** Index page listing all products. */
    private final static String indexUrl = "http://baoxian.pingan.com/product/allbaoxianlist.shtml";

    /**
     * Starts the reporter and the consumers, produces the product links and then stops the workers.
     *
     * @param args ignored
     * @throws Exception if downloading or parsing the index page fails
     */
    public static void main(String[] args) throws Exception {

        // Progress reporter: prints how many links are still queued.
        threadPool.execute(new Runnable() {
            @Override
            public void run() {
                reportQueueSize();
            }
        });

        // Consumers: each one takes links off the queue and fetches the detail page.
        for (int i = 0; i < CONSUMER_COUNT; i++) {
            threadPool.execute(new Runnable() {
                @Override
                public void run() {
                    consumeProductLinks();
                }
            });
        }

        try {
            // Producer step 1: parse the index page and enqueue the product links.
            parserIndex();
            // Producer step 2 (not implemented yet): parse the paginated lists, e.g. dopaging().
            System.out.println("------------------------------------`");
        } finally {
            // Always release the workers, otherwise their non-daemon threads keep the JVM alive.
            stopWorkers();
        }
    }

    /** Prints the queue size once per second until the pool is shut down and the queue is drained. */
    private static void reportQueueSize() {
        while (!(threadPool.isShutdown() && arrayBlockingQueue.isEmpty())) {
            System.out.println("等待~~~,還有商品抓取::" + arrayBlockingQueue.size());
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                // Restore the flag and stop: an interrupt means the pool is being torn down.
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    /** Consumer loop: processes queued links until the stop marker is taken or the thread is interrupted. */
    private static void consumeProductLinks() {
        while (true) {
            String pid;
            try {
                pid = arrayBlockingQueue.take();
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            }
            if (pid == POISON_PILL) { // identity comparison is intentional, see POISON_PILL
                return;
            }
            try {
                parserProductDetail(pid);
            } catch (Exception e) {
                // Best effort: one failing product page must not stop this consumer.
                e.printStackTrace();
            }
        }
    }

    /** Enqueues one stop marker per consumer and shuts the pool down; falls back to interrupting on interrupt. */
    private static void stopWorkers() {
        try {
            for (int i = 0; i < CONSUMER_COUNT; i++) {
                arrayBlockingQueue.put(POISON_PILL);
            }
            threadPool.shutdown();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            threadPool.shutdownNow();
        }
    }

    /**
     * Downloads the index page and enqueues every product link found on it.
     *
     * @throws Exception if the index page cannot be downloaded
     */
    private static void parserIndex() throws Exception {
        // e.g. http://baoxian.pingan.com/product/allbaoxianlist.shtml?WT.mc_id=property-home
        String indexHtml = getHtml();
        getSearchResultInfo(indexHtml);
    }

    /**
     * Extracts the product links from the index page HTML and puts them on the queue.
     * Blocks while the queue is full; stops early if the thread is interrupted.
     *
     * @param indexHtml HTML of the index page, or {@code null} (in which case nothing happens)
     */
    private static void getSearchResultInfo(String indexHtml) {
        if (indexHtml == null) {
            return;
        }
        Document indexDoc = Jsoup.parse(indexHtml);
        // Locate the product list.
        // NOTE(review): the argument contains a space, i.e. two class names in one string; verify it
        // matches the intended elements (a CSS query such as ".list_product_nr.por" may be meant).
        Elements liLists = indexDoc.getElementsByClass("list_product_nr por");
        // Every anchor that carries an href inside the product list.
        Elements lists = liLists.select("a[href]");

        for (Element li : lists) {
            String href = li.attr("href");
            System.out.println("---------------start----------------");
            System.out.println(li);
            System.out.println("---------------a[href]----------------");
            System.out.println(href);
            System.out.println("---------------end----------------");
            try {
                arrayBlockingQueue.put(href);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    /**
     * Downloads the index page.
     *
     * @return the page body decoded as UTF-8, or {@code null} if the status code is not 200
     * @throws IOException if the request fails
     */
    private static String getHtml() throws IOException, ClientProtocolException {
        return fetch(indexUrl);
    }

    /**
     * Downloads and parses the detail page behind one product link.
     *
     * @param pId href taken from the index page (site-relative path or same-site absolute URL)
     * @throws Exception if the request fails
     */
    private static void parserProductDetail(String pId) throws Exception {
        // e.g. http://baoxian.pingan.com/product/eshengpinganyiliaoxian.shtml?WT.mc_id=property-home
        String pUrl = toProductUrl(pId);
        if (pUrl == null) {
            System.err.println("Skipping link that is not a page of " + SITE_HOST + ": " + pId);
            return;
        }
        String detailHtml = fetch(pUrl);
        if (detailHtml != null) {
            Document detailDoc = Jsoup.parse(detailHtml);
            // TODO: extract the product fields from detailDoc (same approach as for the index page).
            //System.out.println(detailDoc);
        }
    }

    /**
     * Turns an href from the index page into an absolute URL on the crawled site.
     *
     * @param href raw href attribute value
     * @return the absolute URL, or {@code null} if the href is empty or does not point to the site
     *         (external link, protocol-relative link, {@code javascript:} link, anchor, ...)
     */
    private static String toProductUrl(String href) {
        if (href == null || href.isEmpty()) {
            return null;
        }
        if (href.startsWith("http://" + SITE_HOST + "/") || href.startsWith("https://" + SITE_HOST + "/")) {
            return href;
        }
        if (href.startsWith("/") && !href.startsWith("//")) {
            return SITE_ROOT + href;
        }
        return null;
    }

    /**
     * Performs a GET request with the site-specific headers and returns the body.
     * The client and the response are always closed, so no connection is leaked.
     *
     * @param url absolute URL to download
     * @return the body decoded as UTF-8, or {@code null} if the status code is not 200
     * @throws IOException if the request fails
     */
    private static String fetch(String url) throws IOException {
        HttpGet request = new HttpGet(url);
        setPinanHead(request);
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(request)) {
            int status = response.getStatusLine().getStatusCode();
            if (status != 200) {
                System.err.println("Unexpected HTTP status " + status + " for " + url);
                return null;
            }
            return EntityUtils.toString(response.getEntity(), UTF_8);
        }
    }

}

2、設置請求頭

package com.sprider.utils;

import org.apache.http.client.methods.HttpGet;
import java.util.ArrayList;
import java.util.Random;

/**
 * Helper that decorates GET requests to baoxian.pingan.com with browser-like headers.
 */
public class SetGetHead {

    /**
     * Pool of desktop Chrome User-Agent strings; one is picked at random per request.
     * Built once instead of on every call.
     */
    private static final String[] USER_AGENTS = {
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
    };

    /** Shared generator for picking a User-Agent; {@link Random} may be used from several threads. */
    private static final Random RANDOM = new Random();

    /**
     * Sets the HTTP request headers that make the request look like it comes from a browser.
     * Existing headers with the same names are overwritten.
     *
     * @param httpGet the request to decorate
     */
    public static void setPinanHead(HttpGet httpGet) {
        // Random browser User-Agent.
        httpGet.setHeader("User-Agent", getUserAgent());
        // Pretend to be a browser.
        httpGet.setHeader("Accept",
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
        // The request line ("GET <path> HTTP/1.1") is generated by the HTTP client from the request
        // URI; it is not a header field, so it must not be sent as a header named "GET".
        httpGet.setHeader("Host", "baoxian.pingan.com");

        httpGet.setHeader("Proxy-Connection", "keep-alive");
        httpGet.setHeader("Cache-Control", "max-age=0");
        httpGet.setHeader("Upgrade-Insecure-Requests", "1");
        httpGet.setHeader("Accept-Encoding", "gzip, deflate");
        httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8");
        // NOTE(review): hard-coded cookie values, presumably captured from a browser session;
        // they may expire and need refreshing.
        httpGet.setHeader("Cookie",
                "USER_TRACKING_COOKIE=0; Hm_lvt_2f53c35010dbe120000b9a32bd028225=1592893596; BIGipServerpa18-padweb_http_DMZ_PrdPool_20191218=622763735.60533.0000; routeopr=820dc71fbbc95c3f907bf22c598618b9; BIGipServerpa18-shop-nst_Prd_cloud_Pool=639475415.60533.0000; _ga=GA1.2.1026077564.1592893616; _gid=GA1.2.83529289.1592893616; USER_TRACKING_COOKIE=0; paid_test=09d54ee0-6223-6973-75b7-d1897b80c0f0; USER_TRACKING_COOKIE=172.25.100.204-1592893733346.346000000; MEDIA_SOURCE_NAME=property-home; adms_location=%E5%85%B6%E4%BB%96|0000000000000000$ALL$ALL$ALL; PA_Client_Source=direct; PA_GXH_PD=-1; PA_GXH_NSS=; PA_GXH_WSS=; WT-FPC=id=4.0.4.93-3678702496.30820674:lv=1592905458970:ss=1592905458970:fs=1592905458970:pn=1:vn=1; WEBTRENDS_ID=4.0.4.93-3678702496.30820674; inner_media=http://baoxian.pingan.com/product/jiankangbaoxian.shtml-%E9%A6%96%E9%A1%B5; Hm_lpvt_2f53c35010dbe120000b9a32bd028225=1592906141; DT_C_ID=a4fb223ec3e5ff11-318372c65c18c021-1592966461200-1592966805142");
        httpGet.setHeader("referer", "http://baoxian.pingan.com/product/allbaoxianlist.shtml");
        //httpGet.setHeader("pragma", "no-cache");

    }

    /**
     * Picks one of the known User-Agent strings at random.
     *
     * @return a User-Agent header value
     */
    private static String getUserAgent() {
        return USER_AGENTS[RANDOM.nextInt(USER_AGENTS.length)];
    }

}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章