1、主方法
package com.sprider.pingan;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import static com.sprider.utils.SetGetHead.setPinanHead;
/**
 * Crawler that downloads the product list page at {@code indexUrl}, queues every link found in
 * it, and downloads each linked detail page on a pool of worker threads.
 *
 * <p>{@link #main} starts one progress-reporting task and {@code WORKER_COUNT} worker tasks, fills
 * the queue from the index page, and then asks the workers to stop once the queue is drained so
 * that the JVM can exit.
 */
public class Caichan {
    /** Number of tasks that take links off the queue and download them. */
    private static final int WORKER_COUNT = 10;
    /** Prefix prepended to hrefs that are not already absolute http(s) URLs. */
    private static final String BASE_URL = "http://baoxian.pingan.com";
    /** Charset used to decode response bodies. */
    private static final Charset UTF_8 = Charset.forName("utf-8");
    /**
     * Queue entry that tells a worker to exit. It starts with a NUL character so that it cannot
     * collide with an href read from the page.
     */
    private static final String STOP_SIGNAL = "\u0000stop";

    private final static ArrayBlockingQueue<String> arrayBlockingQueue = new ArrayBlockingQueue<String>(100);
    // One thread per worker plus one for the progress reporter. All of these tasks are
    // long-running, so a smaller pool would leave the last submitted task queued forever.
    private final static ExecutorService threadPool = Executors.newFixedThreadPool(WORKER_COUNT + 1);
    private final static String indexUrl = "http://baoxian.pingan.com/product/allbaoxianlist.shtml";

    /**
     * Starts the progress reporter and the workers, crawls the index page, and finally signals
     * the workers to stop.
     *
     * @param args unused
     * @throws Exception if downloading the index page fails
     */
    public static void main(String[] args) throws Exception {
        threadPool.execute(new Runnable() {
            @Override
            public void run() {
                reportProgress();
            }
        });
        for (int i = 0; i < WORKER_COUNT; i++) {
            threadPool.execute(new Runnable() {
                @Override
                public void run() {
                    consumeQueue();
                }
            });
        }
        try {
            parserIndex();
            System.out.println("------------------------------------`");
        } finally {
            // Runs on failure too, so the pool threads never keep the JVM alive indefinitely.
            stopWorkers();
        }
    }

    /**
     * Prints the queue size once per second until the pool has been shut down and the queue is
     * empty, or until the thread is interrupted. Near the end of a run the printed size also
     * counts stop signals that have not been taken yet.
     */
    private static void reportProgress() {
        while (!(threadPool.isShutdown() && arrayBlockingQueue.isEmpty())) {
            System.out.println("等待~~~,還有商品抓取::" + arrayBlockingQueue.size());
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    /**
     * Takes links off the queue and downloads them until a stop signal arrives or the thread is
     * interrupted. A failure on one link is printed and does not stop the worker.
     */
    private static void consumeQueue() {
        while (true) {
            try {
                String pid = arrayBlockingQueue.take();
                if (STOP_SIGNAL.equals(pid)) {
                    return;
                }
                parserProductDetail(pid);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Queues one stop signal per worker behind the remaining links and shuts the pool down.
     * If the calling thread is interrupted while queueing, all pool threads are interrupted
     * instead so that none of them is left blocked on the queue.
     */
    private static void stopWorkers() {
        try {
            for (int i = 0; i < WORKER_COUNT; i++) {
                arrayBlockingQueue.put(STOP_SIGNAL);
            }
            threadPool.shutdown();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            threadPool.shutdownNow();
        }
    }

    /**
     * Downloads the index page and queues the links it contains.
     *
     * @throws Exception if the download fails
     */
    private static void parserIndex() throws Exception {
        String indexHtml = getHtml();
        getSearchResultInfo(indexHtml);
    }

    /**
     * Parses the index page and puts the {@code href} of every matching anchor on the queue,
     * blocking while the queue is full. Stops early if the thread is interrupted.
     *
     * @param indexHtml HTML of the index page; {@code null} is ignored
     */
    private static void getSearchResultInfo(String indexHtml) {
        if (indexHtml == null) {
            return;
        }
        Document indexDoc = Jsoup.parse(indexHtml);
        // NOTE(review): the class name contains a space; presumably this is meant to match
        // elements whose class attribute is exactly this value - confirm against the page.
        Elements liLists = indexDoc.getElementsByClass("list_product_nr por");
        Elements lists = liLists.select("a[href]");
        for (Element li : lists) {
            System.out.println("---------------start----------------");
            System.out.println(li);
            System.out.println("---------------a[href]----------------");
            System.out.println(li.attr("href"));
            System.out.println("---------------end----------------");
            try {
                arrayBlockingQueue.put(li.attr("href"));
            } catch (InterruptedException e) {
                // Restore the flag and stop queueing instead of silently dropping links.
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    /**
     * Downloads the index page.
     *
     * @return the page body, or {@code null} if the status code is not 200
     * @throws IOException if the request fails
     */
    private static String getHtml() throws IOException, ClientProtocolException {
        return fetchHtml(indexUrl);
    }

    /**
     * Downloads the detail page for one queued link and parses it.
     *
     * @param pId href taken from the index page
     * @throws Exception if the request fails
     */
    private static void parserProductDetail(String pId) throws Exception {
        String detailHtml = fetchHtml(toAbsoluteUrl(pId));
        if (detailHtml != null) {
            // TODO: nothing is extracted from the parsed document yet.
            Document detailDoc = Jsoup.parse(detailHtml);
        }
    }

    /** Returns {@code href} unchanged when it is an absolute http(s) URL, else prefixes BASE_URL. */
    private static String toAbsoluteUrl(String href) {
        if (href.startsWith("http://") || href.startsWith("https://")) {
            return href;
        }
        return BASE_URL + href;
    }

    /**
     * Issues a GET request with the shared request headers and always releases the client and
     * the response afterwards.
     *
     * @param url address to download
     * @return the UTF-8 decoded body, or {@code null} if the status code is not 200
     * @throws IOException if the request fails
     */
    private static String fetchHtml(String url) throws IOException {
        HttpGet httpGet = new HttpGet(url);
        setPinanHead(httpGet);
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (200 == response.getStatusLine().getStatusCode()) {
                return EntityUtils.toString(response.getEntity(), UTF_8);
            }
            return null;
        }
    }
}
2、設置請求頭
package com.sprider.utils;
import org.apache.http.client.methods.HttpGet;
import java.util.ArrayList;
import java.util.Random;
/**
 * Helper that decorates outgoing requests with a fixed set of browser-like headers.
 */
public class SetGetHead {
    /** Pool of User-Agent values; one is picked at random for every request. */
    private static final String[] USER_AGENTS = {
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
    };

    /** Shared generator; a single instance avoids allocating one per request. */
    private static final Random RANDOM = new Random();

    /**
     * Sets the request headers on the given GET request, replacing any header of the same name.
     * The User-Agent is chosen at random on every call; all other values are fixed.
     *
     * @param httpGet request to decorate
     */
    public static void setPinanHead(HttpGet httpGet) {
        httpGet.setHeader("User-Agent", getUserAgent());
        httpGet.setHeader("Accept",
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
        // The request line ("GET <path> HTTP/1.1") is not a header field, so it is not set here;
        // the method and path come from the HttpGet itself.
        httpGet.setHeader("Host", "baoxian.pingan.com");
        httpGet.setHeader("Proxy-Connection", "keep-alive");
        httpGet.setHeader("Cache-Control", "max-age=0");
        httpGet.setHeader("Upgrade-Insecure-Requests", "1");
        httpGet.setHeader("Accept-Encoding", "gzip, deflate");
        httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8");
        // NOTE(review): hard-coded cookie value, presumably captured from a browser session;
        // it may expire - confirm whether the site still accepts it.
        httpGet.setHeader("Cookie",
                "USER_TRACKING_COOKIE=0; Hm_lvt_2f53c35010dbe120000b9a32bd028225=1592893596; BIGipServerpa18-padweb_http_DMZ_PrdPool_20191218=622763735.60533.0000; routeopr=820dc71fbbc95c3f907bf22c598618b9; BIGipServerpa18-shop-nst_Prd_cloud_Pool=639475415.60533.0000; _ga=GA1.2.1026077564.1592893616; _gid=GA1.2.83529289.1592893616; USER_TRACKING_COOKIE=0; paid_test=09d54ee0-6223-6973-75b7-d1897b80c0f0; USER_TRACKING_COOKIE=172.25.100.204-1592893733346.346000000; MEDIA_SOURCE_NAME=property-home; adms_location=%E5%85%B6%E4%BB%96|0000000000000000$ALL$ALL$ALL; PA_Client_Source=direct; PA_GXH_PD=-1; PA_GXH_NSS=; PA_GXH_WSS=; WT-FPC=id=4.0.4.93-3678702496.30820674:lv=1592905458970:ss=1592905458970:fs=1592905458970:pn=1:vn=1; WEBTRENDS_ID=4.0.4.93-3678702496.30820674; inner_media=http://baoxian.pingan.com/product/jiankangbaoxian.shtml-%E9%A6%96%E9%A1%B5; Hm_lpvt_2f53c35010dbe120000b9a32bd028225=1592906141; DT_C_ID=a4fb223ec3e5ff11-318372c65c18c021-1592966461200-1592966805142");
        httpGet.setHeader("referer", "http://baoxian.pingan.com/product/allbaoxianlist.shtml");
    }

    /**
     * Returns one User-Agent string picked at random from {@code USER_AGENTS}.
     *
     * @return a User-Agent header value
     */
    private static String getUserAgent() {
        return USER_AGENTS[RANDOM.nextInt(USER_AGENTS.length)];
    }
}