閒來使用正則表達式做了一個HTML頁面的數據爬蟲,主要是根據頁面規則去匹配相應字段內容,記錄一下。
利用 HttpGet 獲取頁面內容,再使用 Pattern 與 Matcher 從中提取匹配的內容:
// Crawls pages 1..max: downloads each page with HttpGet and prints every
// "link:image" pair that the pattern below finds in the returned HTML.

// Compiled once, outside the loop. Matches an <a onclick="..." target="_blank" href='...'>
// tag directly followed by an <img src="...">; group 1 is the href, group 2 the image src.
Pattern pattern = Pattern.compile("<a onclick=\"[^\"]*\"\\starget=\"_blank\"\\shref='([^']*)'>[\\s]*<img src=\"([^\"]*)");
// Timestamp appended to every request URL as the "tok" value.
long t = System.currentTimeMillis();
// try-with-resources closes the client even if a request fails.
try (CloseableHttpClient client = HttpClientBuilder.create().build()) {
    for (int i = 1; i <= max; i++) {
        // "(?)" is the page-number placeholder. String.replace substitutes it literally;
        // replaceAll would interpret "(?)" as a regular expression instead.
        // NOTE(review): the URL itself looks like a redacted placeholder — substitute the real address.
        String uri = ("http:///xxx.com/xxx/PG(?)tok=" + t).replace("(?)", String.valueOf(i));
        HttpGet hget = new HttpGet(uri);
        hget.addHeader(HttpHeaders.ACCEPT, "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        hget.addHeader(HttpHeaders.USER_AGENT, "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36");

        String bodyAsString;
        // Close every response so it is released before the next request is made.
        try (CloseableHttpResponse response = client.execute(hget)) {
            // Guard against a response that carries no entity at all.
            bodyAsString = response.getEntity() == null ? null : EntityUtils.toString(response.getEntity());
        }
        if (bodyAsString == null || bodyAsString.isEmpty()) {
            // Nothing to scan on this page; move on to the next one without pausing.
            continue;
        }

        // Print the link and the image that follows it for every match on the page.
        Matcher matcher = pattern.matcher(bodyAsString);
        while (matcher.find()) {
            System.out.println(matcher.group(1) + ":" + matcher.group(2));
        }

        try {
            Thread.sleep(30000); // pause 30 s between pages
        } catch (InterruptedException e) {
            // Restore the interrupt flag and stop crawling instead of swallowing the request to stop.
            Thread.currentThread().interrupt();
            break;
        }
    }
}
最後附上一個常用的寫法:獲取某個標題之後的內容。
// Captures the text that follows the "姓 名 :" label: after the label's closing </span>,
// skip to the end of the next tag ('>') and capture everything up to the next '<' as group 1.
// The '>' is consumed outside the group so that group 1 does not start with it.
Pattern adminPricePattr = Pattern.compile("姓 名 :</span>[^>]*>([^<]*)");