如果你需要爬取某些網頁的內容,但這些網站需要登錄,就需要一些額外的步驟,由程序先完成登錄,再爬取我們需要的網頁內容。任何登錄頁面本質上都是向服務器發送請求;只要我們能夠模擬這個請求,登錄自然也就不在話下。通過Fiddler抓取所需的信息(請求地址、表單參數、Cookie等),就能很輕鬆地模擬出向服務器發送的請求。下面我們使用HttpURLConnection進行模擬登錄,並爬取我們需要的網頁內容。
- import java.io.BufferedReader;
- import java.io.InputStream;
- import java.io.InputStreamReader;
- import java.io.OutputStreamWriter;
- import java.net.HttpURLConnection;
- import java.net.URL;
- import java.util.List;
- import java.util.Map.Entry;
- public class INotesPost {
- public static void main(String[] args) throws Exception {
- String surl = "***?login";
- URL url = new URL(surl);
- HttpURLConnection connection = (HttpURLConnection) url.openConnection();
- connection.setDoOutput(true);
- connection.setDoInput(true);
- connection.setRequestMethod("POST");
- connection.setUseCaches(false);
- connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
- connection.setRequestProperty("User-Agent","Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.1.4322; .NET4.0C; .NET4.0E)");
- connection.setRequestProperty("Accept-Language","zh-CN");
- connection.setRequestProperty("Accept-Encoding","gzip, deflate");
- OutputStreamWriter out = new OutputStreamWriter(
- connection.getOutputStream(), "UTF-8");
- // 其中的memberName和password可通過fiddler來抓取
- out.write("username=***&password=***");
- out.flush();
- out.close();
- connection.connect();
- InputStream in = connection.getInputStream();
- StringBuilder retStr = new StringBuilder();
- BufferedReader br = new BufferedReader(new InputStreamReader(in));
- String temp = br.readLine();
- while (temp != null) {
- retStr.append(temp);
- temp = br.readLine();
- }
- br.close();
- in.close();
- System.out.println(retStr);
- for(Entry<String, List<String>> header: connection.getHeaderFields().entrySet()){
- System.out.println(header.getKey() +" " + header.getValue());
- }
- }
- }
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.List;
import java.util.Map.Entry;
import java.util.zip.GZIPInputStream;
import java.util.zip.InflaterInputStream;
/**
 * Simulates a form login with {@link HttpURLConnection}: POSTs URL-encoded
 * credentials to the login URL, then prints the response body followed by
 * every response header.
 *
 * <p>The URL and the credentials in this class are placeholders ({@code ***});
 * replace them with the values captured from a real login request.
 */
public class INotesPost {

    /**
     * Sends the login request and dumps the response to standard output.
     *
     * @param args unused
     * @throws Exception if the URL is malformed or any network I/O fails
     */
    public static void main(String[] args) throws Exception {
        String surl = "***?login";
        URL url = new URL(surl);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        try {
            connection.setDoOutput(true);
            connection.setDoInput(true);
            connection.setRequestMethod("POST");
            connection.setUseCaches(false);
            connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
            connection.setRequestProperty("User-Agent","Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.1.4322; .NET4.0C; .NET4.0E)");
            connection.setRequestProperty("Accept-Language","zh-CN");
            // Because compression is requested explicitly here, the response body
            // must be decompressed by this code (see openDecodedBody).
            connection.setRequestProperty("Accept-Encoding","gzip, deflate");

            OutputStreamWriter out = new OutputStreamWriter(
                    connection.getOutputStream(), "UTF-8");
            try {
                // The form field names and values (username / password here) can be
                // captured from a real login request with Fiddler.
                out.write("username=***&password=***");
                out.flush();
            } finally {
                out.close();
            }
            // No explicit connect() is needed: getOutputStream() has already
            // opened the connection.

            StringBuilder retStr = new StringBuilder();
            // Decode with an explicit charset instead of the platform default;
            // change "UTF-8" if the target site serves a different encoding.
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(openDecodedBody(connection), "UTF-8"));
            try {
                // Lines are concatenated without separators, so the page is
                // printed as one single line.
                String temp = br.readLine();
                while (temp != null) {
                    retStr.append(temp);
                    temp = br.readLine();
                }
            } finally {
                // Closing the reader also closes the wrapped response stream.
                br.close();
            }
            System.out.println(retStr);

            for (Entry<String, List<String>> header : connection.getHeaderFields().entrySet()) {
                System.out.println(header.getKey() +" " + header.getValue());
            }
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Returns the response body stream, wrapped in a decompressing stream when
     * the server answered with a {@code Content-Encoding} of gzip or deflate.
     *
     * @param connection a connection whose request has already been sent
     * @return a stream that yields the uncompressed response bytes
     * @throws IOException if the response cannot be opened or its gzip header is invalid
     */
    private static InputStream openDecodedBody(HttpURLConnection connection) throws IOException {
        InputStream raw = connection.getInputStream();
        String encoding = connection.getContentEncoding();
        if ("gzip".equalsIgnoreCase(encoding)) {
            return new GZIPInputStream(raw);
        }
        if ("deflate".equalsIgnoreCase(encoding)) {
            return new InflaterInputStream(raw);
        }
        return raw;
    }
}
在模擬登陸的時候,我們其實可以通過Fiddler來抓取網頁提交參數,直接將Cookie寫到我們的Connection的RequestProperty中去。
Fiddler抓取登錄參數
將抓取到的參數直接填充到Connection的RequestProperty屬性中去,輕鬆抓取網頁內容。注意:setRequestProperty會覆蓋同名的屬性,因此如果有多個Cookie,需要用「; 」拼接後寫入同一個Cookie屬性。如果我們抓取的頁面內容是中文的,注意charset的編碼方式,並在讀取頁面返回的字符流時進行對應的編碼:
- BufferedReader bufferedReader = new BufferedReader(
- new InputStreamReader(urlStream,"utf-8"));
// Decode the response bytes with an explicit charset ("utf-8") first,
// then wrap the decoding reader in a BufferedReader.
InputStreamReader utf8Reader = new InputStreamReader(urlStream, "utf-8");
BufferedReader bufferedReader = new BufferedReader(utf8Reader);
下面是一段相對完整的代碼
- String s = "****";
- url = new URL(s);
- HttpURLConnection resumeConnection = (HttpURLConnection) url.openConnection();
- resumeConnection.setRequestProperty("Accept-Charset","utf-8");
- resumeConnection.setRequestProperty("Content-Type","text/html;utf-8");
- resumeConnection.setRequestProperty("Cookie","AttachmentAuth=77u/PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz48U1A+MCMuZnxpaWNwfDAwMDgyMzkwNSwwIy5mfGlpY3B8MDAwODIzOTA1LDEzMDc2NjgxMDQzODc3NDA0OCxUcnVlLEV0eHBYWVlYVHNYQ0hYR3hjRmZjdWowOXV6ekRXc01Hd0FLUzVkaFNmcEErcWo4S3pGTUYvYVRYZFJnWitSRW1pVmR4N0xKVzdoOUhzMitUamY5Z0E2VHY4a2hxeHNTQXlVRmhmQ1pwelBUOFBWQmc0NXI2cHo4eGZxZkEyNzAyOUo0eFBrcU9MM0dWNm1IVGdVNEZFT3E1OVIzSHA3dmZrS0tHR1YxNVJpTllKcXF1dUVCMmhlU1lGT0VLUjlBMitEQ00rMVlwdXBVTEJ0UGdWYk5lODBobEtydUttc1MyWWkrSmpXMFozTVVyRHJzN1VkU1VxNmdrYmo0dTB4OWNrTXRFZXJ1cUlZbDROb3N2UWhpSmNRTlVGcm9kNkVXaWhBL0tjUVpaZlY1UFJBREtjalZIYmx3dnRXMkIwZ1VPMVM3REJFa0VzOS9GQUViVzM2bnhJQT09LGh0dHA6Ly9vYS5zZGMuaWNiYzo4Mi9zdG9yYWdlL2F0dGFjaG1lbnQyLzIwMTUtMDUvZTY1Yjc3ZjUtNGZkMC00NDI2LWE1OWYtMjQxNTAxYWE0MjI1L+mZhOS7tjIu6L2v5Lu25byA5Y+R5Lit5b+D6K665paH5L2T5L6L6KaB5rGCLmRvYzwvU1A+");
- resumeConnection.setRequestProperty("Cookie","PortalAuth=77u/PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz48U1A+MCMuZnxpaWNwfDAwMDgyMzkwNSwwIy5mfGlpY3B8MDAwODIzOTA1LDEzMDc2NzA1NzM3NTI3MDY4NCxUcnVlLFFldU1Fa2xDelI0bEZaTTJkbVVtZGxPVmhsUVdwQWMzQlk2TCtWdlVOb1ZsRjVHZ1BMRVhMTTAwcHBKWW5WTGZLYzFPTTh2aGRydmRIVWVLR3JOb255dWpTS2lMeEhyQUlBbmtYZTVBTWlFVGpFMlF4bzRjWVRKeEhjNU5ScEhMSWJOWHdWckFTWHhuNUd5bURST0xTK2d3cUFWbThFUllPM3J1enR4aGgwT1VrTDJGMGkrUDdWcHViRm84blFrTXp4MFNyMXdtQzE3UEJkcGpGVU1nOW8xRkJoeHhzWElDdHhLVEpVSHRGMmpDNmNKS285bGJtTXZJZnlwR0k1VGpLd29TTUpaenhyb1BkQ3VOVW13Wk01T0ZEUExSK1lqajVCRitJSFc1enV0UlpXM08wWHhNaldIWk1nWHhncjF0dUc1b3E3RlRwOGhCMFVCWjAydDlGQT09LGh0dHA6Ly9vYS5zZGMuaWNiYy88L1NQPg==");
- resumeConnection.connect();
- InputStream urlStream = resumeConnection.getInputStream();
- BufferedReader bufferedReader = new BufferedReader(
- new InputStreamReader(urlStream,"utf-8"));
- String ss = null;
- StringBuilder total = new StringBuilder();
- while ((ss = bufferedReader.readLine()) != null) {
- total.append(ss);
- }
- bufferedReader.close();
- resumeConnection.disconnect();
- // System.out.println(total.toString());
// Fetch a page that requires login by replaying cookies captured from an
// authenticated browser session, and collect the response text in `total`.
String s = "****";
url = new URL(s);
HttpURLConnection resumeConnection = (HttpURLConnection) url.openConnection();
resumeConnection.setRequestProperty("Accept-Charset","utf-8");
resumeConnection.setRequestProperty("Content-Type","text/html;utf-8");
// NOTE(review): these are captured session cookies hard-coded in source;
// real projects should load such secrets from configuration instead.
String attachmentAuthCookie = "AttachmentAuth=77u/PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz48U1A+MCMuZnxpaWNwfDAwMDgyMzkwNSwwIy5mfGlpY3B8MDAwODIzOTA1LDEzMDc2NjgxMDQzODc3NDA0OCxUcnVlLEV0eHBYWVlYVHNYQ0hYR3hjRmZjdWowOXV6ekRXc01Hd0FLUzVkaFNmcEErcWo4S3pGTUYvYVRYZFJnWitSRW1pVmR4N0xKVzdoOUhzMitUamY5Z0E2VHY4a2hxeHNTQXlVRmhmQ1pwelBUOFBWQmc0NXI2cHo4eGZxZkEyNzAyOUo0eFBrcU9MM0dWNm1IVGdVNEZFT3E1OVIzSHA3dmZrS0tHR1YxNVJpTllKcXF1dUVCMmhlU1lGT0VLUjlBMitEQ00rMVlwdXBVTEJ0UGdWYk5lODBobEtydUttc1MyWWkrSmpXMFozTVVyRHJzN1VkU1VxNmdrYmo0dTB4OWNrTXRFZXJ1cUlZbDROb3N2UWhpSmNRTlVGcm9kNkVXaWhBL0tjUVpaZlY1UFJBREtjalZIYmx3dnRXMkIwZ1VPMVM3REJFa0VzOS9GQUViVzM2bnhJQT09LGh0dHA6Ly9vYS5zZGMuaWNiYzo4Mi9zdG9yYWdlL2F0dGFjaG1lbnQyLzIwMTUtMDUvZTY1Yjc3ZjUtNGZkMC00NDI2LWE1OWYtMjQxNTAxYWE0MjI1L+mZhOS7tjIu6L2v5Lu25byA5Y+R5Lit5b+D6K665paH5L2T5L6L6KaB5rGCLmRvYzwvU1A+";
String portalAuthCookie = "PortalAuth=77u/PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz48U1A+MCMuZnxpaWNwfDAwMDgyMzkwNSwwIy5mfGlpY3B8MDAwODIzOTA1LDEzMDc2NzA1NzM3NTI3MDY4NCxUcnVlLFFldU1Fa2xDelI0bEZaTTJkbVVtZGxPVmhsUVdwQWMzQlk2TCtWdlVOb1ZsRjVHZ1BMRVhMTTAwcHBKWW5WTGZLYzFPTTh2aGRydmRIVWVLR3JOb255dWpTS2lMeEhyQUlBbmtYZTVBTWlFVGpFMlF4bzRjWVRKeEhjNU5ScEhMSWJOWHdWckFTWHhuNUd5bURST0xTK2d3cUFWbThFUllPM3J1enR4aGgwT1VrTDJGMGkrUDdWcHViRm84blFrTXp4MFNyMXdtQzE3UEJkcGpGVU1nOW8xRkJoeHhzWElDdHhLVEpVSHRGMmpDNmNKS285bGJtTXZJZnlwR0k1VGpLd29TTUpaenhyb1BkQ3VOVW13Wk01T0ZEUExSK1lqajVCRitJSFc1enV0UlpXM08wWHhNaldIWk1nWHhncjF0dUc1b3E3RlRwOGhCMFVCWjAydDlGQT09LGh0dHA6Ly9vYS5zZGMuaWNiYy88L1NQPg==";
// setRequestProperty overwrites any earlier value for the same key, so both
// cookies must be sent in ONE Cookie header, separated by "; ". Setting the
// header twice would silently drop the first cookie.
resumeConnection.setRequestProperty("Cookie", attachmentAuthCookie + "; " + portalAuthCookie);
StringBuilder total = new StringBuilder();
try {
    resumeConnection.connect();
    InputStream urlStream = resumeConnection.getInputStream();
    // Decode the page as UTF-8; use the charset the target site actually serves.
    BufferedReader bufferedReader = new BufferedReader(
            new InputStreamReader(urlStream,"utf-8"));
    try {
        String ss = null;
        // Lines are appended without separators, so `total` holds the page as one line.
        while ((ss = bufferedReader.readLine()) != null) {
            total.append(ss);
        }
    } finally {
        // Closing the reader also closes the wrapped response stream.
        bufferedReader.close();
    }
} finally {
    // Release the connection even when reading fails.
    resumeConnection.disconnect();
}
// System.out.println(total.toString());
轉載:http://blog.csdn.net/kangkanglou/article/details/45895407