前兩天遇到一個問題:用 HttpClient 抓取網頁時返回 403,但用瀏覽器打開同一個地址卻沒有問題。抓取的代碼很簡單,是從網上找的一版:
// Minimal fetch: issue a GET against the demo URL and print the response body.
String url = "http://localhost:8080/HttpClientDemo/test";
HttpGet httpRequest = new HttpGet(url);
HttpClient httpClient = new DefaultHttpClient();
try {
    HttpResponse response = httpClient.execute(httpRequest);
    if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
        // Fall back to UTF-8 when the server does not declare a charset; the
        // single-argument overload would fall back to ISO-8859-1 and garble
        // non-Latin pages.
        String result = EntityUtils.toString(response.getEntity(), "UTF-8");
        System.out.println(result);
    } else {
        // Surface failures such as 403 instead of silently printing nothing.
        System.err.println("Request to " + url + " failed: " + response.getStatusLine());
    }
} finally {
    // Always release the underlying connection, including on non-200
    // responses (whose entity is never read) and on exceptions.
    httpClient.getConnectionManager().shutdown();
}
要模擬瀏覽器,就需要設置 HTTP 的相關參數,於是重寫了一版:
// Build a pooled, thread-safe HttpClient whose limits and timeouts come from
// the spider configuration.

// ---- values read from configuration ----
// Maximum total number of connections.
int max_connection = Integer.parseInt(
        ReadSpiderConfig.getValue("max_connections"));
// Maximum time to wait when obtaining a connection.
int wait_connection_timeout = Integer.parseInt(
        ReadSpiderConfig.getValue("wait_connection_timeout"));
// Connection (connect) timeout.
int connection_timeout = Integer.parseInt(
        ReadSpiderConfig.getValue("connection_timeout"));
// Read timeout.
int read_timeout = Integer.parseInt(
        ReadSpiderConfig.getValue("read_timeout"));

// ---- supported schemes ----
SchemeRegistry registry = new SchemeRegistry();
registry.register(
        new Scheme("https", SSLSocketFactory.getSocketFactory(), 443));
registry.register(
        new Scheme("http", PlainSocketFactory.getSocketFactory(), 80));

// ---- connection and timeout parameters ----
HttpParams httpParams = new BasicHttpParams();
// Socket-level timeouts.
HttpConnectionParams.setSoTimeout(httpParams, read_timeout);
HttpConnectionParams.setConnectionTimeout(httpParams, connection_timeout);
// Maximum connections per route, i.e. the cap on concurrent requests to the
// same site.
ConnPerRouteBean connPerRoute = new ConnPerRouteBean(100);
ConnManagerParams.setMaxConnectionsPerRoute(httpParams, connPerRoute);
// Connection-manager limits.
ConnManagerParams.setTimeout(httpParams, wait_connection_timeout);
ConnManagerParams.setMaxTotalConnections(httpParams, max_connection);

// ---- assemble the client ----
// The parameters are fully populated before the connection manager is built.
ClientConnectionManager connectionManager =
        new ThreadSafeClientConnManager(httpParams, registry);
// Redirect override, currently disabled:
// httpParams.setParameter(ClientPNames.HANDLE_REDIRECTS,false);
HttpClient client = new DefaultHttpClient(connectionManager, httpParams);