Java爬蟲歷險記 – (1)爬取百度首頁的logo
在這篇文章裏,介紹兩種方式來獲取百度網頁的logo: (1)Httpclient (2) jsoup + Httpclient ,詳細的運行結果可以參看文章末的參考資料。代碼使用的.jar包,如下圖:
第一種:只使用Httpclient
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
public class TestOne {
    /**
     * Fetches the content of the given URL and returns it as one string.
     * Lines are concatenated WITHOUT separators — the regex in main()
     * ("src=//(.+?) ") relies on this single-line form.
     *
     * @param url absolute URL to fetch (including scheme)
     * @return page content, or "" if the request failed
     */
    static String sendGet(String url) {
        // StringBuilder avoids O(n^2) repeated string concatenation
        StringBuilder result = new StringBuilder();
        try {
            URL realUrl = new URL(url);
            URLConnection connection = realUrl.openConnection();
            connection.connect();
            // try-with-resources closes the reader even on error, replacing
            // the manual finally block that swallowed close() exceptions.
            // Charset is pinned to UTF-8 instead of the platform default.
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
                String line;
                while ((line = in.readLine()) != null) {
                    result.append(line);
                }
            }
        } catch (Exception e) {
            System.out.println("發送GET請求出現異常!" + e);
            e.printStackTrace();
        }
        return result.toString();
    }

    /**
     * Returns the first capture group of patternStr found in targetStr,
     * or the literal string "Nothing" when there is no match.
     *
     * @param targetStr  text to search
     * @param patternStr regular expression with at least one capture group
     */
    static String RegexString(String targetStr, String patternStr) {
        Matcher matcher = Pattern.compile(patternStr).matcher(targetStr);
        return matcher.find() ? matcher.group(1) : "Nothing";
    }

    /**
     * Downloads the resource at "http://" + url and saves it via download().
     *
     * @param url host + path WITHOUT scheme (scheme is prepended here)
     * @return local path of the saved image, or "" on failure
     */
    public static String get(String url) {
        String filename = "";
        String targetUrl = "http://" + url;
        // try-with-resources closes response before client (reverse of
        // declaration order) — the original closed the client first, which
        // invalidates the still-open response.
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
             CloseableHttpResponse response = httpclient.execute(new HttpGet(targetUrl))) {
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                System.out.println(response.getStatusLine());
                filename = download(response.getEntity());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return filename;
    }

    /**
     * Writes the entity's content to d:\pic\b_logo.png, creating the
     * directory if needed.
     *
     * @param resEntity HTTP entity containing the image bytes; may be null
     * @return the saved file path, or null when resEntity is null
     */
    private static String download(HttpEntity resEntity) {
        if (resEntity == null) {
            return null;
        }
        String dirPath = "d:\\pic\\";
        String fileName = "b_logo.png";
        File dir = new File(dirPath);
        if (!dir.exists()) {
            // mkdirs creates missing parents too; mkdir fails if any parent
            // is absent
            dir.mkdirs();
        }
        // FileOutputStream creates the file itself — no createNewFile needed
        File filePath = new File(dirPath.concat(fileName));
        // try-with-resources replaces the manual finally whose empty catch
        // silently swallowed IOException on close
        try (InputStream in = resEntity.getContent();
             BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(filePath))) {
            byte[] buffer = new byte[8192];
            int len;
            while ((len = in.read(buffer)) != -1) {
                out.write(buffer, 0, len);
            }
            out.flush();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return filePath.toString();
    }

    public static void main(String[] args) {
        // 定義即將訪問的鏈接
        String url = "https://www.baidu.com/";
        // Fetch the page source
        String result = sendGet(url);
        // Extract the logo's protocol-relative src (note the trailing space
        // in the pattern — it terminates the non-greedy group)
        String imgSrc = RegexString(result, "src=//(.+?) ");
        System.out.println(imgSrc);
        // Save the image locally
        get(imgSrc);
    }
}
注意點:
(1)正則表達式匹配:"src=//(.+?) " 在右括號 ) 後面有一個空格,用來結束非貪婪匹配,否則匹配不成功。
(2)要導入 Httpclient 的 .jar 包。
(3)在get圖片時候,要將其路徑補充完整: String tergetUrl = “http://” + url;
第二種:jsoup + Httpclient
import java.io.*;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class TestTwo {
    public static void main(String[] args) {
        String url = "http://www.baidu.com";
        String result = getBaiduPic(url);
        // The src attribute is protocol-relative ("//..."), so a scheme must
        // be prepended; schemes are canonically lowercase.
        String picUrl = "https:" + result;
        downPicture(picUrl);
    }

    /**
     * Parses the Baidu home page with jsoup and returns the src attribute of
     * the logo image inside the element whose id is "lg".
     *
     * @param url page to load
     * @return the (protocol-relative) image URL, or "" if not found / on error
     */
    public static String getBaiduPic(String url) {
        String result = "";
        try {
            // Load the page into a jsoup Document
            Document doc = Jsoup.connect(url).get();
            // The logo lives inside the element with id="lg"
            Elements listDiv = doc.getElementsByAttributeValue("id", "lg");
            for (Element element : listDiv) {
                for (Element link : element.getElementsByTag("img")) {
                    // src holds the image link (last match wins, as before)
                    result = link.attr("src");
                    System.out.println(result);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Downloads the image at the given URL to D://pic//LOGO.jpg.
     *
     * @param url absolute image URL
     */
    public static void downPicture(String url) {
        File dir = new File("D://pic");
        // try-with-resources closes the client — the original leaked it
        try (CloseableHttpClient httpClient = HttpClientBuilder.create().build()) {
            HttpGet httpGet = new HttpGet(url);
            HttpResponse response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == 200) {
                // mkdirs creates missing parents as well
                dir.mkdirs();
                HttpEntity entity = response.getEntity();
                // Buffered copy instead of the original one-byte-at-a-time
                // read loop (one syscall per byte). Streams auto-close here,
                // replacing the manual finally blocks. The original also
                // called httpGet.abort() AFTER fully consuming the entity,
                // which is a misuse — simply closing the streams suffices.
                try (InputStream in = entity.getContent();
                     OutputStream out = new FileOutputStream(new File("D://pic//LOGO.jpg"))) {
                    byte[] buffer = new byte[8192];
                    int len;
                    while ((len = in.read(buffer)) != -1) {
                        out.write(buffer, 0, len);
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
注意點:
(1)在 downPicture() 中,inputStream 宣告為 java.io.InputStream 並先初始化為 null,因為它要等到 try 區塊內由 HttpEntity entity = response.getEntity(); 取得回應實體後,才透過 entity.getContent() 被賦值;宣告在 try 之外是為了能在 finally 中關閉它。
參考資料:
(1)行走江湖的少俠哥 – 第2節—小任務,爬取百度LOGO鏈接並下載圖片 : http://blog.csdn.net/sinat_32588261/article/details/72287108
(2)Mr_river – Java簡單爬蟲系列 : https://my.oschina.net/u/2519530/blog/597359
(3)汪海的實驗室 – [Java]知乎下巴第1集:爬蟲世界百度不僅僅可以拿來測網速 : http://blog.csdn.net/pleasecallmewhy/article/details/17594303
(這個博客可以看下討論區)