Java爬蟲歷險記 – (1)爬取百度首頁的logo
在這篇文章裏,介紹兩種方式來獲取百度網頁的logo: (1)Httpclient (2) jsoup + Httpclient ,詳細的運行結果可以參看文章末的參考資料。代碼使用的.jar包,如下圖:
第一種:只使用Httpclient
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
public class TestOne {
    /**
     * Fetches the content of the given URL and returns it as one string.
     * Lines are concatenated WITHOUT separators — the regex in main()
     * ("src=//(.+?) ") relies on this single-line form.
     *
     * @param url absolute URL to fetch (including scheme)
     * @return page content, or "" if the request failed
     */
    static String sendGet(String url) {
        // StringBuilder avoids O(n^2) repeated string concatenation
        StringBuilder result = new StringBuilder();
        try {
            URL realUrl = new URL(url);
            URLConnection connection = realUrl.openConnection();
            connection.connect();
            // try-with-resources closes the reader even on error, replacing
            // the manual finally block that swallowed close() exceptions.
            // Charset is pinned to UTF-8 instead of the platform default.
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
                String line;
                while ((line = in.readLine()) != null) {
                    result.append(line);
                }
            }
        } catch (Exception e) {
            System.out.println("發送GET請求出現異常!" + e);
            e.printStackTrace();
        }
        return result.toString();
    }

    /**
     * Returns the first capture group of patternStr found in targetStr,
     * or the literal string "Nothing" when there is no match.
     *
     * @param targetStr  text to search
     * @param patternStr regular expression with at least one capture group
     */
    static String RegexString(String targetStr, String patternStr) {
        Matcher matcher = Pattern.compile(patternStr).matcher(targetStr);
        return matcher.find() ? matcher.group(1) : "Nothing";
    }

    /**
     * Downloads the resource at "http://" + url and saves it via download().
     *
     * @param url host + path WITHOUT scheme (scheme is prepended here)
     * @return local path of the saved image, or "" on failure
     */
    public static String get(String url) {
        String filename = "";
        String targetUrl = "http://" + url;
        // try-with-resources closes response before client (reverse of
        // declaration order) — the original closed the client first, which
        // invalidates the still-open response.
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
             CloseableHttpResponse response = httpclient.execute(new HttpGet(targetUrl))) {
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                System.out.println(response.getStatusLine());
                filename = download(response.getEntity());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return filename;
    }

    /**
     * Writes the entity's content to d:\pic\b_logo.png, creating the
     * directory if needed.
     *
     * @param resEntity HTTP entity containing the image bytes; may be null
     * @return the saved file path, or null when resEntity is null
     */
    private static String download(HttpEntity resEntity) {
        if (resEntity == null) {
            return null;
        }
        String dirPath = "d:\\pic\\";
        String fileName = "b_logo.png";
        File dir = new File(dirPath);
        if (!dir.exists()) {
            // mkdirs creates missing parents too; mkdir fails if any parent
            // is absent
            dir.mkdirs();
        }
        // FileOutputStream creates the file itself — no createNewFile needed
        File filePath = new File(dirPath.concat(fileName));
        // try-with-resources replaces the manual finally whose empty catch
        // silently swallowed IOException on close
        try (InputStream in = resEntity.getContent();
             BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(filePath))) {
            byte[] buffer = new byte[8192];
            int len;
            while ((len = in.read(buffer)) != -1) {
                out.write(buffer, 0, len);
            }
            out.flush();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return filePath.toString();
    }

    public static void main(String[] args) {
        // 定義即將訪問的鏈接
        String url = "https://www.baidu.com/";
        // Fetch the page source
        String result = sendGet(url);
        // Extract the logo's protocol-relative src (note the trailing space
        // in the pattern — it terminates the non-greedy group)
        String imgSrc = RegexString(result, "src=//(.+?) ");
        System.out.println(imgSrc);
        // Save the image locally
        get(imgSrc);
    }
}
注意點:
(1)正則表達式匹配:"src=//(.+?) " 在右括號 ) 後面有一個空格,用來結束非貪婪匹配,否則匹配不成功。
(2)要導入 Httpclient 的 .jar 包。
(3)在get圖片時候,要將其路徑補充完整: String tergetUrl = “http://” + url;
第二種:jsoup + Httpclient
import java.io.*;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class TestTwo {
    public static void main(String[] args) {
        String url = "http://www.baidu.com";
        String result = getBaiduPic(url);
        // The src attribute is protocol-relative ("//..."), so a scheme must
        // be prepended; schemes are canonically lowercase.
        String picUrl = "https:" + result;
        downPicture(picUrl);
    }

    /**
     * Parses the Baidu home page with jsoup and returns the src attribute of
     * the logo image inside the element whose id is "lg".
     *
     * @param url page to load
     * @return the (protocol-relative) image URL, or "" if not found / on error
     */
    public static String getBaiduPic(String url) {
        String result = "";
        try {
            // Load the page into a jsoup Document
            Document doc = Jsoup.connect(url).get();
            // The logo lives inside the element with id="lg"
            Elements listDiv = doc.getElementsByAttributeValue("id", "lg");
            for (Element element : listDiv) {
                for (Element link : element.getElementsByTag("img")) {
                    // src holds the image link (last match wins, as before)
                    result = link.attr("src");
                    System.out.println(result);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Downloads the image at the given URL to D://pic//LOGO.jpg.
     *
     * @param url absolute image URL
     */
    public static void downPicture(String url) {
        File dir = new File("D://pic");
        // try-with-resources closes the client — the original leaked it
        try (CloseableHttpClient httpClient = HttpClientBuilder.create().build()) {
            HttpGet httpGet = new HttpGet(url);
            HttpResponse response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == 200) {
                // mkdirs creates missing parents as well
                dir.mkdirs();
                HttpEntity entity = response.getEntity();
                // Buffered copy instead of the original one-byte-at-a-time
                // read loop (one syscall per byte). Streams auto-close here,
                // replacing the manual finally blocks. The original also
                // called httpGet.abort() AFTER fully consuming the entity,
                // which is a misuse — simply closing the streams suffices.
                try (InputStream in = entity.getContent();
                     OutputStream out = new FileOutputStream(new File("D://pic//LOGO.jpg"))) {
                    byte[] buffer = new byte[8192];
                    int len;
                    while ((len = in.read(buffer)) != -1) {
                        out.write(buffer, 0, len);
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
注意點:
(1)在 downPicture() 中,inputStream 宣告為 java.io.InputStream 並先初始化為 null,因為它要等到 try 區塊內由 HttpEntity entity = response.getEntity(); 取得回應實體後,才透過 entity.getContent() 被賦值;宣告在 try 之外是為了能在 finally 中關閉它。
參考資料:
(1)行走江湖的少俠哥 – 第2節—小任務,爬取百度LOGO鏈接並下載圖片 : http://blog.csdn.net/sinat_32588261/article/details/72287108
(2)Mr_river – Java簡單爬蟲系列 : https://my.oschina.net/u/2519530/blog/597359
(3)汪海的實驗室 – [Java]知乎下巴第1集:爬蟲世界百度不僅僅可以拿來測網速 : http://blog.csdn.net/pleasecallmewhy/article/details/17594303
(這個博客可以看下討論區)