Crawler Notes (2): Crawling and Saving the Images on a Page

Following on from the previous article, 爬蟲記錄(1) (Crawler Notes (1): crawling a page's content and writing it to a text file), this post builds on the same code and adds a few more methods to the existing classes.

1. Crawler utility class, used to fetch web page content

package com.dyw.crawler.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;

/**
 * Crawler utility class.
 * Created by dyw on 2017/9/1.
 */
public class CrawlerUtils {

    /**
     * Fetch the HTML content of a page and return it as a String.
     *
     * @param url the URL to fetch
     * @return the whole page as a single String
     */
    public static String getHtml(String url) throws Exception {
        URL url1 = new URL(url);                            // java.net.URL
        URLConnection connection = url1.openConnection();   // open the connection
        InputStream in = connection.getInputStream();       // get the input stream
        // decode explicitly as UTF-8 instead of relying on the platform default charset
        InputStreamReader isr = new InputStreamReader(in, "UTF-8");
        BufferedReader br = new BufferedReader(isr);

        String line;
        StringBuilder sb = new StringBuilder();
        while ((line = br.readLine()) != null) {  // read line by line
            sb.append(line);                      // append the line
            sb.append('\n');                      // keep the line break
        }
        // close the streams, last opened first
        br.close();
        isr.close();
        in.close();
        return sb.toString();
    }

    /**
     * Open a download stream for a URL.
     *
     * @param urlStr the URL to download from
     * @return InputStream of the response body
     */
    public static InputStream downLoadFromUrl(String urlStr) throws IOException {
        URL url = new URL(urlStr);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        // send a browser-like User-Agent so the server does not reject the crawler with a 403
        conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        // set the connect and read timeouts to 3 seconds
        conn.setConnectTimeout(3 * 1000);
        conn.setReadTimeout(3 * 1000);
        conn.setRequestProperty("Accept",
                "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-powerpoint, application/vnd.ms-excel, application/msword, */*");
        conn.setRequestProperty("Accept-Language", "zh-cn");
        conn.setRequestProperty("UA-CPU", "x86");
        // note: do not send "Accept-Encoding: gzip" here; the raw stream is written to
        // disk unchanged, so a gzip-compressed response would produce a corrupted image
        conn.setRequestProperty("Connection", "keep-alive");
        // return the input stream
        return conn.getInputStream();
    }
}
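
As a quick sanity check, the class can be exercised on its own. The following is a minimal sketch; the demo class name and the example.com URL are illustrative, not part of the original project:

package com.dyw.crawler.project;

import com.dyw.crawler.util.CrawlerUtils;

/**
 * Minimal smoke test for CrawlerUtils (demo only).
 */
public class CrawlerUtilsDemo {
    public static void main(String[] args) throws Exception {
        String html = CrawlerUtils.getHtml("http://example.com");
        // print the first 300 characters to confirm the fetch worked
        System.out.println(html.substring(0, Math.min(html.length(), 300)));
    }
}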

2. Regular-expression utility class, used to match the URLs we want to extract

package com.dyw.crawler.util;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regular-expression utility class.
 * Created by dyw on 2017/9/1.
 */
public class RegularUtils {
    // regex that matches <img> tags
    private static final String IMGURL_REG = "<img.*src=(.*?)[^>]*?>";
    // regex that matches href attribute values
    private static final String AURL_REG = "href=\"(.*?)\"";
    // regex that matches URLs starting with a scheme and ending in png|jpg|bmp|gif
    private static final String IMGSRC_REG = "[a-zA-Z]+://[^\\s]*(?:png|jpg|bmp|gif)";

    /**
     * Extract the href values of <a> tags.
     *
     * @param html the content to match against
     * @return list of matched href attributes
     */
    public static List<String> getAUrl(String html) {
        return match(AURL_REG, html);
    }

    /**
     * Extract image URLs from <img> tags.
     *
     * @param html the content to match against
     * @return list of matched image URLs
     */
    public static List<String> getIMGUrl(String html) {
        List<String> imgUrl = match(IMGURL_REG, html);
        return match(IMGSRC_REG, imgUrl);
    }

    /**
     * Extract image URLs (scheme://...png|jpg|bmp|gif) directly from the content.
     *
     * @param html the content to match against
     * @return list of matched image URLs
     */
    public static List<String> getIMGSrc(String html) {
        return match(IMGSRC_REG, html);
    }

    /**
     * Match a String against a regex and collect every match in a list.
     *
     * @param regular the regular expression
     * @param html    the content to match against
     * @return list of matches
     */
    private static List<String> match(String regular, String html) {
        Matcher matcher = Pattern.compile(regular).matcher(html);
        List<String> list = new ArrayList<>();
        while (matcher.find()) {
            list.add(matcher.group());
        }
        return list;
    }

    /**
     * Match every String in a list against a regex and collect all matches.
     *
     * @param regular the regular expression
     * @param list    the strings to match against
     * @return list of matches
     */
    private static List<String> match(String regular, List<String> list) {
        List<String> result = new ArrayList<>();
        list.forEach(string -> {
            Matcher matcher = Pattern.compile(regular).matcher(string);
            while (matcher.find()) {
                result.add(matcher.group());
            }
        });
        return result;
    }
}
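
To see what the two-step matching in getIMGUrl produces, here is a small sketch; the HTML snippet and the demo class are made up for illustration:

package com.dyw.crawler.project;

import com.dyw.crawler.util.RegularUtils;

/**
 * Demo of RegularUtils: the first pass matches the <img> tag,
 * the second keeps only the bare image URL (the snippet is made up).
 */
public class RegularUtilsDemo {
    public static void main(String[] args) {
        String html = "<a href=\"http://example.com/a.html\">link</a>"
                + "<img src=\"http://example.com/pic/1.jpg\" alt=\"demo\">";
        // prints: http://example.com/pic/1.jpg
        RegularUtils.getIMGUrl(html).forEach(System.out::println);
    }
}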

3. IO utility class, used to write the fetched content to files

package com.dyw.crawler.util;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;

/**
 * IO utility class.
 * Created by dyw on 2017/9/1.
 */
public class IOUtils {

    /**
     * Create a file if it does not exist yet.
     *
     * @param file the File to create
     */
    public static void createFile(File file) throws Exception {
        try {
            if (!file.exists()) {
                file.createNewFile();
            }
        } catch (Exception e) {
            throw new Exception("Error while creating the file!", e);
        }
    }

    /**
     * Write a String to a file.
     *
     * @param content  the content to write
     * @param fileName the target file
     */
    public static void writeFile(String content, File fileName) throws Exception {
        writeFile(content.getBytes("UTF-8"), fileName);
    }

    /**
     * Write a byte array to a file.
     *
     * @param bytes    the content to write
     * @param fileName the target file
     */
    public static void writeFile(byte[] bytes, File fileName) throws Exception {
        // try-with-resources closes the stream even when write() throws
        try (FileOutputStream o = new FileOutputStream(fileName)) {
            o.write(bytes);
        } catch (Exception e) {
            throw new Exception("Error while writing the file!", e);
        }
    }

    /**
     * Save an InputStream to a file.
     *
     * @param inputStream the input stream
     * @param fileName    the target file
     */
    public static void saveFile(InputStream inputStream, File fileName) throws Exception {
        writeFile(readInputStream(inputStream), fileName);
    }

    /**
     * Read an input stream fully into a byte array.
     *
     * @param inputStream the input stream
     * @return the bytes read
     */
    private static byte[] readInputStream(InputStream inputStream) throws IOException {
        byte[] buffer = new byte[1024];
        int len;
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        while ((len = inputStream.read(buffer)) != -1) {
            bos.write(buffer, 0, len);
        }
        bos.close();
        inputStream.close();
        return bos.toByteArray();
    }
}
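
A quick usage sketch of the IO helpers; the demo class and file name are illustrative:

package com.dyw.crawler.project;

import com.dyw.crawler.util.IOUtils;

import java.io.File;

/**
 * Demo of IOUtils (demo only).
 */
public class IOUtilsDemo {
    public static void main(String[] args) throws Exception {
        File file = new File("demo.txt");
        IOUtils.createFile(file);                  // create the file if missing
        IOUtils.writeFile("hello crawler", file);  // write the string as UTF-8 bytes
    }
}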

4. Running it: the main method

package com.dyw.crawler.project;

import com.dyw.crawler.util.CrawlerUtils;
import com.dyw.crawler.util.IOUtils;
import com.dyw.crawler.util.RegularUtils;

import java.io.File;
import java.io.InputStream;
import java.util.List;

/**
 * Download the images on a web page.
 * Created by dyw on 2017/9/4.
 */
public class Project1 {
    public static void main(String[] args) {
        // directory where the downloaded images are saved
        String path = "C:\\Users\\dyw\\Desktop\\crawler";
        // address of the site to crawl
        String url = "http://blog.csdn.net/juewang_love";
        // make sure the target directory exists before writing into it
        new File(path).mkdirs();
        // fetch the page content
        String htmlContent;
        try {
            htmlContent = CrawlerUtils.getHtml(url);
        } catch (Exception e) {
            throw new RuntimeException("Failed to fetch the page content!", e);
        }
        // collect all image URLs from the <img> tags
        List<String> imgUrls = RegularUtils.getIMGUrl(htmlContent);
        // download each image in turn, naming it after the last path segment of its URL
        imgUrls.forEach(imgUrl -> {
            String[] split = imgUrl.split("/");
            String imgName = split[split.length - 1];
            try {
                File file1 = new File(path + "/" + imgName);
                InputStream inputStream = CrawlerUtils.downLoadFromUrl(imgUrl);
                IOUtils.saveFile(inputStream, file1);
                System.out.println("success:" + imgName);
            } catch (Exception e) {
                System.out.println("fail:" + imgUrl + " " + imgName);
            }
        });
    }
}
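
One thing to watch out for: the image name is taken verbatim from the last path segment of the URL, which can contain characters that are illegal in Windows file names (for example a `?` from a query string). A possible hardening step, not part of the original code, is a small sanitizer like the sketch below:

package com.dyw.crawler.util;

/**
 * Optional helper (not in the original project): makes a URL segment
 * safe to use as a Windows file name.
 */
public class FileNameUtils {
    public static String sanitize(String name) {
        // replace characters Windows forbids in file names with underscores
        return name.replaceAll("[\\\\/:*?\"<>|]", "_");
    }
}

The download loop would then build the target as new File(path + "/" + FileNameUtils.sanitize(imgName)).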


5. Reworking the CrawlerUtils class: replacing URLConnection with HttpClient

package com.dyw.crawler.util;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

/**
 * Crawler utility class.
 * Created by dyw on 2017/9/1.
 */
public class CrawlerUtils {

    /**
     * Set common request headers on an http method.
     *
     * @param httpMethod the http method to configure
     */
    private static void setHead(HttpMethod httpMethod) {
        // browser-like User-Agent to avoid being rejected as a bot
        httpMethod.setRequestHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        httpMethod.setRequestHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    }

    /**
     * Fetch the HTML content of a page and return it as a String (GET).
     *
     * @param url the URL to fetch
     * @return the whole page as a single String
     */
    public static String getHtml(String url) throws Exception {
        InputStream inputStream = downLoadFromUrl(url);
        BufferedReader br = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
        StringBuilder stringBuilder = new StringBuilder();
        String str;
        while ((str = br.readLine()) != null) {
            stringBuilder.append(str);
            stringBuilder.append('\n');  // keep the line break
        }
        br.close();
        return stringBuilder.toString();
    }

    /**
     * Open a download stream for a URL (GET).
     *
     * @param urlStr the URL to download from
     * @return InputStream of the response body
     */
    public static InputStream downLoadFromUrl(String urlStr) throws IOException {
        // HttpClient replaces the raw URLConnection used in the earlier version
        HttpClient httpClient = new HttpClient();
        HttpMethod httpMethod = new GetMethod(urlStr);
        setHead(httpMethod);
        int status = httpClient.executeMethod(httpMethod);
        // fail fast on a non-200 response instead of returning null
        if (status != HttpStatus.SC_OK) {
            throw new IOException("Unexpected HTTP status " + status + " for " + urlStr);
        }
        return httpMethod.getResponseBodyAsStream();
    }
}
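
The org.apache.commons.httpclient classes above come from the legacy Commons HttpClient 3.x library, which has since been superseded by Apache HttpComponents. If you are following along with Maven, the dependency should look roughly like this (3.1 was the final 3.x release):

<dependency>
    <groupId>commons-httpclient</groupId>
    <artifactId>commons-httpclient</artifactId>
    <version>3.1</version>
</dependency>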