Crawler Notes (2) — Crawling the Images on a Page and Saving Them

Following up on the previous post, Crawler Notes (1) — crawling a page and writing its content to a text file, we add a few more methods to the classes introduced there.

1. Crawler utility class, used to fetch page content

package com.dyw.crawler.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;

/**
 * Crawler utility class.
 * Created by dyw on 2017/9/1.
 */
public class CrawlerUtils {

    /**
     * Fetch the HTML content and return it as a String.
     *
     * @param url the URL to fetch
     * @return the whole page as a String
     */
    public static String getHtml(String url) throws Exception {
        URL url1 = new URL(url);                          // java.net.URL
        URLConnection connection = url1.openConnection(); // open the connection
        InputStream in = connection.getInputStream();     // get the input stream
        InputStreamReader isr = new InputStreamReader(in, "UTF-8"); // decode explicitly rather than with the platform default
        BufferedReader br = new BufferedReader(isr);

        String line;
        StringBuilder sb = new StringBuilder();
        while ((line = br.readLine()) != null) { // read line by line
            sb.append(line);
            sb.append('\n');                     // keep the line break
        }
        // close the streams, last opened first
        br.close();
        isr.close();
        in.close();
        return sb.toString();
    }

    /**
     * Open a download stream for a URL.
     *
     * @param urlStr the URL to request
     * @return InputStream over the response body
     */
    public static InputStream downLoadFromUrl(String urlStr) throws IOException {
        URL url = new URL(urlStr);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        // send a browser-like User-Agent so sites that block crawlers don't answer with 403
        conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        // connect timeout: 3 seconds
        conn.setConnectTimeout(3 * 1000);
        conn.setRequestProperty("Accept",
                "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-powerpoint, application/vnd.ms-excel, application/msword, */*");
        conn.setRequestProperty("Accept-Language", "zh-cn");
        conn.setRequestProperty("UA-CPU", "x86");
        // note: no "Accept-Encoding: gzip" here -- we save the raw stream to disk,
        // and a gzip-compressed response body would come out as corrupt image bytes
        conn.setRequestProperty("Content-type", "text/html");
        conn.setRequestProperty("Connection", "keep-alive");
        // hand back the response stream
        return conn.getInputStream();
    }
}
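
A quick way to exercise the class (a minimal sketch; CrawlerUtilsDemo is a hypothetical test class, and the URL is the blog crawled later in this post):

import com.dyw.crawler.util.CrawlerUtils;

public class CrawlerUtilsDemo {
    public static void main(String[] args) throws Exception {
        // fetch a page and report how much text came back
        String html = CrawlerUtils.getHtml("http://blog.csdn.net/juewang_love");
        System.out.println("html length: " + html.length());
    }
}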

2. Regular expression utility class, used to match the URLs we need

package com.dyw.crawler.util;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regular expression utility class.
 * Created by dyw on 2017/9/1.
 */
public class RegularUtils {
    // matches an <img> tag (bounded with [^>] so one match cannot swallow several tags)
    private static final String IMGURL_REG = "<img[^>]*?src=(.*?)[^>]*?>";
    // matches an href attribute value
    private static final String AURL_REG = "href=\"(.*?)\"";
    // matches a URL that starts with a scheme and ends in png|jpg|bmp|gif
    private static final String IMGSRC_REG = "[a-zA-Z]+://[^\\s]*(?:png|jpg|bmp|gif)";

    /**
     * Match the href values of A tags.
     *
     * @param html the content to match against
     * @return List of matches
     */
    public static List<String> getAUrl(String html) {
        return match(AURL_REG, html);
    }

    /**
     * Match the IMG tags, then keep only the image URLs inside them.
     *
     * @param html the content to match against
     * @return List of matches
     */
    public static List<String> getIMGUrl(String html) {
        List<String> imgUrl = match(IMGURL_REG, html);
        return match(IMGSRC_REG, imgUrl);
    }

    /**
     * Match image URLs directly in the content.
     *
     * @param html the content to match against
     * @return List of matches
     */
    public static List<String> getIMGSrc(String html) {
        return match(IMGSRC_REG, html);
    }

    /**
     * Run a regular expression over a String and collect the matches into a list.
     *
     * @param regular the regular expression
     * @param html    the content to match against
     * @return List of matches
     */
    private static List<String> match(String regular, String html) {
        Matcher matcher = Pattern.compile(regular).matcher(html);
        List<String> list = new ArrayList<>();
        while (matcher.find()) {
            list.add(matcher.group());
        }
        return list;
    }

    /**
     * Run a regular expression over each String in a list and collect all matches.
     *
     * @param regular the regular expression
     * @param list    the strings to match against
     * @return List of matches
     */
    private static List<String> match(String regular, List<String> list) {
        List<String> result = new ArrayList<>();
        list.forEach(string -> {
            Matcher matcher = Pattern.compile(regular).matcher(string);
            while (matcher.find()) {
                result.add(matcher.group());
            }
        });
        return result;
    }
}
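
To see what getIMGUrl actually returns, here is a minimal sketch (RegularUtilsDemo is a hypothetical test class, and the HTML fragment is hand-written):

import com.dyw.crawler.util.RegularUtils;

public class RegularUtilsDemo {
    public static void main(String[] args) {
        // one absolute image URL and one relative one
        String html = "<img src=\"http://example.com/a.jpg\">"
                + "<img src=\"/static/b.png\">";
        // only the absolute URL survives the two-step match
        RegularUtils.getIMGUrl(html).forEach(System.out::println);
        // prints: http://example.com/a.jpg
    }
}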

3. IO utility class, used to write the fetched content to files

package com.dyw.crawler.util;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;

/**
 * IO utility class.
 * Created by dyw on 2017/9/1.
 */
public class IOUtils {

    /**
     * Create the file if it does not exist.
     *
     * @param file the File to create
     */
    public static void createFile(File file) throws Exception {
        try {
            if (!file.exists()) {
                file.createNewFile();
            }
        } catch (Exception e) {
            throw new Exception("Error while creating the file!", e);
        }
    }

    /**
     * Write a String to a file.
     *
     * @param content  the content to write
     * @param fileName the target file
     */
    public static void writeFile(String content, File fileName) throws Exception {
        writeFile(content.getBytes("UTF-8"), fileName);
    }

    /**
     * Write a byte array to a file.
     *
     * @param bytes    the content to write
     * @param fileName the target file
     */
    public static void writeFile(byte[] bytes, File fileName) throws Exception {
        // try-with-resources closes the stream even when write() throws
        try (FileOutputStream o = new FileOutputStream(fileName)) {
            o.write(bytes);
        } catch (Exception e) {
            throw new Exception("Error while writing the file!", e);
        }
    }

    /**
     * Save an InputStream to a file.
     *
     * @param inputStream the input stream
     * @param fileName    the file to save to
     */
    public static void saveFile(InputStream inputStream, File fileName) throws Exception {
        writeFile(readInputStream(inputStream), fileName);
    }

    /**
     * Read an input stream fully into a byte array.
     *
     * @param inputStream the input stream
     * @return the bytes read
     */
    private static byte[] readInputStream(InputStream inputStream) throws IOException {
        byte[] buffer = new byte[1024];
        int len = 0;
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        while ((len = inputStream.read(buffer)) != -1) {
            bos.write(buffer, 0, len);
        }
        bos.close();
        inputStream.close();
        return bos.toByteArray();
    }
}
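
A minimal sketch of the write path (IOUtilsDemo is a hypothetical test class); saveFile routes a stream through readInputStream and writeFile, the same path the image download takes:

import com.dyw.crawler.util.IOUtils;

import java.io.ByteArrayInputStream;
import java.io.File;

public class IOUtilsDemo {
    public static void main(String[] args) throws Exception {
        File file = new File("demo.txt");
        IOUtils.createFile(file);
        // an in-memory stream stands in for the HTTP response body
        IOUtils.saveFile(new ByteArrayInputStream("hello crawler".getBytes("UTF-8")), file);
    }
}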

4. The main method

package com.dyw.crawler.project;

import com.dyw.crawler.util.CrawlerUtils;
import com.dyw.crawler.util.IOUtils;
import com.dyw.crawler.util.RegularUtils;

import java.io.File;
import java.io.InputStream;
import java.util.List;

/**
 * Download the images on a page.
 * Created by dyw on 2017/9/4.
 */
public class Project1 {
    public static void main(String[] args) {
        // directory the images are saved into
        String path = "C:\\Users\\dyw\\Desktop\\crawler";
        // the page to crawl
        String url = "http://blog.csdn.net/juewang_love";
        // make sure the target directory exists before writing into it
        new File(path).mkdirs();
        // fetch the page content
        String htmlContent;
        try {
            htmlContent = CrawlerUtils.getHtml(url);
        } catch (Exception e) {
            throw new RuntimeException("Failed to fetch the page!", e);
        }
        // collect all image URLs
        List<String> imgUrls = RegularUtils.getIMGUrl(htmlContent);
        // download each image in turn
        imgUrls.forEach(imgUrl -> {
            String[] split = imgUrl.split("/");
            String imgName = split[split.length - 1]; // the last path segment doubles as the file name
            try {
                File file1 = new File(path + "/" + imgName);
                InputStream inputStream = CrawlerUtils.downLoadFromUrl(imgUrl);
                IOUtils.saveFile(inputStream, file1);
                System.out.println("success:" + imgName);
            } catch (Exception e) {
                System.out.println("fail:" + imgUrl + " " + imgName);
            }
        });
    }
}
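
One limitation worth noting: IMGSRC_REG only keeps absolute URLs, so images referenced with a relative src are silently skipped. If you need them too, a relative path can be resolved against the page address with java.net.URL; a minimal sketch (resolveImgUrl is a hypothetical helper, not part of the classes above):

import java.net.MalformedURLException;
import java.net.URL;

public class UrlResolver {
    /**
     * Resolve a possibly-relative img src against the page URL.
     */
    public static String resolveImgUrl(String pageUrl, String src) throws MalformedURLException {
        // new URL(context, spec) applies the standard relative-URL resolution rules
        return new URL(new URL(pageUrl), src).toString();
    }
}

For example, resolveImgUrl("http://blog.csdn.net/juewang_love", "/static/b.png") returns "http://blog.csdn.net/static/b.png".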


5. Reworking CrawlerUtils to use HttpClient instead of URLConnection

package com.dyw.crawler.util;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

/**
 * Crawler utility class (HttpClient version).
 * Created by dyw on 2017/9/1.
 */
public class CrawlerUtils {

    /**
     * Set the request headers on an HTTP method.
     *
     * @param httpMethod the HTTP method to configure
     */
    private static void setHead(HttpMethod httpMethod) {
        httpMethod.setRequestHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        // "Utf-8" on its own is not a valid Content-Type; name the media type and the charset
        httpMethod.setRequestHeader("Content-Type", "text/html; charset=UTF-8");
        httpMethod.setRequestHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    }

    /**
     * Fetch the HTML content and return it as a String (GET).
     *
     * @param url the URL to fetch
     * @return the whole page as a String
     */
    public static String getHtml(String url) throws Exception {
        InputStream inputStream = downLoadFromUrl(url);
        BufferedReader br = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
        StringBuilder stringBuilder = new StringBuilder();
        String str;
        while ((str = br.readLine()) != null) {
            stringBuilder.append(str);
            stringBuilder.append('\n'); // keep the line break
        }
        br.close();
        return stringBuilder.toString();
    }

    /**
     * Open a download stream for a URL (GET).
     *
     * @param urlStr the URL to request
     * @return InputStream over the response body
     */
    public static InputStream downLoadFromUrl(String urlStr) throws IOException {
        // HttpClient replaces the earlier URLConnection code
        HttpClient httpClient = new HttpClient();
        HttpMethod httpMethod = new GetMethod(urlStr);
        setHead(httpMethod);
        int status = httpClient.executeMethod(httpMethod);
        if (status != HttpStatus.SC_OK) {
            // fail loudly instead of returning null, which would only surface later as an NPE
            throw new IOException("Unexpected HTTP status " + status + " for " + urlStr);
        }
        return httpMethod.getResponseBodyAsStream();
    }
}
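
This version depends on Commons HttpClient 3.x (the old org.apache.commons.httpclient package, since superseded by Apache HttpComponents). If the project uses Maven, the dependency looks like this; 3.1 is the last release of that line:

<dependency>
    <groupId>commons-httpclient</groupId>
    <artifactId>commons-httpclient</artifactId>
    <version>3.1</version>
</dependency>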