Java 抓取網頁上的圖片

原創

2020-07-03 21:11

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.sun.xml.internal.fastinfoset.stax.events.Util;

public class CatchPicture {

	/**
	 * @param args
	 */
	public static void main(String[] args) {
		// TODO Auto-generated method stub
		//定義抓取圖片的 正則表達式
		String regular="[*]<b>.*?</b><br/><img src=\"(.*?)\" border=0 alt=\'(.*?)\' style=\".*?\" class=\".*?\">
";
		List<Picture> list=new CatchPicture().lookWeiboPic("http://gaoxiao.jokeji.cn/GrapHtml/dongtai/20120921221658.htm","GBK",regular,"2,1");
		System.out.println(list.size());
	}
	//根據URL查看網站上的圖片
	public List<Picture> lookWeiboPic(String url,String charset,String regular,String attIndex){
		List<Picture> list=new ArrayList<Picture>();
		try {
			//獲取填寫的url
			//判斷所屬網站 獲取 正則表達式
			//獲取圖片存放到 list集合
			if(!Util.isEmptyString(url)){
					String htmls = getPageSource(url.trim(),charset);
					Pattern pattern =null;
					pattern = Pattern.compile(regular.trim());
					if(!Util.isEmptyString(htmls)){
						Matcher matcher = pattern.matcher(htmls);
						
						//得到參數屬性順序
						String[] sort = regular.trim().split(","); //下標：0 表示 標題title ， 1 表示 圖片路徑 
						//判斷後綴後 得到網站的請求頭部 http://www.moonbasa.com/p-032111106.html-->得到 http://www.moonbasa.com
						String[] suffix;
						suffix =url.trim().split("cn");
						String httphread = "";
						if (suffix.length > 1) {
							httphread = suffix[0] + "cn";
	
						} else {
							suffix = url.trim().split("com");
							httphread = suffix[0] + "com";
						}
						//循環匹配找到的
						while(matcher.find()){
							Picture picture=new Picture();
							
							//匹配出title
							if (-1 == Integer.parseInt(sort[0])) {
								// 頁面上抓不到標題
								picture.setTitle("");
							} else {
								// 去標題的#
								String title=matcher.group(Integer.parseInt(sort[0])).replace("#", " ");
								picture.setTitle(title);
							}
							
							//匹配出source
							if (-1 == Integer.parseInt(sort[1])) {
								// 頁面上抓不到圖片路徑
								picture.setSource("");
							}else{
								String webImgUrl=matcher.group(Integer.parseInt(sort[1]));
								//判斷是絕對路徑還是相對路徑
								String[] pathType=webImgUrl.split(":");
								if(pathType.length>1){
									//絕對路徑
									picture.setSource(webImgUrl);
								}else{
									//判斷相對路徑是否含有..
									pathType=webImgUrl.split("\\.\\.");
									if(pathType.length>1){
										picture.setSource(httphread+pathType[1]);
									}else{
										if(webImgUrl.startsWith("/")){
											picture.setSource(httphread+pathType[0]);
										}else{
											picture.setSource(httphread+"/"+pathType[0]);
										}
									}
								}
							}
							String upPath=upload(picture.getSource(),"d:\\image\\");
							picture.setUpPath(upPath);
							list.add(picture);
						}//--end while
					}
		
				}
			}catch (Exception e) {
				e.printStackTrace();
			}
		return list;
	} 
	
	/**
	 * 根據網路路徑獲取 頁面源碼
	 * @param pageUrl
	 * @param encoding
	 * @return
	 */
	public String getPageSource(String pageUrl,String encoding) {    
    StringBuffer sb = new StringBuffer();    
    try {    
        //構建一URL對象    
        URL url = new URL(pageUrl);    
        //使用openStream得到一輸入流並由此構造一個BufferedReader對象    
        BufferedReader in = new BufferedReader(new InputStreamReader(url    
                .openStream(), encoding));    
        String line;    
        //讀取www資源    
        while ((line = in.readLine()) != null) {    
            sb.append(line);    
            sb.append("\n");  
        }    
        in.close();    
    } catch (Exception ex) {    
        System.err.println(ex);    
    }    
    return sb.toString();    
}   
	
	/**
	 * 上傳 圖片 
	 * @param urlStr
	 * @param path
	 * @return
	 * @throws Exception 
	 */
	public String upload(String urlStr,String path) throws Exception{
		Calendar calendar = Calendar.getInstance();
		String month = calendar.get(Calendar.YEAR) + "/"
				+ (calendar.get(Calendar.MONTH) + 1);
		String filename = java.util.UUID.randomUUID().toString()
				+ getExtension(urlStr);
		path =path + month + "/";
		download(urlStr,path,filename);
		return path+month + "/" + filename;
	}
	/**
	 * 根據路徑 下載圖片 然後 保存到對應的目錄下
	 * @param urlString
	 * @param filename
	 * @param savePath
	 * @return
	 * @throws Exception
	 */
	public void download(String urlString, String filename,String savePath) throws Exception {
	    // 構造URL
	    URL url = new URL(urlString);
	    // 打開連接
	    URLConnection con = url.openConnection();
	    //設置請求的路徑
	    con.setConnectTimeout(5*1000);
	    // 輸入流
	    InputStream is = con.getInputStream();
	
	    // 1K的數據緩衝
	    byte[] bs = new byte[1024];
	    // 讀取到的數據長度
	    int len;
	    // 輸出的文件流
	   File sf=new File(savePath);
	   if(!sf.exists()){
		   sf.mkdirs();
	   }
	   OutputStream os = new FileOutputStream(sf.getPath()+"\\"+filename);
	    // 開始讀取
	    while ((len = is.read(bs)) != -1) {
	      os.write(bs, 0, len);
	    }
	    // 完畢，關閉所有鏈接
	    os.close();
	    
	    is.close();
	} 
	
/**
 * 根據文件名 獲取文件的後綴名
 * @param fileUrl
 * @return
 */
 public String getExtension(String fileUrl){
	 return fileUrl.substring(fileUrl.lastIndexOf("."), fileUrl.length());
 }
}

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

Java 抓取網頁上的圖片

「Pygors跨平臺GUI」1：Pygors跨平臺GUI應用研究

[轉帖]

python列出centos7內存使用前50的進程信息

「Pygors跨平臺GUI」2：安裝MinGW-w64、MSYS2還是WSL2

一鍵自動化博客發佈工具,用過的人都說好(掘金篇)

通義千問 2.5 “客串” ChatGPT4，你分的清嗎？

Garnet：微軟官方基於.NET開源的高性能分佈式緩存存儲數據庫

Flink執行圖

Java響應式編程

評估統計算法在銀行僞造鈔票檢測中的價值

Photoshop 快捷鍵

opencms中讓內容類型自動添加模板

MacBook air無法啓動了，一直無限菊花

opencms批量建立測試新聞

OpenCms8創建模板指南（Beginner's guide to template creation with OpenCms 8）

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結