Java 抓取全部網頁內容

import java.io.BufferedReader; 
import java.io.IOException; 
import java.io.InputStreamReader; 
import java.net.MalformedURLException; 
import java.net.URL; 
import java.util.ArrayList; 
import java.util.List; 
import java.util.regex.Matcher; 
import java.util.regex.Pattern; 
 
/** 
 * 
 * @author yaohucaizi 
 */ 
public class Test { 
 
    /** 
     * 讀取網頁全部內容 
     */ 
    public String getHtmlContent(String htmlurl) { 
        URL url; 
        String temp; 
        StringBuffer sb = new StringBuffer(); 
        try { 
            url = new URL(htmlurl); 
            BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream(), "gbk"));// 讀取網頁全部內容 
            while ((temp = in.readLine()) != null) { 
                sb.append(temp); 
            } 
            in.close(); 
        } catch (final MalformedURLException me) { 
            System.out.println("你輸入的URL格式有問題!"); 
            me.getMessage(); 
        } catch (final IOException e) { 
            e.printStackTrace(); 
        } 
        return sb.toString(); 
    } 
 
    /** 
     * 
     * @param s 
     * @return 獲得網頁標題 
     */ 
    public String getTitle(String s) { 
        String regex; 
        String title = ""; 
        List<String> list = new ArrayList<String>(); 
        regex = "<title>.*?</title>"; 
        Pattern pa = Pattern.compile(regex, Pattern.CANON_EQ); 
        Matcher ma = pa.matcher(s); 
        while (ma.find()) { 
            list.add(ma.group()); 
        } 
        for (int i = 0; i < list.size(); i++) { 
            title = title + list.get(i); 
        } 
        return outTag(title); 
    } 
 
    /** 
     * 
     * @param s 
     * @return 獲得鏈接 
     */ 
    public List<String> getLink(String s) { 
        String regex; 
        List<String> list = new ArrayList<String>(); 
        regex = "<a[^>]*href=(\"([^\"]*)\"|\'([^\']*)\'|([^\\s>]*))[^>]*>(.*?)</a>"; 
        Pattern pa = Pattern.compile(regex, Pattern.DOTALL); 
        Matcher ma = pa.matcher(s); 
        while (ma.find()) { 
            list.add(ma.group()); 
        } 
        return list; 
    } 
 
    /** 
     * 
     * @param s 
     * @return 獲得腳本代碼 
     */ 
    public List<String> getScript(String s) { 
        String regex; 
        List<String> list = new ArrayList<String>(); 
        regex = "<SCRIPT.*?</SCRIPT>"; 
        Pattern pa = Pattern.compile(regex, Pattern.DOTALL); 
        Matcher ma = pa.matcher(s); 
        while (ma.find()) { 
            list.add(ma.group()); 
        } 
        return list; 
    } 
     
    public List<String> getNews(String s) { 
        String regex = "<a.*?</a>"; 
        Pattern pa = Pattern.compile(regex, Pattern.DOTALL); 
        Matcher ma = pa.matcher(s); 
        List<String> list = new ArrayList<String>(); 
        while (ma.find()) { 
            list.add(ma.group()); 
        } 
        return list; 
    } 
 
    /** 
     * 
     * @param s 
     * @return 獲得CSS 
     */ 
    public List<String> getCSS(String s) { 
        String regex; 
        List<String> list = new ArrayList<String>(); 
        regex = "<style.*?</style>"; 
        Pattern pa = Pattern.compile(regex, Pattern.DOTALL); 
        Matcher ma = pa.matcher(s); 
        while (ma.find()) { 
            list.add(ma.group()); 
        } 
        return list; 
    } 
 
    /** 
     * 
     * @param s 
     * @return 去掉標記 
     */ 
    public String outTag(final String s) { 
        return s.replaceAll("<.*?>", ""); 
    } 
     
    public static void main(String[] args) { 
        Test t = new Test(); 
        String content = t.getHtmlContent("http://www.taobao.com"); 
        //content = content.replaceAll("(<br>)+?", "\n");// 轉化換行 
        //content = content.replaceAll("<p><em>.*?</em></p>", "");// 去圖片註釋 
        System.out.println(content); 
        System.out.println(t.getTitle(content)); 
        List<String> a = t.getNews(content); 
        List<String> news = new ArrayList<String>(); 
        for (String s : a) { 
            news.add(s.replaceAll("<.*?>", "")); 
        } 
        System.out.println(news); 
        //…… 獲取js、css等操作省略 
    } 
發表評論
所有評論
還沒有人評論，想成為第一個評論的人嗎？請在上方評論欄輸入並點擊發布。
相關文章