Java 爬蟲入門——用 jsoup 爬取汽車之家的新聞

 /** Matches one character in the range U+4E00..U+9FA5; compiled once and reused by every call. */
    private static final Pattern CHINESE_CHAR_PATTERN = Pattern.compile("[\u4e00-\u9fa5]");

    /**
     * Reports whether the given string contains at least one Chinese character,
     * i.e. a character in the range U+4E00..U+9FA5.
     *
     * @param str the text to inspect; may be {@code null}
     * @return {@code true} if at least one such character is present; {@code false}
     *         otherwise, including for {@code null} or empty input
     */
    public static boolean isContainChinese(String str) {
        if (str == null) {
            // Nothing to scan; treat as "contains no Chinese" rather than throwing.
            return false;
        }
        return CHINESE_CHAR_PATTERN.matcher(str).find();
    }
      
 
    /**
     * Fetches news articles from an Autohome list page.
     *
     * <p>Loads {@code baseUrl}, walks the children of the element whose id is
     * {@code newsListId}, and for every child that has a link with a non-empty title
     * downloads the linked article and wraps its cleaned-up content in a minimal HTML
     * document. Articles whose content contains one of the "no reprint" / public-account
     * markers are skipped.
     *
     * @param size             stop as soon as this many articles have been collected; the check
     *                         is an equality test, so a value below 1 never stops the loop early
     * @param baseUrl          URL of the list page
     * @param domainName       prefix prepended to each link's {@code href} to build the article URL
     * @param newsListId       id of the element whose children are the list entries
     * @param newsContentClass CSS class of the element(s) holding the article body
     * @param titleTag         tag name, looked up inside the entry's first link, holding the title
     * @param dateTag          when non-null the entry's date is recorded.
     *                         NOTE(review): the date is always read from {@code <i>} elements and
     *                         this value is not used as a selector -- confirm that is intended
     * @param needDeleteAlt    forwarded to {@code replaceImgSrcFromDataSrc}
     * @return the collected articles; empty when the list element is missing, and whatever was
     *         collected so far when fetching or parsing fails part-way through
     */
    public static ArrayList<News> getNewsFromCarHome(int size,String baseUrl,String domainName,String newsListId,
            String newsContentClass,String titleTag,String dateTag,String needDeleteAlt){
        ArrayList<News> newsList = new ArrayList<News>();
        try {
            Document doc = Jsoup.connect(baseUrl).get();
            Element listRoot = doc.getElementById(newsListId);
            if (listRoot == null) {
                // Report the missing container explicitly instead of relying on a caught NPE.
                System.err.println("No element with id '" + newsListId + "' found at " + baseUrl);
                return newsList;
            }
            for (Element ele : listRoot.children()) {
                Element title = ele.select("a").first();
                if (title == null) {
                    continue;
                }
                News news = new News();
                news.setTitle(title.getElementsByTag(titleTag).text());
                if (news.getTitle() == null || news.getTitle().equals("")) {
                    continue;
                }
                news.setHref(domainName + title.attr("href"));
                if (dateTag != null) {
                    news.setDate(ele.select("i").text());
                }

                // URLs containing Chinese characters are percent-encoded; the scheme separator
                // and path slashes are restored afterwards so the result is still a usable URL.
                String newsUrl = news.getHref();
                if (isContainChinese(news.getHref())) {
                    newsUrl = URLEncoder.encode(news.getHref(), "utf-8").toLowerCase().replace("%3a", ":").replace("%2f", "/");
                }

                Document newsDoc = Jsoup.connect(newsUrl).get();
                String text = newsDoc.getElementsByClass(newsContentClass).html();
                // contains() also catches a marker at index 0, which "indexOf(...) > 0" missed.
                if (text.contains("未經許可") || text.contains("禁止轉載")
                        || text.contains("公衆號") || text.contains("公衆賬號")) {
                    continue;
                }

                text = replaceImgSrcFromDataSrc(text, true, needDeleteAlt);
                // Drop everything from the last "(" onwards (a trailing parenthesised note).
                int index = text.lastIndexOf("(");
                if (index > 0) {
                    text = text.substring(0, index);
                }

                StringBuilder page = new StringBuilder();
                page.append("<!DOCTYPE html><html><head><meta name=\"content-type\" content=\"text/html; charset=UTF-8\">");
                page.append("</head><body>");
                page.append(deleteSource(text));
                page.append("</body></html>");
                news.setContent(page.toString());

                System.out.println("標題====="+news.getTitle());
                System.out.println("href====="+news.getHref());
                System.out.println("content====="+news.getContent());
                newsList.add(news);
                if (newsList.size() == size) {
                    break;
                }
            }
        } catch (Exception e) {
            // Best effort: log the failure and return what has been collected so far.
            e.printStackTrace();
        }
        return newsList;
    }
      
    /**
     * Strips image tags from the given HTML text.
     *
     * <p>A tag is matched when it starts with {@code <img} followed by a space and runs up to
     * the next {@code >}; each match is replaced by the empty string.
     *
     * @param text HTML text to clean
     * @return the text without the matched image tags
     */
    private static String deleteImg(String text) {
        Matcher imgTags = Pattern.compile("<img [^>]*>").matcher(text);
        return imgTags.replaceAll("");
    }
    /**
     * Removes bracketed segments from the text.
     *
     * <p>Every shortest run enclosed in {@code (...)} or {@code [...]} is deleted together
     * with its brackets. The pattern's {@code .} does not match line terminators, so a
     * segment is only removed when both brackets are on the same line.
     *
     * @param text text to clean
     * @return the text with those segments removed
     */
    private static String deleteSource(String text) {
        Pattern bracketed = Pattern.compile("\\(.*?\\)|\\[.*?]");
        return bracketed.matcher(text).replaceAll("");
    }
    /**
     * Removes the {@code href} attribute from every anchor in the given HTML.
     *
     * @param content HTML text to process
     * @return the HTML of the parsed document, with no {@code href} left on any
     *         {@code <a>} element
     */
    public static String removeHref(String content){
        Document parsed = Jsoup.parse(content);
        // Bulk operation on the selection: strips the attribute from each matched anchor.
        parsed.select("a[href]").removeAttr("href");
        return parsed.html();
    }
      
      
    /**
     * Rewrites every {@code <img>} in {@code htmlBody} so that its {@code src} takes the value
     * of its {@code data-src} attribute, which is then removed. Images without a non-blank
     * {@code data-src} keep their {@code src} untouched.
     *
     * <p>If the input contains no {@code <img>} element it is returned unchanged. Otherwise the
     * result is the re-serialised document when the input contains the literal text
     * {@code <html>}, or only the serialised children of {@code <body>} when it does not.
     *
     * @param htmlBody                    HTML content to process
     * @param imgUrlNeedAddProtocolPrefix when {@code true}, every {@code src="//...} in the output
     *                                    is rewritten to {@code src="http://...}
     * @param needDeleteAlt               alt text to strip: an image whose {@code alt} equals this
     *                                    value loses the attribute; {@code null} leaves alt
     *                                    attributes alone
     * @return the rewritten HTML
     */
    public static String replaceImgSrcFromDataSrc(String htmlBody,boolean imgUrlNeedAddProtocolPrefix,String needDeleteAlt) {
        Document document = Jsoup.parseBodyFragment(htmlBody);
        List<Element> images = document.select("img");
        if (images.isEmpty()) {
            return htmlBody;
        }
        for (Element img : images) {
            String dataSrc = img.attr("data-src");
            if (StringUtils.isNotBlank(dataSrc)) {
                img.attr("src", dataSrc);
                img.removeAttr("data-src");
            }
            // Remove the alt on the DOM: the serialised form is alt="value" (quoted), so the
            // textual replace of alt=value further down cannot match it.
            if (needDeleteAlt != null && needDeleteAlt.equals(img.attr("alt"))) {
                img.removeAttr("alt");
            }
        }

        String html = htmlBody.contains("<html>")
                ? document.toString()
                : document.select("body>*").toString();
        if (imgUrlNeedAddProtocolPrefix) {
            html = html.replace("src=\"//", "src=\"http://");
        }
        if (needDeleteAlt != null) {
            // Legacy textual removal, kept for callers that pass the value with its quotes.
            html = html.replace("alt="+needDeleteAlt, "");
        }
        return html;
    }
    /**
     * Demo entry point: requests a single article from the Autohome mobile channel page.
     * The scraper prints the title, link and content of each collected article.
     *
     * @param args unused
     * @throws Exception declared by the signature; not thrown explicitly here
     */
    public static void main(String[] args) throws Exception{
        String listPageUrl = "http://m.autohome.com.cn/channel";
        String siteRoot = "http://m.autohome.com.cn";
        int wantedArticles = 1;
        getNewsFromCarHome(wantedArticles, listPageUrl, siteRoot, "list", "details", "h4", "time", "汽車之家");
    }

發表評論
所有評論
還沒有人評論，想成為第一個評論的人嗎？請在上方評論欄輸入並點擊發布。
相關文章