/** Matches a single character in the range U+4E00..U+9FA5; compiled once and reused. */
private static final Pattern CHINESE_CHAR_PATTERN = Pattern.compile("[\u4e00-\u9fa5]");

/**
 * Reports whether the given string contains at least one character in the
 * range U+4E00..U+9FA5.
 *
 * @param str the text to inspect; may be {@code null}
 * @return {@code true} if at least one such character is present;
 *         {@code false} otherwise, including for {@code null} or empty input
 */
public static boolean isContainChinese(String str) {
    if (str == null) {
        // Pattern.matcher(null) would throw; a missing string simply has no match.
        return false;
    }
    return CHINESE_CHAR_PATTERN.matcher(str).find();
}
/**
 * Fetches news from Autohome (汽車之家).
 *
 * <p>Downloads the list page at {@code baseUrl}, walks the children of the element
 * whose id is {@code newsListId}, and for each child that has a titled link downloads
 * the linked article and wraps its cleaned-up HTML in a minimal HTML document.
 * Articles whose HTML contains one of the "do not reprint" marker phrases are skipped.
 *
 * <p>Any exception (network failure, parse failure, ...) is printed to standard error
 * and ends the scrape; the items collected up to that point are still returned.
 *
 * @param size             stop once this many items have been collected; a value
 *                         less than 1 never triggers the limit
 * @param baseUrl          URL of the news list page
 * @param domainName       prefix prepended to each link's {@code href}
 * @param newsListId       id of the element whose children are the list entries
 * @param newsContentClass CSS class of the element(s) holding the article body
 * @param titleTag         tag name, looked up inside the entry's first link, whose
 *                         text is used as the title
 * @param dateTag          when non-null, the entry's date is read and stored
 * @param needDeleteAlt    forwarded to {@link #replaceImgSrcFromDataSrc}; may be null
 * @return the collected news items, possibly empty, never {@code null}
 */
public static ArrayList<News> getNewsFromCarHome(int size,String baseUrl,String domainName,String newsListId,
        String newsContentClass,String titleTag,String dateTag,String needDeleteAlt){
    ArrayList<News> newsList = new ArrayList<News>();
    try {
        Document doc = Jsoup.connect(baseUrl).get();
        Element listContainer = doc.getElementById(newsListId);
        if (listContainer == null) {
            // No element with the requested id: nothing to scrape.
            return newsList;
        }
        for (Element ele : listContainer.children()) {
            News news = new News();
            Element title = ele.select("a").first();
            if (title == null) {
                continue;
            }
            news.setTitle(title.getElementsByTag(titleTag).text());
            if (news.getTitle() == null || news.getTitle().equals("")) {
                continue;
            }
            news.setHref(domainName + title.attr("href"));
            if (dateTag != null) {
                // NOTE(review): dateTag only switches the lookup on; the date text is
                // always taken from <i> elements. Confirm whether dateTag was meant
                // to be the selector.
                news.setDate(ele.select("i").text());
            }
            String newsUrl = news.getHref();
            if (isContainChinese(news.getHref())) {
                // Percent-encode the whole URL, then restore ':' and '/' so the
                // scheme and path separators survive.
                newsUrl = URLEncoder.encode(news.getHref(), "utf-8").toLowerCase().replace("%3a", ":").replace("%2f", "/");
            }
            Document newsDoc = Jsoup.connect(newsUrl).get();
            String text = newsDoc.getElementsByClass(newsContentClass).html();
            // Skip articles carrying a reprint-restriction / public-account marker.
            // contains() also catches a marker at position 0.
            if (text.contains("未經許可") || text.contains("禁止轉載") || text.contains("公衆號") || text.contains("公衆賬號")) {
                continue;
            }
            text = replaceImgSrcFromDataSrc(text, true, needDeleteAlt);
            // Drop everything from the last "(" onwards.
            int index = text.lastIndexOf("(");
            if (index > 0) {
                text = text.substring(0, index);
            }
            StringBuilder page = new StringBuilder();
            page.append("<!DOCTYPE html><html><head><meta name=\"content-type\" content=\"text/html; charset=UTF-8\">");
            page.append("</head><body>");
            page.append(deleteSource(text));
            page.append("</body></html>");
            news.setContent(page.toString());
            System.out.println("標題====="+news.getTitle());
            System.out.println("href====="+news.getHref());
            System.out.println("content====="+news.getContent());
            newsList.add(news);
            if (newsList.size() == size) {
                break;
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return newsList;
}
/**
 * Removes every substring of the form {@code <img ...>} from the text: the literal
 * {@code <img} followed by a space, any run of characters other than {@code >},
 * and the closing {@code >}.
 *
 * @param text the HTML text to clean
 * @return the text with all such image tags removed
 */
private static String deleteImg(String text) {
    Pattern imgTag = Pattern.compile("<img [^>]*>");
    return imgTag.matcher(text).replaceAll("");
}
/**
 * Removes every shortest {@code (...)} or {@code [...]} span, delimiters included,
 * from the text in a single left-to-right pass.
 *
 * @param text the text to clean
 * @return the text with all parenthesised and bracketed spans removed
 */
private static String deleteSource(String text) {
    // One combined pattern: the two alternatives must be tried in the same pass.
    Pattern bracketed = Pattern.compile("\\(.*?\\)|\\[.*?]");
    return bracketed.matcher(text).replaceAll("");
}
/**
 * Strips the {@code href} attribute from every {@code <a>} tag that has one.
 *
 * @param content the HTML to process
 * @return the HTML of the parsed document after the attributes were removed
 */
public static String removeHref(String content){
    Document parsed = Jsoup.parse(content);
    // Elements.removeAttr applies the removal to every matched element.
    parsed.select("a[href]").removeAttr("href");
    return parsed.html();
}
/**
 * Replaces the {@code src} of every {@code img} tag in {@code htmlBody} with the value
 * of its {@code data-src} attribute and removes {@code data-src}. <br/>
 * An image without a non-blank {@code data-src} keeps its {@code src}. <br/>
 * If the input contains no {@code img} tag at all, it is returned unchanged.
 *
 * <p>When the input contains the text {@code <html>}, the whole parsed document is
 * returned; otherwise only the direct children of {@code body} are.
 *
 * @param htmlBody the HTML content
 * @param imgUrlNeedAddProtocolPrefix whether every {@code src="//} in the output is
 *        rewritten to {@code src="http://}
 * @param needDeleteAlt when non-null, every occurrence of the literal text
 *        {@code alt=} followed by this value is removed from the output
 * @return the content after replacement
 */
public static String replaceImgSrcFromDataSrc(String htmlBody,boolean imgUrlNeedAddProtocolPrefix,String needDeleteAlt) {
    Document document = Jsoup.parseBodyFragment(htmlBody);
    List<Element> images = document.select("img");
    if (images.isEmpty()) {
        return htmlBody;
    }
    for (Element image : images) {
        String lazySrc = image.attr("data-src");
        if (StringUtils.isNotBlank(lazySrc)) {
            image.attr("src", lazySrc);
            image.removeAttr("data-src");
        }
    }

    // Serialise either the full document or just the body's children.
    String rendered;
    if (htmlBody.contains("<html>")) {
        rendered = document.toString();
    } else {
        rendered = document.select("body>*").toString();
    }

    // Post-processing order matters: protocol prefix first, then alt removal.
    if (imgUrlNeedAddProtocolPrefix) {
        rendered = rendered.replace("src=\"//", "src=\"http://");
    }
    if (needDeleteAlt != null) {
        rendered = rendered.replace("alt="+needDeleteAlt, "");
    }
    return rendered;
}
/**
 * Demo entry point: scrapes a single news item from the Autohome mobile channel page.
 *
 * @param args unused
 * @throws Exception declared for convenience
 */
public static void main(String[] args) throws Exception{
    final int maxItems = 1;
    final String listPageUrl = "http://m.autohome.com.cn/channel";
    final String siteRoot = "http://m.autohome.com.cn";
    final String listElementId = "list";
    final String contentClass = "details";
    final String titleTagName = "h4";
    final String dateTagName = "time";
    final String altToStrip = "汽車之家";
    getNewsFromCarHome(maxItems, listPageUrl, siteRoot, listElementId,
            contentClass, titleTagName, dateTagName, altToStrip);
}
Java 爬蟲入門——用 jsoup 爬取汽車之家的新聞
發表評論
所有評論
還沒有人評論，想成為第一個評論的人嗎？請在上方評論欄輸入並點擊發布。