package com.orange.qqnews;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.sf.json.JSONObject;
public class Test {
public static void main(String[] args) {
//新聞正文正則
String regex1 = "<div id=\"Cnt-Main-Article-QQ\" bossZone=\"content\">([\\d\\D]*?)</div>";
//評論ID正則
String regex2 = "cmt_id = ([\\d]*?);";
//獲取網頁源代碼
String html = openUrl("http://news.qq.com/a/20150825/004734.htm","gb2312");
//獲取新聞正則
String content = getContent(regex1,html);
System.out.println(content);
//獲取評論ID
String cmtId = getContent(regex2,html);
System.out.println(cmtId);
//拼接評論地址
String cmtUrl = "http://coral.qq.com/article/"+cmtId+"/comment?commentid=0&reqnum=20";
String cmt = openUrl(cmtUrl,"gb2312");
JSONObject jsonMap = new JSONObject();
Map map = jsonMap.fromObject(cmt);
Map<String,List> data = (Map)map.get("data");
List<Map<String,String>> comments = data.get("commentid");
for(Map<String,String> m : comments){
String cmtContent = m.get("content"); //評論
//其他信息略過(回覆人,回覆時間,贊等)
System.out.println(cmtContent);
}
}
/**
* 訪問url返回url的html代碼
*/
public static String openUrl(String currentUrl,String charset) {
InputStream is = null;
BufferedReader br = null;
URL url;
StringBuffer html = new StringBuffer();
try {
url = new URL(currentUrl);
URLConnection conn = url.openConnection();
conn.setReadTimeout(5000);
conn.connect();
is = conn.getInputStream();
br = new BufferedReader(new InputStreamReader(is,charset));
String str;
while (null != (str = br.readLine())) {
html.append(str).append("\n");
}
} catch (Exception e) {
e.printStackTrace();
} finally {
if (br != null) {
try {
br.close();
} catch (IOException e) {
e.printStackTrace();
}
}
if (is != null) {
try {
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
return html.toString();
}
private static String getContent(String regex,String text) {
String content = "";
Pattern pattern = Pattern.compile(regex);
Matcher matcher = pattern.matcher(text);
while(matcher.find()) {
content = matcher.group(1).toString();
}
return content;
}
}
抓取騰訊新聞評論
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.