抓取騰訊新聞評論


package com.orange.qqnews;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import net.sf.json.JSONObject;

public class Test {
	public static void main(String[] args) {
		//新聞正文正則
		String regex1 = "<div id=\"Cnt-Main-Article-QQ\" bossZone=\"content\">([\\d\\D]*?)</div>";
		//評論ID正則
		String regex2 = "cmt_id = ([\\d]*?);";
		//獲取網頁源代碼
		String html = openUrl("http://news.qq.com/a/20150825/004734.htm","gb2312");
		//獲取新聞正則
		String content = getContent(regex1,html);
		System.out.println(content);
		//獲取評論ID
		String cmtId = getContent(regex2,html);
		System.out.println(cmtId);
		
		//拼接評論地址
		String cmtUrl = "http://coral.qq.com/article/"+cmtId+"/comment?commentid=0&reqnum=20";
		String cmt = openUrl(cmtUrl,"gb2312");

		JSONObject jsonMap = new JSONObject();
		Map map = jsonMap.fromObject(cmt);
		Map<String,List> data = (Map)map.get("data");
		List<Map<String,String>> comments = data.get("commentid");
		
		for(Map<String,String> m : comments){
			String cmtContent = m.get("content"); //評論
			
			//其他信息略過(回覆人,回覆時間,贊等)
			
			System.out.println(cmtContent);
		}
		

	}

	/**
	 * 訪問url返回url的html代碼
	 */
	public static String openUrl(String currentUrl,String charset) {
		InputStream is = null;
		BufferedReader br = null;
		URL url;
		StringBuffer html = new StringBuffer();
		try {
			url = new URL(currentUrl);
			URLConnection conn = url.openConnection();
			conn.setReadTimeout(5000);
			conn.connect();
			is = conn.getInputStream();
			br = new BufferedReader(new InputStreamReader(is,charset));
			String str;
			while (null != (str = br.readLine())) {
				html.append(str).append("\n");
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			if (br != null) {
				try {
					br.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			if (is != null) {
				try {
					is.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}

		}
		return html.toString();
	}
	
	private static String getContent(String regex,String text) {
		String content = "";
		Pattern pattern = Pattern.compile(regex);
		Matcher matcher = pattern.matcher(text);
		while(matcher.find()) {
			content = matcher.group(1).toString();
		}
		return content;
	}
}




發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章