java新聞爬取

原創

2018-08-31 23:37

本來想爬取今日頭條，在網上找了很多方法，也走了很多彎路，但身為爬蟲小白，始終沒能解決異步刷新的問題。後來發現它的 JSON 數據還和本地 cookie 有關，感覺前路艱難，於是果斷換成網易新聞。網易新聞的數據相對容易獲取：通過谷歌瀏覽器 F12 分析請求包，發現網易異步刷新的請求和訪問路徑有關，再用在線 JSON 解析工具驗證，返回的數據可以正常解析，這讓我欣喜不已。

json數據：

廢話不多說，直接上代碼

// NetEase news category codes; one entry is picked by index below
String[] typeArray={"BBM54PGAwangning","BCR1UC1Qwangning","BD29LPUBwangning","BD29MJTVwangning","C275ML7Gwangning"};
// NOTE(review): `width` is not declared in this excerpt; the same statements appear inside main(), where it is a local int
String type = typeArray[width];

// Base URL of the NetEase news list endpoint
String url1 = "http://3g.163.com/touch/reconstruct/article/list/";
// Base URL of the NetEase news article (content) pages
String url2 = "http://3g.163.com/news/article/";

/**
 * Fetches one page of the news list and collects the {@code docid} of every entry on it.
 *
 * <p>The page is loaded from {@code url + type + "/" + num + "-10.html"}. The response is
 * expected to be wrapped as {@code artiList( ... )}; the wrapper is stripped and the remainder
 * is parsed as a JSON object in which the key {@code type} holds an array of entries.
 *
 * <p>Any response that does not have this shape (nothing loaded, wrapper missing, key missing,
 * entry without a string {@code docid}) is skipped instead of raising a
 * {@code NullPointerException}, so one bad page does not abort the whole crawl.
 *
 * @param url  base URL of the list endpoint
 * @param num  number placed before {@code "-10.html"} in the page name (presumably the offset
 *             of the first entry; confirm against the endpoint)
 * @param type category code; also the key of the entry array inside the response
 * @return the docids found on the page, in response order; never {@code null}, possibly empty
 */
private static List<String> getDocid(String url,int num,String type) {
    List<String> docIds = new ArrayList<>();

    String response = JSONUtils.loadJson(url + type + "/" + num + "-10.html");

    // Strip the "artiList(" prefix and the trailing ")" to get the bare JSON payload.
    String withoutTail = StringUtils.substringBeforeLast(response, ")");
    String payload = StringUtils.substringAfter(withoutTail, "artiList(");
    if (payload == null || payload.isEmpty()) {
        return docIds;
    }

    Object parsed = JSONObject.parse(payload);
    if (!(parsed instanceof Map)) {
        return docIds;
    }

    Object entries = ((Map<?, ?>) parsed).get(type);
    if (!(entries instanceof JSONArray)) {
        return docIds;
    }

    JSONArray entryArray = (JSONArray) entries;
    for (int j = 0; j < entryArray.size(); j++) {
        Object entry = entryArray.get(j);
        if (!(entry instanceof Map)) {
            continue;
        }
        Object docId = ((Map<?, ?>) entry).get("docid");
        // Entries without a usable docid cannot be turned into an article URL later on.
        if (docId instanceof String) {
            docIds.add((String) docId);
        }
    }
    return docIds;
}

/**
 * Downloads every article page and stores its title, source, publication time and content
 * in the MongoDB collection returned by {@code MongoDBUtils.connMongoDB()}.
 *
 * <p>The page URL for each article is {@code url2 + docid + ".html"}. A page that cannot be
 * downloaded is reported on stderr and skipped; the remaining articles are still processed.
 *
 * <p>Fixes relative to the previous version: the {@code title} field now holds the title
 * (it used to hold the source), the first docid is no longer skipped, the collection is
 * obtained once instead of once per article, a failed connection is reported as such rather
 * than surfacing as a {@code NullPointerException}, and the reported total is the number of
 * documents actually inserted.
 *
 * @param url2 base URL of the article pages
 * @param ids  docids of the articles to fetch; {@code null} or empty means nothing to do
 * @throws IllegalStateException if the MongoDB collection cannot be obtained
 */
private static void getContent(String url2, List<String> ids) {
    System.out.println("存儲開始！！");

    int stored = 0;
    if (ids != null && !ids.isEmpty()) {
        // Obtain the collection once up front; without it nothing can be stored.
        DBCollection dbCollection;
        try {
            dbCollection = MongoDBUtils.connMongoDB();
        } catch (Exception e) {
            throw new IllegalStateException("Unable to obtain the MongoDB collection", e);
        }
        if (dbCollection == null) {
            throw new IllegalStateException("MongoDBUtils.connMongoDB() returned null");
        }

        for (String id : ids) {
            String url = url2 + id + ".html";
            try {
                Document document = Jsoup.connect(url).get();
                // Article title
                Elements title = document.select("[class=title]");
                // Article source and publication time live inside the info element
                Elements articleInfo = document.select("[class=info]");
                Elements src = articleInfo.select("[class=source js-source]");
                Elements time = articleInfo.select("[class=time js-time]");
                // Article body
                Elements contentEle = document.select("[class=page js-page on]");

                BasicDBObject obj = new BasicDBObject();
                obj.put("title", title.html());
                obj.put("srcFrom", src.html());
                obj.put("time", time.html());
                obj.put("content", contentEle.html());
                dbCollection.insert(obj);
                stored++;
            } catch (IOException e) {
                // Best effort: a single unreachable page must not stop the crawl.
                System.err.println("Failed to fetch " + url);
                e.printStackTrace();
            }
        }
    }

    System.out.println("本次共計存儲"+stored+"條數據");
}

/**
 * Collects docids from consecutive list pages until {@code num} entries have been requested.
 *
 * <p>Pages are requested through {@code getDocid} with the values 0, 10, 20, ... strictly
 * below {@code num}. The previous inclusive bound ({@code <= num}) requested one page more
 * than asked for, e.g. four pages for {@code num = 30}.
 *
 * @param url1 base URL of the list endpoint, passed through to {@code getDocid}
 * @param num  number of list entries to request; intended to be a multiple of 10.
 *             Values of 0 or less request nothing
 * @param type category code, passed through to {@code getDocid}
 * @return all docids returned by the requested pages, in request order; never {@code null}
 */
private static List<String> getIds(String url1,int num,String type) {
    List<String> ids = new ArrayList<>();
    // Step by 10 because each request names its page as "<value>-10.html" (see getDocid).
    for (int offset = 0; offset < num; offset += 10) {
        ids.addAll(getDocid(url1, offset, type));
    }
    return ids;
}

/**
 * Entry point: gathers docids for one news category and stores the matching articles.
 *
 * @param args unused
 * @throws Exception declared for callers; nothing in this method throws a checked exception itself
 */
public static void main(String[] args) throws Exception {
    // Category codes, indexed as 0: home, 1: society, 2: domestic, 3: international, 4: history
    final String[] categoryCodes = {
        "BBM54PGAwangning",
        "BCR1UC1Qwangning",
        "BD29LPUBwangning",
        "BD29MJTVwangning",
        "C275ML7Gwangning"
    };

    // Endpoints for the article list and for the individual article pages
    final String listEndpoint = "http://3g.163.com/touch/reconstruct/article/list/";
    final String articleEndpoint = "http://3g.163.com/news/article/";

    // Number of entries to crawl, a multiple of 10. Per the author's note, roughly 2 of
    // every 10 list slots are ads, so about 80% of the entries are real news.
    final int requestedEntries = 30;
    // Which category to crawl (index into categoryCodes)
    final int categoryIndex = 1;
    final String category = categoryCodes[categoryIndex];

    // Step 1: collect the docids for the chosen category
    final List<String> docIds = getIds(listEndpoint, requestedEntries, category);
    // Step 2: fetch each article by docid and store it in MongoDB
    getContent(articleEndpoint, docIds);
}

爲了方便存取比較大的數據量，使用了mongodb數據庫進行存儲

列表

內容

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

java新聞爬取

C語言--右移左移

12款高效開源Wiki系統推薦，打造團隊知識管理利器

一個開源且全面的C#算法實戰教程

dotnet 基於 DirectML 控制檯運行 Phi-3 模型

自定義MyBatis插件

一款.NET開源、功能強大、跨平臺的繪圖庫 - OxyPlot

常用的 Git 指令

鼠標控制軟件有可能和虛擬機軟件產生衝突

sm4加密工具類

javaweb過濾器配置

80端口被佔用問題

eclipse環境錯誤問題

javaweb攔截器

AJAX 跨域訪問問題

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結