本來想爬今日頭條,但本人是爬蟲小白,在網上找了很多方法、走了很多彎路,仍然沒能解決異步刷新的問題。後來發現返回的 JSON 數據還和本地 cookie 有關,感覺前路艱難,於是果斷換到網易新聞。相對來說,網易新聞的數據比較容易獲取:通過谷歌瀏覽器的開發者工具(F12)分析請求包,發現網易異步刷新的包和訪問路徑有關;再把返回的數據放到在線 JSON 解析工具裏,發現可以正常解析,這讓我欣喜不已。
//網易新聞類型 String[] typeArray={"BBM54PGAwangning","BCR1UC1Qwangning","BD29LPUBwangning","BD29MJTVwangning","C275ML7Gwangning"}; String type = typeArray[width]; //網易新聞列表url String url1 = "http://3g.163.com/touch/reconstruct/article/list/"; //網易新聞內容url String url2 = "http://3g.163.com/news/article/";
//根據新聞列表url,獲取新聞docid,並把docid存儲到list中 private static List<String> getDocid(String url,int num,String type) { String json = null; List<String> id=new ArrayList<>(); Map map=null; JSONArray parseArray=null; String jsonStrM=""; json = JSONUtils.loadJson(url+type+"/"+num+"-10.html"); String jsonStr = StringUtils.substringBeforeLast(json, ")"); String jsonStrO = StringUtils.substringAfter(jsonStr,"artiList("); Map parse = (Map) JSONObject.parse(jsonStrO); parseArray = (JSONArray) parse.get(type); for(int j=0;j<parseArray.size();j++){ map = (Map)parseArray.get(j); id.add((String) map.get("docid")); } return id; }
//根據內容url2獲取新聞信息並進行存儲 private static void getContent(String url2, List<String> ids) { System.out.println("存儲開始!!"); String url = null; Connection connection = Jsoup.connect(url2); int i = 1; for (;i<ids.size();i++){ url = url2+ids.get(i)+".html"; connection = Jsoup.connect(url); try { Document document = connection.get(); //獲取新聞標題 Elements title = document.select("[class=title]"); //獲取新聞來源和文章發佈時間 Elements articleInfo = document.select("[class=info]"); Elements src = articleInfo.select("[class=source js-source]"); Elements time = articleInfo.select("[class=time js-time]"); //獲取新聞內容 Elements contentEle = document.select("[class=page js-page on]"); DBCollection dbCollection= null; try { dbCollection = MongoDBUtils.connMongoDB(); } catch (Exception e) { e.printStackTrace(); } BasicDBObject obj = new BasicDBObject(); obj.put("title", src.html()); obj.put("srcFrom", src.html()); obj.put("time", time.html()); obj.put("content", contentEle.html()); dbCollection.insert(obj); DBCursor dbCursor = dbCollection.find(); while(dbCursor.hasNext()){ Map map = (Map)dbCursor.next(); } } catch (IOException e) { e.printStackTrace(); } } System.out.println("本次共計存儲"+i*0.8+"條數據"); }
//設置爬取深度,循環多次獲取docid private static List<String> getIds(String url1,int num,String type) { List<String> id = new ArrayList<>(); List<String> ids = new ArrayList<>(); for (int i=0;i<=num;i+=10){ id = getDocid(url1,i,type); ids.addAll(id); } return ids; }
public static void main(String[] args) throws Exception { //爬取條數,10的倍數,網易新聞每10條預留大約2個廣告位,所以爬取新聞的真實條數大約爲80% int deep = 30; //爬取寬度,0:首頁,1:社會,2:國內,3:國際,4:歷史 int width = 1; //網易新聞類型 String[] typeArray={"BBM54PGAwangning","BCR1UC1Qwangning","BD29LPUBwangning","BD29MJTVwangning","C275ML7Gwangning"}; String type = typeArray[width]; //網易新聞列表url String url1 = "http://3g.163.com/touch/reconstruct/article/list/"; //網易新聞內容url String url2 = "http://3g.163.com/news/article/"; List<String> ids = new ArrayList<>(); //根據url1,爬取條數,新聞類型獲取新聞docid ids = getIds(url1,deep,type); //根據url2,新聞docid獲取內容並存儲到MongoDB getContent(url2,ids); }