廢話板塊
做大數據怎麼能沒有數據呢?對於我們,數據的來源便是爬蟲。其實博主之前自己基於HTTP協議寫過一個小的爬蟲,所以更加明白要處理去重、解析頁面、解決各種各樣的小麻煩、觸發js、跳過防爬蟲機制是有多麼的令人鬧心。所幸,有一羣無私的人創造了WebCollector Java爬蟲,將這些問題的大部分都解決了,並且十分利於二次開發。十分感謝他們的付出。這是他們的網站:WebCollector教程。
廢話板塊二
爬蟲簡介:
WebCollector是一個無須配置、便於二次開發的JAVA爬蟲框架(內核),它提供精簡的API,只需少量代碼即可實現一個功能強大的爬蟲。WebCollector-Hadoop是WebCollector的Hadoop版本,支持分佈式爬取。爬蟲內核:
WebCollector致力於維護一個穩定、可擴展的爬蟲內核,便於開發者進行靈活的二次開發。內核具有很強的擴展性,用戶可以在內核基礎上開發自己想要的爬蟲。源碼中集成了Jsoup,可進行精準的網頁解析。2.x版本中集成了selenium,可以處理javascript生成的數據。
如何搭建WebCollector,在他們的網站上講得很詳細,並且有例子可循,在此便不贅述。
這裏我直接貼上我的微博爬蟲。註釋寫得很清楚,也就不解釋太多(的確太累了!!T.T)
直接看到這裏複製過去吧
使用selenium登陸微博獲取cookie
package com.codsway.crawler;
import cn.edu.hfut.dmic.webcollector.net.HttpRequest;
import cn.edu.hfut.dmic.webcollector.net.HttpResponse;
import java.awt.BorderLayout;
import java.awt.Container;
import java.awt.Dimension;
import java.awt.Graphics;
import java.awt.Toolkit;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.awt.event.WindowAdapter;
import java.awt.event.WindowEvent;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.util.Set;
import javax.imageio.ImageIO;
import javax.swing.JButton;
import javax.swing.JFrame;
import javax.swing.JPanel;
import javax.swing.JTextField;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.htmlunit.HtmlUnitDriver;
/**
 * Logs in to Sina Weibo's mobile site (weibo.cn) through Selenium and returns the
 * resulting session cookies.
 *
 * <p>Per the original author's note, weibo.com sends its login data encrypted, so only
 * weibo.cn can be handled this way.
 *
 * @author wrm
 */
public class WeiboCN {

    /**
     * Obtains the Sina Weibo login cookies. Works for weibo.cn only, not for weibo.com.
     *
     * <p>weibo.cn transmits the credentials in plain text, so use a throwaway account.
     * The captcha image is downloaded, saved to {@code result.png} in the working
     * directory and shown in a small Swing window where the user types the code.
     *
     * @param username account user name (typed into the {@code mobile} form field)
     * @param password account password
     * @return all cookies of the logged-in session as {@code name=value;} pairs
     * @throws Exception if the login did not produce the {@code gsid_CTandWM} cookie,
     *         if the captcha could not be decoded or was not entered, or if any
     *         network/driver call fails
     */
    public static String getSinaCookie(String username, String password) throws Exception {
        HtmlUnitDriver driver = new HtmlUnitDriver();
        try {
            driver.setJavascriptEnabled(true);
            driver.get("http://login.weibo.cn/login/");

            // The first <img> found on the login page is treated as the captcha.
            WebElement ele = driver.findElementByCssSelector("img");
            String src = ele.getAttribute("src");

            // Fetch the captcha with the driver's cookies attached to the request.
            String cookie = concatCookie(driver);
            HttpRequest request = new HttpRequest(src);
            request.setCookie(cookie);
            HttpResponse response = request.getResponse();
            ByteArrayInputStream is = new ByteArrayInputStream(response.getContent());
            BufferedImage img = ImageIO.read(is);
            is.close();
            if (img == null) {
                // ImageIO.read returns null when no registered reader understands the bytes.
                throw new IllegalStateException("captcha image could not be decoded: " + src);
            }
            ImageIO.write(img, "png", new File("result.png"));

            String userInput = new CaptchaFrame(img).getUserInput();
            if (userInput == null) {
                // Window was closed or the waiting thread was interrupted.
                throw new IllegalStateException("captcha input was cancelled");
            }

            WebElement mobile = driver.findElementByCssSelector("input[name=mobile]");
            mobile.sendKeys(username);
            WebElement pass = driver.findElementByCssSelector("input[type=password]");
            pass.sendKeys(password);
            WebElement code = driver.findElementByCssSelector("input[name=code]");
            code.sendKeys(userInput);
            WebElement rem = driver.findElementByCssSelector("input[name=remember]");
            rem.click();
            WebElement submit = driver.findElementByCssSelector("input[name=submit]");
            submit.click();

            String result = concatCookie(driver);
            // The presence of this cookie is used as the success indicator.
            if (result.contains("gsid_CTandWM")) {
                return result;
            }
            throw new Exception("weibo login failed");
        } finally {
            // Release the driver on every path, not only after a successful submit.
            driver.close();
        }
    }

    /**
     * Joins every cookie currently held by the driver into one string.
     *
     * @param driver driver whose cookies are read
     * @return the cookies as {@code name=value;} pairs, each followed by a semicolon
     */
    public static String concatCookie(HtmlUnitDriver driver) {
        Set<Cookie> cookieSet = driver.manage().getCookies();
        StringBuilder sb = new StringBuilder();
        for (Cookie cookie : cookieSet) {
            sb.append(cookie.getName()).append('=').append(cookie.getValue()).append(';');
        }
        return sb.toString();
    }

    /**
     * Small Swing window that displays a captcha image at double size next to a text
     * field and a button, and blocks the caller until the user responds.
     */
    public static class CaptchaFrame {
        JFrame frame;
        JPanel panel;
        JTextField input;
        int inputWidth = 100;
        BufferedImage img;
        String userInput = null;
        /** True once the user has answered or dismissed the window; guarded by {@code this}. */
        private boolean done = false;

        /**
         * @param img captcha image to display
         */
        public CaptchaFrame(BufferedImage img) {
            this.img = img;
        }

        /**
         * Shows the window and blocks the calling thread until the button is pressed or
         * the window is closed.
         *
         * @return the trimmed text typed by the user, or {@code null} if the window was
         *         closed without pressing the button or the waiting thread was
         *         interrupted (the interrupt flag is restored in that case)
         */
        public String getUserInput() {
            synchronized (this) {
                done = false;
                userInput = null;
            }

            frame = new JFrame("輸入驗證碼");
            final int imgWidth = img.getWidth();
            final int imgHeight = img.getHeight();
            int width = imgWidth * 2 + inputWidth * 2;
            int height = imgHeight * 2 + 50;
            // Centre the window on the screen.
            Dimension dim = Toolkit.getDefaultToolkit().getScreenSize();
            int startx = (dim.width - width) / 2;
            int starty = (dim.height - height) / 2;
            frame.setBounds(startx, starty, width, height);
            Container container = frame.getContentPane();
            container.setLayout(new BorderLayout());

            panel = new JPanel() {
                @Override
                public void paintComponent(Graphics g) {
                    super.paintComponent(g);
                    // Draw the captcha scaled to twice its size for readability.
                    g.drawImage(img, 0, 0, imgWidth * 2, imgHeight * 2, null);
                }
            };
            panel.setLayout(null);
            container.add(panel);

            input = new JTextField(6);
            input.setBounds(imgWidth * 2, 0, inputWidth, imgHeight * 2);
            panel.add(input);

            JButton btn = new JButton("登錄");
            btn.addActionListener(new ActionListener() {
                @Override
                public void actionPerformed(ActionEvent e) {
                    finish(input.getText().trim());
                }
            });
            btn.setBounds(imgWidth * 2 + inputWidth, 0, inputWidth, imgHeight * 2);
            panel.add(btn);

            // Without this, closing the window would leave the caller waiting forever.
            frame.addWindowListener(new WindowAdapter() {
                @Override
                public void windowClosing(WindowEvent e) {
                    finish(null);
                }
            });

            frame.setVisible(true);

            String answer;
            synchronized (this) {
                // Guarded wait: tolerates spurious wakeups and a notify that arrives
                // before this thread starts waiting.
                while (!done) {
                    try {
                        this.wait();
                    } catch (InterruptedException ex) {
                        Thread.currentThread().interrupt();
                        break;
                    }
                }
                answer = userInput;
            }
            frame.dispose();
            return answer;
        }

        /** Records the user's answer (first one wins) and wakes the waiting caller. */
        private void finish(String value) {
            synchronized (this) {
                if (!done) {
                    userInput = value;
                    done = true;
                    this.notifyAll();
                }
            }
        }
    }
}
微博內容爬取
/** Highest {@code page=} number whose posts are still crawled; later pages are skipped. */
private static final int MAX_WEIBO_PAGE = 10;

/**
 * Handles one fetched page: extracts posts and queues further pages, selected by
 * regular expressions on the URL.
 *
 * <ul>
 *   <li>User page ({@code http://weibo.cn/u/XXXXX} style): only follows links.</li>
 *   <li>Paged listing ({@code ...page=N}): for pages up to {@link #MAX_WEIBO_PAGE},
 *       extracts the posts via {@code weiBoCrawler}, appends them to {@code sdi} and
 *       follows links. User info is not extracted here.</li>
 * </ul>
 *
 * @param page the fetched page
 * @param next collector for the URLs to crawl next
 */
@Override
public void visit(Page page, CrawlDatums next) {
    if (page.matchUrl("http://weibo.cn/u?/?[^/,\\?]*")) {
        // User page: 1. look for matching user links to crawl deeper.
        enqueueUserLinks(page, next, true);
    } else if (page.matchUrl("http://weibo.cn/u?/?[^?]*\\??page=[0-9]*$")) {
        int pageNo = parsePageNumber(page.getUrl());
        if (pageNo < 0 || pageNo > MAX_WEIBO_PAGE) {
            // Missing/unparseable page number, or beyond the configured depth.
            return;
        }
        List<SinaDataInfo> sd = weiBoCrawler(page);
        sdi.addAll(sd);
        // 2. look for matching user links to crawl deeper.
        enqueueUserLinks(page, next, false);
    }
}

/**
 * Reads the number following the last {@code page=} in the URL.
 *
 * @return the page number, or -1 when it is absent, empty or not a valid int
 */
private int parsePageNumber(String url) {
    int marker = url.lastIndexOf("page=");
    if (marker < 0) {
        return -1;
    }
    try {
        return Integer.parseInt(url.substring(marker + 5));
    } catch (NumberFormatException ex) {
        // The URL pattern allows "page=" with no digits, and very long digit runs.
        return -1;
    }
}

/**
 * Queues every anchor on the page whose href matches the weibo.cn user-link pattern and
 * does not contain "sinaurl". Relative hrefs are prefixed with the site root.
 *
 * @param traceAllLinks when true, prints every href before filtering
 */
private void enqueueUserLinks(Page page, CrawlDatums next, boolean traceAllLinks) {
    Elements aLink = page.select("a");
    for (Element e : aLink) {
        String href = e.attr("href");
        if (traceAllLinks) {
            System.out.println("全部href:" + href);
        }
        if (!href.matches("(http://weibo.cn)?(/u?/?[^/]*)")) {
            continue;
        }
        if (href.indexOf("http://weibo.cn") < 0) {
            href = "http://weibo.cn" + href;
        }
        if (href.indexOf("sinaurl") < 0) {
            System.out.println("過了的:" + href);
            next.add(new CrawlDatum(href));
        }
    }
}
爬取微博方法:
/**
 * Extracts every post found on a weibo.cn listing page and, when the
 * {@code jdbcTemplate} field is non-null, inserts each one into the {@code weiboinfo}
 * table.
 *
 * @param page the fetched listing page
 * @return all posts extracted from the page
 * @throws RuntimeException (index / number-format) if the page does not have the
 *         expected structure, e.g. a counter link without a {@code [n]} part
 */
private List<SinaDataInfo> weiBoCrawler(Page page) {
    List<SinaDataInfo> sdv = new Vector<SinaDataInfo>();
    String url = page.getUrl();

    // User id: the last path segment, up to the query string if there is one.
    // NOTE(review): the +2 skips one character after the last '/', which fits ".../u12345"
    // but drops the first digit for ".../u/12345" - confirm against real URLs.
    int idStart = url.lastIndexOf("/") + 2;
    String wbuid;
    if (url.indexOf("?") > 0) {
        wbuid = url.substring(idStart, url.indexOf("?"));
    } else {
        wbuid = url.substring(idStart);
    }

    // Nickname: text of the first span.ctt in the user header, up to the first space.
    String userSimpla = page.select("div.u").select("span.ctt").get(0).text();
    String wbuser = userSimpla.substring(0, userSimpla.indexOf(" "));

    Elements wbBox = page.select("div.c");
    // The last two div.c blocks are dropped (presumably non-post blocks such as the
    // pager/footer - verify). Guarded so a short page no longer throws.
    for (int i = 0; i < 2 && !wbBox.isEmpty(); i++) {
        wbBox.remove(wbBox.size() - 1);
    }

    for (Element e : wbBox) {
        SinaDataInfo sd = new SinaDataInfo();
        // Post id
        String weiboid = e.attr("id");
        // Content
        String content = e.select("span.ctt").text();
        // span.ct holds "<published> <platform>"; split at the first space from index 2.
        String ctText = e.select("span.ct").text();
        int ctSplit = ctText.indexOf(" ", 2);
        // Platform
        String platform = ctText.substring(ctSplit + 1);
        // Address, longitude, latitude and the reposted original text are not extracted.
        String address = "";
        String lon = "";
        String lat = "";
        String origincontent = "";

        // The like / repost / comment links are always in the last div.
        Elements rp = e.select("div").last().select("a");
        // Likes: 4th link from the end
        String praiseStr = rp.get(rp.size() - 4).text();
        System.out.println("rp:" + rp + "praiseStr:" + praiseStr);
        Integer praise = parseBracketedCount(praiseStr);
        // Reposts: 3rd link from the end
        Integer repost = parseBracketedCount(rp.get(rp.size() - 3).text());
        // Comments: 2nd link from the end (the original read the 3rd, i.e. the repost
        // count, a second time)
        Integer wbcomment = parseBracketedCount(rp.get(rp.size() - 2).text());
        // Publication date
        String published = ctText.substring(0, ctSplit);

        // Only reposts contain span.cmt elements; the original post's counters live there.
        Integer originrepost = 0;
        Integer origincomment = 0;
        if (e.select("span").hasClass("cmt")) {
            // Original post's repost count: third span.cmt
            originrepost = parseBracketedCount(e.select("span.cmt").get(2).text());
            // Original post's comment count: first a.cc
            origincomment = parseBracketedCount(e.select("a.cc").get(0).text());
        }

        // origincontent is intentionally not set on the DTO (the setter call was disabled
        // in the original); it is still written as an empty string by the INSERT below.
        sd.setAddress(address)
                .setContent(content)
                .setLat(lat)
                .setLon(lon)
                .setPlatform(platform)
                .setPraise(praise)
                .setPublished(published)
                .setRepost(repost)
                .setWbcomment(wbcomment)
                .setWeiboid(weiboid)
                .setOrigincomment(origincomment)
                .setOriginrepost(originrepost)
                .setWbuid(wbuid)
                .setWbuser(wbuser);
        sdv.add(sd);

        if (jdbcTemplate != null) {
            int updates = jdbcTemplate.update("insert into weiboinfo"
                    + " (wbuser,wbuid,weiboid,content,html,platform,address,lon,lat,origincontent"
                    + ",praise,repost,wbcomment,originrepost,origincomment,published) value(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
                    wbuser, wbuid, weiboid, content, null, platform, address, lon, lat, origincontent,
                    praise, repost, wbcomment, originrepost, origincomment, published);
            if (updates == 1) {
                System.out.println("mysql插入成功");
            }
        }
    }
    return sdv;
}

/**
 * Parses the integer between the first '[' and the first ']' of a label such as
 * {@code "xx[12]"}.
 *
 * @throws RuntimeException if the brackets are missing or do not enclose an integer
 */
private int parseBracketedCount(String label) {
    return Integer.parseInt(label.substring(label.indexOf("[") + 1, label.indexOf("]")));
}
看着很麻煩吧,我也覺得,但這已經比自己解析頁面簡單得太多了。
因爲好用,所以分享。碼字不易,轉發請註明出處:
http://blog.csdn.net/qq_28945021/article/details/52300736