需求分析
- 最近新型肺炎肆虐,寒假在家閒來無事,就突發奇想:爬取新浪微博中與肺炎患者自救有關的帖子,以excel形式保存起來做一些分析。
- 經過一番折騰,最終使用webMagic框架成功爬取了200個微博頁面共3263條帖子。
- 立圖爲證
環境搭建
- 首先創建一個空的maven工程。
- 在pom.xml文件引入相關依賴
<dependencies>
<!-- Apache POI: read/write Excel files -->
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.15</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.15</version>
</dependency>
<!-- WebMagic crawler framework (core + extension modules) -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
<!-- Guava, pulled in for the Bloom filter -->
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>25.1-jre</version>
</dependency>
</dependencies>
實現步驟
- 因爲自己爬取的是肺炎患者自助相關帖子,爬蟲的入口鏈接爲 https://s.weibo.com/weibo?q=肺炎患者自助&Refer=index&page=1。
- 我們知道微博必須要登錄才能訪問,如果直接爬取鏈接爬到的只是空頁面。爲了方便,這裏直接僞裝一個登錄態。
- 首先登錄微博,然後在瀏覽器調試窗口抓取cookie信息。
- 然後將cookie信息添加到請求頭中,僞裝一個登錄態。
//Excerpt: crawler Site configuration that fakes a logged-in session by replaying browser cookies.
//NOTE(review): the cookie values below are session data captured from a real login; they will
//expire and should be replaced with your own rather than reused or committed.
Site site = Site.me()
.setCharset("utf8") //character encoding
.setTimeOut(10*1000) //request timeout
.setRetrySleepTime(3000) //interval between retries
.setRetryTimes(3) //number of retries
//cookies captured from the browser's debugging tools
.addCookie("s.weibo.com", "ALF","1612149474")
.addCookie("s.weibo.com", "Apache","8262060252464.451.1580612613221")
.addCookie("s.weibo.com", "SINAGLOBAL","8262060252464.451.1580612613221")
.addCookie("s.weibo.com", "SSOLoginState","1580613475")
.addCookie("s.weibo.com", "SUB","_2A25zMk80DeRhGeNJ7FsT8ivOyT-IHXVQRif8rDV8PUNbmtAKLWfMkW9NS7UYSlen74ogE4N7bsE8rkiUf8KC-h9R")
.addCookie("s.weibo.com", "SUBP","0033WrSXqPxfM725Ws9jqgMF55529P9D9WWvS_evbaDmZal0ML-HEL_A5JpX5KzhUgL.Fo-NS0.Eeo-Eeoe2dJLoI05LxKnLBoqL1h-LxKMLB.2LBKMLxK-LB.BLBK.LxKnL1K.LBo.LxKMLBoeL1Kq7ehqt")
.addCookie("s.weibo.com", "SUHB","0u5RC7NpfZ1RL0")
.addCookie("s.weibo.com", "ULV","1580612613262:1:1:1:8262060252464.451.1580612613221:")
.addCookie("s.weibo.com", "UOR",",,graph.qq.com")
.addCookie("s.weibo.com", "_s_tentry","s.weibo.com")
.addCookie("s.weibo.com", "wb_view_log_5779225223","1920*10801")
.addCookie("s.weibo.com", "wvr","6")
.addCookie("weibo.com", "Ugrow-G0","589da022062e21d675f389ce54f2eae7")
.addCookie("weibo.com", "YF-Page-G0","70942dbd611eb265972add7bc1c85888|1580610997|1580610997")
.addCookie("weibo.com", "YF-V5-G0","99df5c1ecdf13307fb538c7e59e9bc9d")
- 爲了降低因請求過於頻繁而被封ip的風險,這裏在每次請求之間隨機間隔一定時間。
//After every request, put the thread to sleep for a random 0-5 seconds.
Random random = new Random();
//Thread.sleep takes milliseconds, so draw from 0..5000 ms (nextInt(5) would only give 0-4 ms).
int anInt = random.nextInt(5001);
try {
    Thread.sleep(anInt);
} catch (InterruptedException e) {
    //Restore the interrupt flag so the code running this thread can still see the interruption.
    Thread.currentThread().interrupt();
}
/**
* 爬取關鍵詞搜索的帖子 1-200頁 保存爲txt文件
*/
public class MyProcess implements PageProcessor {
public static String url = "https://s.weibo.com/weibo?q=肺炎患者自助&Refer=index&page=";//請求url
public static int maxpage = 200; //標識最大頁數
public static FileWriter wr = null;
public static BufferedWriter out = null;
static {
try {
wr = new FileWriter("肺炎患者自助.txt");//文件保存路徑
} catch (IOException e) {
e.printStackTrace();
}
out = new BufferedWriter(wr);
}
public static int count = 1; //當前獲取了多少數據
public static int thispage = 1; //標識當前請求到第幾頁
public void process(Page page) {
System.out.println(page.getHtml());
List<Selectable> list = page.getHtml().css("#pl_feedlist_index div.card-wrap").nodes();
//獲取當前頁面所有帖子
for(Selectable crad:list){
Document doc = Jsoup.parse(crad.toString());
//獲取編號 獲取發帖人
String persion = doc.select("a.name").first().attr("nick-name");
System.out.println("編號:"+count+"發帖人:"+persion);
//獲取發帖人微博
String weibo = crad.css("div[class=avator]").links().get();
//獲取帖子內容
String text = doc.select("p[node-type=feed_list_content]").text();
String text_full = doc.select("p[node-type=feed_list_content_full]").text();
String content = text_full.equals("")?text:text_full;
//獲取時間
String time = doc.select("p[class=from]").first().text();
//獲取
String zan = doc.select("div[class=card-act]").text();
try {
out.write("編號:"+count);
out.newLine();
out.write("作者:"+persion);
out.newLine();
out.write("鏈接:"+weibo);
out.newLine();
out.write("內容:"+content);
out.newLine();
out.write("時間:"+time);
out.newLine();
out.write("相關:"+zan);
out.newLine();
out.flush();
} catch (IOException e) {
e.printStackTrace();
}
count++;
}
//在第一頁請求完成後把第2-200頁加入到任務隊列
if(thispage==1){
for(int i =2;i<=maxpage;i++){
page.addTargetRequest(url+i);
}
}
System.out.println("第"+thispage+"頁 請求完成");
thispage++;
//每請求一次 隨機讓線程休眠0-5秒
Random random = new Random();
int anInt = random.nextInt(5);
try {
Thread.sleep(anInt);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
private Site site = Site.me()
.setCharset("utf8") //設置編碼
.setTimeOut(10*1000) //設置超時時間
.setRetrySleepTime(3000) //設置重試的間隔時間
.setRetryTimes(3) //設置重試的次數
//添加抓包獲取的cookie信息
.addCookie("s.weibo.com", "ALF","1612149474")
.addCookie("s.weibo.com", "Apache","8262060252464.451.1580612613221")
.addCookie("s.weibo.com", "SINAGLOBAL","8262060252464.451.1580612613221")
.addCookie("s.weibo.com", "SSOLoginState","1580613475")
.addCookie("s.weibo.com", "SUB","_2A25zMk80DeRhGeNJ7FsT8ivOyT-IHXVQRif8rDV8PUNbmtAKLWfMkW9NS7UYSlen74ogE4N7bsE8rkiUf8KC-h9R")
.addCookie("s.weibo.com", "SUBP","0033WrSXqPxfM725Ws9jqgMF55529P9D9WWvS_evbaDmZal0ML-HEL_A5JpX5KzhUgL.Fo-NS0.Eeo-Eeoe2dJLoI05LxKnLBoqL1h-LxKMLB.2LBKMLxK-LB.BLBK.LxKnL1K.LBo.LxKMLBoeL1Kq7ehqt")
.addCookie("s.weibo.com", "SUHB","0u5RC7NpfZ1RL0")
.addCookie("s.weibo.com", "ULV","1580612613262:1:1:1:8262060252464.451.1580612613221:")
.addCookie("s.weibo.com", "UOR",",,graph.qq.com")
.addCookie("s.weibo.com", "_s_tentry","s.weibo.com")
.addCookie("s.weibo.com", "wb_view_log_5779225223","1920*10801")
.addCookie("s.weibo.com", "wvr","6")
.addCookie("weibo.com", "Ugrow-G0","589da022062e21d675f389ce54f2eae7")
.addCookie("weibo.com", "YF-Page-G0","70942dbd611eb265972add7bc1c85888|1580610997|1580610997")
.addCookie("weibo.com", "YF-V5-G0","99df5c1ecdf13307fb538c7e59e9bc9d")
//添加請求頭,僞裝瀏覽器請求
.addHeader("User-Agent",
"ozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80" +
" Safari/537.36 Core/1.47.516.400 QQBrowser/9.4.8188.400")
.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
.addHeader("Accept-Encoding", "gzip, deflate, sdch")
.addHeader("Accept-Language", "zh-CN,zh;q=0.8")
.addHeader("Connection", "keep-alive")
.addHeader("Referer", "https://s.weibo.com");
;
public Site getSite() {
return site;
}
//主函數,執行爬蟲
public static void main(String[] args) {
//請求微博關鍵詞搜索界面第一頁
Spider.create(new MyProcess())
.addUrl(url+1)
// .addPipeline(new ExcelPipeline())
// .thread(5) //表示開啓5個線程來完成任務
.setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(1000*1000)))//設置布隆過濾器,最多對100w數據進行去重
.run();
}
}
- 因爲業務邏輯比較簡單,爲了方便沒有使用Pipeline,直接每解析一條帖子就寫入到txt文件中。
- 爬取結束後再將txt文件數據寫入到excel文件即可。
/**
 * Reads the posts from the txt file and saves them to an Excel (.xls) file.
 *
 * <p>The input is consumed six lines at a time; each group of six lines is one post and
 * becomes one row with six cells on sheet "A". The first three characters of every line
 * (the label and its colon) are dropped. An empty input produces an empty sheet, and an
 * incomplete trailing group of fewer than six lines is ignored.
 *
 * <p>NOTE(review): this reads "關鍵詞搜索.txt"; confirm that this is the file the crawler
 * run actually produced.
 *
 * @throws Exception if the input cannot be read or the workbook cannot be written
 */
public static void save() throws Exception {
    final int linesPerPost = 6;
    // try-with-resources closes the reader, workbook and output stream even when a step fails.
    try (BufferedReader in = new BufferedReader(new FileReader("關鍵詞搜索.txt"));
         Workbook wb = new HSSFWorkbook();
         FileOutputStream fos = new FileOutputStream("關鍵詞搜索.xls")) {
        Sheet sheetA = wb.createSheet("A");
        // Index of the next row to create in the sheet.
        int num = 0;
        String[] record = new String[linesPerPost];
        while (readRecord(in, record)) {
            System.out.println(num);
            // The row is created only once a complete post has been read, so no empty
            // row is left behind at the end of the sheet.
            Row row = sheetA.createRow(num);
            for (int col = 0; col < linesPerPost; col++) {
                Cell cell = row.createCell(col);
                cell.setCellValue(stripLabel(record[col]));
            }
            num++;
        }
        wb.write(fos);
        System.out.println("寫數據結束!");
    }
}

/** Fills {@code record} with the next lines from {@code in}; returns false when the input ends first. */
private static boolean readRecord(BufferedReader in, String[] record) throws java.io.IOException {
    for (int i = 0; i < record.length; i++) {
        record[i] = in.readLine();
        if (record[i] == null) {
            return false;
        }
    }
    return true;
}

/** Drops the three-character label prefix from a line; lines shorter than that yield "". */
private static String stripLabel(String line) {
    return line.length() > 3 ? line.substring(3) : "";
}