最早接觸爬蟲是利用java寫腳本,後來自學了利用python進行爬蟲來做入門,會用scrapy,最近用了下R,找了幾個不同類型的字段獲取,當作好玩吧。
爬取內容
R代碼
library(XML)
library(RCurl)
library(stringr)
# Extract the novel title from the parsed book page.
# rootNode: XML root node of the book detail page.
# Returns the title text with CR/LF and spaces stripped.
giveNovel_name <- function(rootNode) {
  raw_title <- xpathSApply(rootNode, "//div[@class='title']/h1/text()", xmlValue)
  gsub("([\r\n ])", "", raw_title)
}
# Extract the author name from the parsed book page.
# rootNode: XML root node of the book detail page.
# Returns the author text with CR/LF and spaces stripped.
giveAuthor_name <- function(rootNode) {
  raw_author <- xpathSApply(
    rootNode,
    c("//div[@class='title']/span/a/span/text()"),
    xmlValue
  )
  gsub("([\r\n ])", "", raw_author)
}
# Collect attribute values from the title block's anchor tags
# (xmlAttrs over each matched <a>), then strip whitespace.
# rootNode: XML root node of the book detail page.
# NOTE(review): the trailing "href" argument is forwarded to
# xmlAttrs by xpathSApply; whether it filters to the href
# attribute depends on the XML package version — confirm.
giveUri <- function(rootNode) {
  attr_values <- xpathSApply(
    rootNode,
    c("//div[@class='title']//span//a"),
    xmlAttrs,
    "href"
  )
  gsub("([\r\n\t ])", "", attr_values)
}
# Extract the read (view) count from the score block.
# rootNode: XML root node of the book detail page.
# Returns a list of digit-run strings, one element per matched
# text node, as produced by str_extract_all().
giveRead <- function(rootNode) {
  read_num <- xpathSApply(
    rootNode,
    c("//div[@class='score_txt']/text()[1]"),
    xmlValue
  )
  # "[0-9]+" also matches single-digit counts; the original pattern
  # "[0-9]+[0-9]" required at least two digits and silently dropped
  # values below 10.
  str_extract_all(read_num, "[0-9]+")
}
##頁面內請求獲取評論量
# Extract the review/comment count from the comment-count page
# (the node passed in is parsed from BookComment.aspx, see webData).
# Returns the text content of span#lblReviewCnt.
giveReply <- function(rootNode) {
  xpathSApply(
    rootNode,
    c("//div[@class='data']//b//span[@id='lblReviewCnt']//text()"),
    xmlValue
  )
}
# Scrape one Qidian book page and return a one-row data.frame with
# the tracked fields (title, author, link, read count, comment
# count, capture date).
# URL: a book detail page, e.g. http://www.qidian.com/Book/3548786.aspx
webData <- function(URL) {
  # Polite crawling: random 1-2 second pause between requests.
  Sys.sleep(runif(1, 1, 2))
  wp <- getURL(URL, .encoding = "UTF-8")  # site is served as UTF-8
  doc <- htmlParse(wp, asText = TRUE, encoding = "UTF-8")
  rootNode <- xmlRoot(doc)
  # The numeric book id embedded in the URL drives the separate
  # comment-count request below.
  book_id <- str_extract_all(URL, "[0-9]+")[[1]]
  # Build the in-page comment-count endpoint for this book
  # (paste0 replaces the original paste + gsub(" ", "") dance).
  url2 <- paste0("http://c.pingba.qidian.com/BookComment.aspx?BookId=", book_id)
  sub_wp <- getURL(url2, .encoding = "UTF-8")
  sub_doc <- htmlParse(sub_wp, asText = TRUE, encoding = "UTF-8")
  sub_rootNode <- xmlRoot(sub_doc)
  date <- Sys.Date()
  data.frame(
    novel_name = giveNovel_name(rootNode),
    author_name = giveAuthor_name(rootNode),
    # [3, 1] picks one attribute value out of the xmlAttrs result;
    # presumably the novel's own link — confirm against a live page.
    uri = giveUri(rootNode)[3, 1],
    read_num = as.numeric(giveRead(rootNode)),
    # FIX: `likenum` was never defined anywhere in the original
    # script, so this column errored at run time. Record NA until
    # the monthly-like extraction is recovered.
    # TODO: restore the real monthly-like xpath.
    month_likenum = NA,
    population = giveReply(sub_rootNode),
    updatetime = date  # capture date
  )
}
上述完成了爬取的具體內容,作品的url可能是多個,可以進行批量抓取
## Smoke test with a single URL ##
URL <- "http://www.qidian.com/Book/3548786.aspx"
# Scrape one book page into a one-row data.frame.
info <- webData(URL)
# Append the row to the local CSV (no header, comma separated).
write.table(info, "F:\\數據收集\\qidian_literature.csv", append = TRUE,
            col.names = FALSE, row.names = FALSE, sep = ",")
#### Batch processing ####
# Read the whole URL list up front instead of holding an open
# connection across network calls — the original never closed the
# connection if webData() errored mid-loop, leaking the handle.
urls <- readLines("F:\\數據收集\\qidian_urls.csv")
for (line in urls) {
  # Skip blank lines; the original would have passed "" to webData().
  if (nchar(trimws(line)) == 0) next
  # Scrape this book page into a one-row data.frame.
  info <- webData(line)
  # Append the row to the local CSV (no header, comma separated).
  write.table(info, "F:\\數據收集\\qidian_literature.csv", append = TRUE,
              col.names = FALSE, row.names = FALSE, sep = ",")
}