首先是分析網頁
從頁面可以看出,所有的鏈接都在class="f1 bm"標籤之內
之後找到headers,獲取User-Agent
因此headers可以寫成如下:
# Request headers copied from a logged-in browser session.
# NOTE(review): the Cookie value is a personal session token — it will
# expire and should not be committed to source control; load it from
# configuration instead.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'cZBD_2132_saltkey=JFG36F5m; cZBD_2132_lastvisit=1592714390; Hm_lvt_2504f1c08c3a31e74bbfb16ecaff376b=1592717993; cZBD_2132_adclose_88=1; cZBD_2132_sid=ChElwH; cZBD_2132_st_t=0%7C1592718787%7C7cfe33c6fc8493fe312e796a99ca0dad; cZBD_2132_forum_lastvisit=D_97_1592718341D_83_1592718519D_277_1592718765D_86_1592718787; cZBD_2132_ulastactivity=596177hNivTSB%2FriI9%2FBHBSYoKbt4EeT91XI5SC5R2lZRKqPI5v5; cZBD_2132_auth=f040j2lAGWalqGmrBuzDI4Q9veLHnWl21UOHi031c%2BuUvxUmZx%2FAH5hH7r7pHpWt8L1RyLHPKrol3N69FKxPpDfE8Tg; cZBD_2132_lastcheckfeed=485953%7C1592719011; cZBD_2132_lip=218.92.226.20%2C1592718764; cZBD_2132_nofavfid=1; cZBD_2132_onlineusernum=2271; cZBD_2132_noticeTitle=1; cZBD_2132_ignore_notice=1; Hm_lpvt_2504f1c08c3a31e74bbfb16ecaff376b=1592719183; cZBD_2132_lastact=1592719260%09search.php%09forum',
    'Host': 'cskaoyan.com',
    'Referer': 'http://cskaoyan.com/search.php?mod=forum&searchid=145&orderby=lastpost&ascdesc=desc&searchsubmit=yes&kw=%B4%F3%D1%A7',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'
}
函數入口如下:
# Keywords to filter thread titles by; one .xls file is written per keyword.
s = ['2020','考研']
def main():
    """Entry point: collect board URLs, scrape all threads, export matches."""
    url = "http://www.cskaoyan.com/"
    # Collect the detail-page URL of every board on the front page.
    detail_url = getdetailurl(url)
    # Scrape every board; results accumulate in the module-level ``data``.
    Get_all(detail_url)
    for s_num in s:
        savepath = s_num + '.xls'
        find_tag(data, savepath, s_num)
url是王道論壇的網站首頁
detail_url是獲取首頁class="f1 bm"標籤之內的所有href鏈接,如下所示:
它的每一個鏈接都是class="fl_g"
# Collect every board link from the front page.
def getdetailurl(url):
    """Return the absolute URLs of the boards inside class="fl_g" blocks."""
    selector = comp(url)
    datalist = selector.xpath('//*[@class="fl_g"]/dl/dt/a/@href')
    # datalist = res.xpath('//*[@class="fl_g"]/dl/dt/a/@href').extract()
    for item in datalist:
        # Hrefs are relative; join them onto the site root.
        new_url = "http://www.cskaoyan.com/" + item
        details_url.append(new_url)
    return details_url
details_url用於保存首頁的所有標題鏈接,其中comp(url)是用來解析網頁
# Page fetch + parse helper.
def comp(url):
    """Fetch *url* with the global headers and return an lxml HTML tree."""
    # NOTE(review): no timeout — a stalled connection will hang the crawler.
    html_data = requests.get(url=url, headers=headers)
    # Use the detected encoding; the page's declared charset is unreliable.
    html_data.encoding = html_data.apparent_encoding
    html = html_data.text
    selector = etree.HTML(html)
    return selector
接下來是依次遍歷所有的鏈接
def Get_all(detail_url):
    """Scrape every board URL in *detail_url*, each from its first page."""
    for item in detail_url:
        # Every board is crawled starting from page 1.
        print("準備爬取{}".format(item))
        Get_all_href(item)
Get_all_href用於獲取每一個首頁鏈接裏面所有子頁面的鏈接,如下圖所示:
獲取每一頁的鏈接代碼如下:
def Get_all_href(url):
    """Scrape one board page, then recurse into the next page if any.

    Appends [title, link] pairs to the module-level ``data`` list.
    NOTE(review): each page is downloaded three times (here, inside
    ``next_page`` and inside ``get_next_url``), and per-page recursion
    can exhaust Python's recursion limit on long boards.
    """
    # Does a further page exist?
    panduan = next_page(url)
    # Grab every thread href and title text on the current page.
    selector = comp(url)
    # Fuzzy class query: match both class="common" and class="new"
    # (lxml's parse of this site yields either class on thread rows).
    url_result = selector.xpath('//*[contains(@class, "common") or contains(@class, "new")]/a[last()]/@href')
    title_result = selector.xpath('//*[contains(@class, "common") or contains(@class, "new")]/a[last()]/text()')
    for temp in range(len(url_result)):
        title_url = 'http://cskaoyan.com/' + url_result[temp]
        title = title_result[temp]
        result = [title,title_url]
        data.append(result)
    # A next page exists: recurse into it.
    if panduan:
        # URL of the next page.
        url = 'http://www.cskaoyan.com/' + get_next_url(url)
        Get_all_href(url)
    # Last page: stop.
    else:
        return
這裏的url_result和title_result用於保存獲取到的鏈接和標題,保存到result列表中.
url_result = selector.xpath('//*[contains(@class, "common") or contains(@class, "new")]/a[last()]/@href')
title_result = selector.xpath('//*[contains(@class, "common") or contains(@class, "new")]/a[last()]/text()')
這兩條語句的寫法採用了xpath的模糊查詢,因爲用etree.HTML解析出來的網頁會出現和原網頁不同的情況如下:
我們發現他的class變了,原網頁是class="common",用etree.HTML解析完之後變成了class="new"
所以此網頁我用了模糊查詢,凡是有common和new的都爬下來(缺點是有可能會爬到無關東西,由於此處我看了下網頁沒有其他的class有common或者new字段,所以沒問題)
接下來分析,如何判斷是否有下一頁:代碼如下
def next_page(url):
    """Return True when the pager's last link reads "下一頁" (next page).

    NOTE(review): implicitly returns None when the pager has no links —
    callers rely on None being falsy. The local ``next`` also shadows
    the builtin of the same name.
    """
    selector = comp(url)
    # The "next page" anchor is always the last link in the class="pg" pager.
    next = selector.xpath('//*[@class="pg"]/a[last()]/text()')
    if (len(next)):
        if (next[0] == "下一頁"):
            return True
        else:
            return False
此處的xpath爲何要這樣寫呢?
因爲通過分析網頁發現“下一頁”的a標籤總是在最後
所以我們通過xpath獲取其文本內容和"下一頁"對比,如果是對的,則返回True
而如何獲取下一頁的URL(如當前在第3頁,我們應該獲取第4頁的URL)呢?
通過分析網頁,我們發現下一頁的鏈接就在"下一頁"的a標籤內,所以直接獲取
獲取下一頁的鏈接,然後和url拼接
# Fetch the href of the "next page" link.
def get_next_url(url):
    """Return the relative URL held by the pager's last anchor.

    NOTE(review): raises IndexError when no pager link exists; callers
    are expected to have checked ``next_page`` first.
    """
    selector = comp(url)
    next_url = selector.xpath('//*[@class="pg"]/a[last()]/@href')
    return next_url[0]
至此,我們獲取到了所有的url鏈接和標題
接下來根據查找相關關鍵字,把查找到的放到excel中,代碼如下:
def find_tag(datalist,savepath,strr):
    """Filter entries whose title contains *strr*, then save them."""
    # datalist holds every [title, link] element gathered in ``data``.
    savedata = []
    print('尋找{}:'.format(strr))
    for item in datalist:
        # item[0] is the thread title.
        if strr in item[0]:
            savedata.append(item)
    save_data(savedata,savepath,strr)
def save_data(data,savepath,strr):
    """Write [title, link] rows to an .xls file at *savepath* (sheet *strr*)."""
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    worksheet = workbook.add_sheet(strr, cell_overwrite_ok=True)
    col = ("標題","鏈接")
    # Header row at row 0.
    for i in range(0, 2):
        worksheet.write(0, i, col[i])
    # Data rows start at row 1, below the header.
    for i in range(len(data)):
        print("第%d條數據寫入完畢!" % (i + 1))
        datalist = data[i]
        for j in range(0, 2):
            worksheet.write(i + 1, j, datalist[j])
    workbook.save(savepath)
至此,我們獲取到了具有關鍵字的標題和其URL鏈接。
完整代碼如下:
import requests
from lxml import etree
import xlwt
# Browser-captured request headers sent with every request.
# NOTE(review): the Cookie is a personal session token — it expires and
# should be loaded from configuration, not committed to source control.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'cZBD_2132_saltkey=JFG36F5m; cZBD_2132_lastvisit=1592714390; Hm_lvt_2504f1c08c3a31e74bbfb16ecaff376b=1592717993; cZBD_2132_adclose_88=1; cZBD_2132_sid=ChElwH; cZBD_2132_st_t=0%7C1592718787%7C7cfe33c6fc8493fe312e796a99ca0dad; cZBD_2132_forum_lastvisit=D_97_1592718341D_83_1592718519D_277_1592718765D_86_1592718787; cZBD_2132_ulastactivity=596177hNivTSB%2FriI9%2FBHBSYoKbt4EeT91XI5SC5R2lZRKqPI5v5; cZBD_2132_auth=f040j2lAGWalqGmrBuzDI4Q9veLHnWl21UOHi031c%2BuUvxUmZx%2FAH5hH7r7pHpWt8L1RyLHPKrol3N69FKxPpDfE8Tg; cZBD_2132_lastcheckfeed=485953%7C1592719011; cZBD_2132_lip=218.92.226.20%2C1592718764; cZBD_2132_nofavfid=1; cZBD_2132_onlineusernum=2271; cZBD_2132_noticeTitle=1; cZBD_2132_ignore_notice=1; Hm_lpvt_2504f1c08c3a31e74bbfb16ecaff376b=1592719183; cZBD_2132_lastact=1592719260%09search.php%09forum',
    'Host': 'cskaoyan.com',
    'Referer': 'http://cskaoyan.com/search.php?mod=forum&searchid=145&orderby=lastpost&ascdesc=desc&searchsubmit=yes&kw=%B4%F3%D1%A7',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'
}
# Absolute URLs of every board linked from the front page.
details_url = []
# Accumulated [title, link] pairs from every scraped board page.
data = []
# Keywords to filter titles by; one .xls file is written per keyword.
s = ['2020','考研']
def main():
    """Entry point: gather board URLs, scrape every thread list, then
    export the threads matching each keyword in ``s`` to its own .xls
    file."""
    home = "http://www.cskaoyan.com/"
    # Detail-page URLs of every board on the front page.
    board_urls = getdetailurl(home)
    # Walk all pages of every board; results accumulate in ``data``.
    Get_all(board_urls)
    for keyword in s:
        find_tag(data, keyword + '.xls', keyword)
#獲取首頁所有的標題鏈接
def getdetailurl(url):
    """Return the absolute URLs of every board linked on the front page.

    Board links live in class="fl_g" blocks; each relative href is
    joined onto the site root. The links are still appended to the
    module-level ``details_url`` list for backward compatibility, but a
    fresh list is returned so that calling this function more than once
    no longer returns an ever-growing global full of duplicates (the
    original returned ``details_url`` itself).
    """
    selector = comp(url)
    hrefs = selector.xpath('//*[@class="fl_g"]/dl/dt/a/@href')
    links = ["http://www.cskaoyan.com/" + href for href in hrefs]
    details_url.extend(links)
    return links
def Get_all(detail_url):
    """Scrape every board URL in *detail_url*, each starting at page 1."""
    for board in detail_url:
        # Announce which board is crawled next.
        print("準備爬取{}".format(board))
        Get_all_href(board)
def Get_all_href(url):
    """Scrape every page of the board starting at *url*, appending
    [title, link] pairs to the module-level ``data`` list.

    Fixes over the original implementation:
    - each page is fetched once instead of three times (the old code
      downloaded it separately for the pagination check, the scrape,
      and the next-page lookup);
    - iteration replaces per-page recursion, so long boards cannot hit
      Python's recursion limit;
    - ``zip`` pairs titles with hrefs safely even when the two XPath
      result lists differ in length (indexing could raise IndexError).
    """
    # Fuzzy class match: lxml's parse of this site yields class="common"
    # on some thread rows and class="new" on others, so match both.
    row_xpath = '//*[contains(@class, "common") or contains(@class, "new")]/a[last()]'
    while url:
        selector = comp(url)
        hrefs = selector.xpath(row_xpath + '/@href')
        titles = selector.xpath(row_xpath + '/text()')
        for title, href in zip(titles, hrefs):
            data.append([title, 'http://cskaoyan.com/' + href])
        # The "next page" anchor, when present, is the last link in the
        # class="pg" pager.
        pager_text = selector.xpath('//*[@class="pg"]/a[last()]/text()')
        if pager_text and pager_text[0] == "下一頁":
            next_href = selector.xpath('//*[@class="pg"]/a[last()]/@href')
            url = 'http://www.cskaoyan.com/' + next_href[0]
        else:
            # No further pages.
            url = None
def next_page(url):
    """Return True if the pager on *url*'s page ends with a "下一頁"
    (next page) link, False otherwise.

    The original fell through and implicitly returned None when the
    pager had no links; this returns False explicitly. The local is
    also renamed so it no longer shadows the ``next`` builtin.
    """
    selector = comp(url)
    # The "next page" anchor is always the last link in the pager.
    last_link_text = selector.xpath('//*[@class="pg"]/a[last()]/text()')
    return bool(last_link_text) and last_link_text[0] == "下一頁"
#頁面解析
def comp(url):
    """Fetch *url* with the session headers and return an lxml element
    tree ready for XPath queries.

    Improvements: a request timeout (the original call could hang
    forever on a stalled connection) and a status check so HTTP errors
    raise immediately instead of being parsed as pages.
    """
    response = requests.get(url=url, headers=headers, timeout=15)
    response.raise_for_status()  # fail fast on 4xx/5xx
    # Trust the detected encoding; the page's declared charset is
    # unreliable on this site.
    response.encoding = response.apparent_encoding
    return etree.HTML(response.text)
#獲得下一個頁面的鏈接
def get_next_url(url):
    """Return the relative href held by the pager's last anchor.

    Assumes the caller has already confirmed via ``next_page`` that the
    last pager link is the "next page" anchor.
    """
    hrefs = comp(url).xpath('//*[@class="pg"]/a[last()]/@href')
    return hrefs[0]
#保存數據
def save_data(data, savepath, strr):
    """Write [title, link] rows to an .xls workbook at *savepath*,
    using *strr* as the sheet name."""
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = workbook.add_sheet(strr, cell_overwrite_ok=True)
    # Header row at row 0.
    headings = ("標題","鏈接")
    for col_idx, heading in enumerate(headings):
        sheet.write(0, col_idx, heading)
    # Data rows follow, starting at row 1.
    for row_idx, row in enumerate(data, start=1):
        print("第%d條數據寫入完畢!" % row_idx)
        for col_idx in range(2):
            sheet.write(row_idx, col_idx, row[col_idx])
    workbook.save(savepath)
def find_tag(datalist, savepath, strr):
    """Select the [title, link] entries whose title contains the
    keyword *strr* and export them to *savepath*."""
    print('尋找{}:'.format(strr))
    # Element [0] of each entry is the thread title.
    matches = [entry for entry in datalist if strr in entry[0]]
    save_data(matches, savepath, strr)
# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    main()