首先是分析网页
从页面可以看出,所有的链接都在class="f1 bm"标签之内
之后找到headers,获取User-Agent
因此headers可以写成如下:
# HTTP request headers captured from a logged-in browser session.
# NOTE(review): the Cookie value carries a live forum session token — it will
# expire and should not be committed to a public repository.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'cZBD_2132_saltkey=JFG36F5m; cZBD_2132_lastvisit=1592714390; Hm_lvt_2504f1c08c3a31e74bbfb16ecaff376b=1592717993; cZBD_2132_adclose_88=1; cZBD_2132_sid=ChElwH; cZBD_2132_st_t=0%7C1592718787%7C7cfe33c6fc8493fe312e796a99ca0dad; cZBD_2132_forum_lastvisit=D_97_1592718341D_83_1592718519D_277_1592718765D_86_1592718787; cZBD_2132_ulastactivity=596177hNivTSB%2FriI9%2FBHBSYoKbt4EeT91XI5SC5R2lZRKqPI5v5; cZBD_2132_auth=f040j2lAGWalqGmrBuzDI4Q9veLHnWl21UOHi031c%2BuUvxUmZx%2FAH5hH7r7pHpWt8L1RyLHPKrol3N69FKxPpDfE8Tg; cZBD_2132_lastcheckfeed=485953%7C1592719011; cZBD_2132_lip=218.92.226.20%2C1592718764; cZBD_2132_nofavfid=1; cZBD_2132_onlineusernum=2271; cZBD_2132_noticeTitle=1; cZBD_2132_ignore_notice=1; Hm_lpvt_2504f1c08c3a31e74bbfb16ecaff376b=1592719183; cZBD_2132_lastact=1592719260%09search.php%09forum',
    'Host': 'cskaoyan.com',
    'Referer': 'http://cskaoyan.com/search.php?mod=forum&searchid=145&orderby=lastpost&ascdesc=desc&searchsubmit=yes&kw=%B4%F3%D1%A7',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'
}
函数入口如下:
# Search keywords: every scraped thread title containing one of these strings
# is exported to "<keyword>.xls".
s = ['2020','考研']


def main():
    """Crawl the forum front page, collect every thread title/URL into the
    module-level ``data`` list, then export keyword matches to Excel."""
    url = "http://www.cskaoyan.com/"
    # Collect the board URLs linked from the front page.
    detail_url = getdetailurl(url)
    # Walk every board (and its pagination), saving results into ``data``.
    Get_all(detail_url)
    for s_num in s:
        savepath = s_num + '.xls'
        find_tag(data, savepath, s_num)
url是王道论坛的网站首页
detail_url是获取首页class="f1 bm"标签之内的所有href链接,如下所示:
它的每一个链接都是class="fl_g"
# Collect all board links from the forum front page.
def getdetailurl(url):
    """Return the absolute URL of every board linked inside the
    class="fl_g" elements of the front page at ``url``.

    The URLs are appended to the module-level ``details_url`` list,
    which is also returned.
    """
    selector = comp(url)
    datalist = selector.xpath('//*[@class="fl_g"]/dl/dt/a/@href')
    # datalist = res.xpath('//*[@class="fl_g"]/dl/dt/a/@href').extract()
    for item in datalist:
        new_url = "http://www.cskaoyan.com/" + item
        details_url.append(new_url)
    return details_url
details_url用于保存首页的所有标题链接;其中comp(url)用于下载并解析网页,返回可供XPath查询的selector对象。
# Page download + parse helper.
def comp(url):
    """Fetch ``url`` with the shared ``headers`` and return an lxml element
    tree ready for XPath queries."""
    html_data = requests.get(url=url, headers=headers)
    # Use the encoding detected from the body so Chinese pages decode
    # correctly instead of requests' ISO-8859-1 fallback.
    html_data.encoding = html_data.apparent_encoding
    html = html_data.text
    selector = etree.HTML(html)
    return selector
接下来是依次遍历所有的链接
def Get_all(detail_url):
    """Crawl every board URL in ``detail_url``; scraped entries accumulate
    in the module-level ``data`` list."""
    for item in detail_url:
        # Each board is crawled starting from its first page.
        print("准备爬取{}".format(item))
        Get_all_href(item)
Get_all_href用于获取每一个版块链接里面所有子页面(帖子)的标题和链接,如:
获取每一页的链接代码如下:
def Get_all_href(url):
    """Scrape thread titles/links from the board page ``url`` and follow the
    "next page" link recursively until the last page.

    Each scraped entry is appended to the module-level ``data`` list as
    ``[title, absolute_url]``.
    """
    # Check up front whether this page has a "next page" link.
    panduan = next_page(url)
    # Grab every thread href and title text on the current page.
    selector = comp(url)
    # Fuzzy class match: the lxml-parsed tree can show class="new" where the
    # original page has class="common", so both are accepted.
    url_result = selector.xpath('//*[contains(@class, "common") or contains(@class, "new")]/a[last()]/@href')
    title_result = selector.xpath('//*[contains(@class, "common") or contains(@class, "new")]/a[last()]/text()')
    for temp in range(len(url_result)):
        title_url = 'http://cskaoyan.com/' + url_result[temp]
        title = title_result[temp]
        result = [title,title_url]
        data.append(result)
    # Recurse into the next page when one exists.
    if panduan:
        # Absolute URL of the next page of this board.
        url = 'http://www.cskaoyan.com/' + get_next_url(url)
        Get_all_href(url)
    # Last page: stop.
    else:
        return
这里的url_result和title_result分别保存获取到的链接和标题,二者按下标配对组成result列表后追加到data中。
url_result = selector.xpath('//*[contains(@class, "common") or contains(@class, "new")]/a[last()]/@href')
title_result = selector.xpath('//*[contains(@class, "common") or contains(@class, "new")]/a[last()]/text()')
这两条语句的写法采用了xpath的模糊查询,因为用etree.HTML解析出来的网页会出现和原网页不同的情况如下:
我们发现他的class变了,原网页是class="common",用etree.HTML解析完之后变成了class="new"
所以此网页我用了模糊查询,凡是有common和new的都爬下来(缺点是有可能会爬到无关东西,由于此处我看了下网页没有其他的class有common或者new字段,所以没问题)
接下来分析,如何判断是否有下一页:代码如下
def next_page(url):
    """Return True when the last pagination link on ``url`` reads "下一页"
    ("next page"), False when it reads something else."""
    selector = comp(url)
    # The "next page" anchor, when present, is always the last <a> inside
    # the class="pg" pagination bar.
    next = selector.xpath('//*[@class="pg"]/a[last()]/text()')
    if (len(next)):
        if (next[0] == "下一页"):
            return True
        else:
            return False
    # NOTE(review): falls through to an implicit None when the page has no
    # pagination bar at all — callers rely only on its falsiness.
此处的xpath为何要这样写呢?
因为通过分析网页发现“下一页”的a标签总是在最后
所以我们通过xpath获取其文本内容和"下一页"对比,如果是对的,则返回True
而如何获取下一页的URL(如当前在第3页,我们应该获取第4页的URL)呢?
通过分析网页,我们发现下一页的链接就在"下一页"的a标签内,所以直接获取
获取下一页的链接,然后和url拼接
# Get the link to the next page.
def get_next_url(url):
    """Return the (relative) href of the last pagination link on ``url``.

    Only called after ``next_page`` confirmed that link is "下一页", so
    indexing ``[0]`` is expected to succeed.
    """
    selector = comp(url)
    next_url = selector.xpath('//*[@class="pg"]/a[last()]/@href')
    return next_url[0]
至此,我们获取到了所有的url链接和标题
接下来根据查找相关关键字,把查找到的放到excel中,代码如下:
def find_tag(datalist,savepath,strr):
    """Filter ``datalist`` ([title, url] pairs) to entries whose title
    contains the keyword ``strr`` and export them to ``savepath``."""
    # datalist holds all elements of the module-level ``data`` list.
    savedata = []
    print('寻找{}:'.format(strr))
    for item in datalist:
        # item[0] is the thread title; keep entries containing the keyword.
        if strr in item[0]:
            savedata.append(item)
    save_data(savedata,savepath,strr)
def save_data(data,savepath,strr):
    """Write the [title, url] rows in ``data`` to an .xls workbook at
    ``savepath``; ``strr`` becomes the worksheet name."""
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    worksheet = workbook.add_sheet(strr, cell_overwrite_ok=True)
    # Header row: column captions.
    col = ("标题","链接")
    for i in range(0, 2):
        worksheet.write(0, i, col[i])
    # One spreadsheet row per scraped entry, starting below the header.
    for i in range(len(data)):
        print("第%d条数据写入完毕!" % (i + 1))
        datalist = data[i]
        for j in range(0, 2):
            worksheet.write(i + 1, j, datalist[j])
    workbook.save(savepath)
至此,我们获取到了具有关键字的标题和其URL链接。
完整代码如下:
import requests
from lxml import etree
import xlwt
# HTTP request headers captured from a logged-in browser session.
# NOTE(review): the Cookie value carries a live forum session token — it will
# expire and should not be committed to a public repository.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'cZBD_2132_saltkey=JFG36F5m; cZBD_2132_lastvisit=1592714390; Hm_lvt_2504f1c08c3a31e74bbfb16ecaff376b=1592717993; cZBD_2132_adclose_88=1; cZBD_2132_sid=ChElwH; cZBD_2132_st_t=0%7C1592718787%7C7cfe33c6fc8493fe312e796a99ca0dad; cZBD_2132_forum_lastvisit=D_97_1592718341D_83_1592718519D_277_1592718765D_86_1592718787; cZBD_2132_ulastactivity=596177hNivTSB%2FriI9%2FBHBSYoKbt4EeT91XI5SC5R2lZRKqPI5v5; cZBD_2132_auth=f040j2lAGWalqGmrBuzDI4Q9veLHnWl21UOHi031c%2BuUvxUmZx%2FAH5hH7r7pHpWt8L1RyLHPKrol3N69FKxPpDfE8Tg; cZBD_2132_lastcheckfeed=485953%7C1592719011; cZBD_2132_lip=218.92.226.20%2C1592718764; cZBD_2132_nofavfid=1; cZBD_2132_onlineusernum=2271; cZBD_2132_noticeTitle=1; cZBD_2132_ignore_notice=1; Hm_lpvt_2504f1c08c3a31e74bbfb16ecaff376b=1592719183; cZBD_2132_lastact=1592719260%09search.php%09forum',
    'Host': 'cskaoyan.com',
    'Referer': 'http://cskaoyan.com/search.php?mod=forum&searchid=145&orderby=lastpost&ascdesc=desc&searchsubmit=yes&kw=%B4%F3%D1%A7',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'
}
# Absolute URLs of every board linked from the front page.
details_url = []
# All scraped entries, each a [title, absolute_url] pair.
data = []
# Search keywords; one "<keyword>.xls" file is produced per entry.
s = ['2020','考研']
def main():
    """Entry point: crawl the forum, then export keyword-matching threads.

    Fills the module-level ``data`` list via ``Get_all`` and writes one
    Excel file per keyword in ``s``.
    """
    home = "http://www.cskaoyan.com/"
    # Board URLs discovered on the front page.
    boards = getdetailurl(home)
    # Crawl every board; [title, url] pairs accumulate in ``data``.
    Get_all(boards)
    # One spreadsheet per search keyword.
    for keyword in s:
        find_tag(data, keyword + '.xls', keyword)
# Collect all board links from the forum front page.
def getdetailurl(url):
    """Append the absolute URL of every board found inside the front page's
    class="fl_g" elements to ``details_url`` and return that list."""
    page = comp(url)
    hrefs = page.xpath('//*[@class="fl_g"]/dl/dt/a/@href')
    for href in hrefs:
        details_url.append("http://www.cskaoyan.com/" + href)
    return details_url
def Get_all(detail_url):
    """Crawl each board URL in ``detail_url``; scraped entries accumulate
    in the module-level ``data`` list."""
    for board_url in detail_url:
        # Every board is crawled from its first page onward.
        print("准备爬取{}".format(board_url))
        Get_all_href(board_url)
def Get_all_href(url):
    """Scrape every thread title/link from the board page ``url`` and all
    following pages, appending ``[title, absolute_url]`` entries to the
    module-level ``data`` list.

    Fixes over the original:
    - pagination is walked iteratively instead of recursively, so boards
      with many pages can no longer blow Python's recursion limit;
    - hrefs and titles are paired with ``zip`` instead of parallel
      indexing, so a row with an href but no text (or vice versa) can no
      longer raise IndexError.
    """
    while True:
        # Determine up front whether this page links to a next page.
        has_next = next_page(url)
        selector = comp(url)
        # Fuzzy class match: the lxml-parsed tree can show class="new" where
        # the original page has class="common", so both are accepted.
        url_result = selector.xpath('//*[contains(@class, "common") or contains(@class, "new")]/a[last()]/@href')
        title_result = selector.xpath('//*[contains(@class, "common") or contains(@class, "new")]/a[last()]/text()')
        for href, title in zip(url_result, title_result):
            data.append([title, 'http://cskaoyan.com/' + href])
        if not has_next:
            return
        # Move on to the next page of the same board.
        url = 'http://www.cskaoyan.com/' + get_next_url(url)
def next_page(url):
    """Return True when the pagination bar on ``url`` has a "下一页"
    ("next page") link, False otherwise.

    Fixes over the original: the builtin name ``next`` is no longer
    shadowed, and the no-pagination-bar case now returns an explicit False
    instead of falling through to an implicit None (both are falsy, so
    callers behave identically).
    """
    selector = comp(url)
    # The "next page" anchor, when present, is always the last <a> inside
    # the class="pg" pagination bar.
    labels = selector.xpath('//*[@class="pg"]/a[last()]/text()')
    return bool(labels) and labels[0] == "下一页"
# Page download + parse helper.
def comp(url):
    """Download ``url`` with the shared ``headers`` and return an lxml
    element tree ready for XPath queries.

    Fix over the original: a timeout is supplied, since ``requests.get``
    waits indefinitely by default and a stalled connection would hang the
    whole crawl.
    """
    html_data = requests.get(url=url, headers=headers, timeout=30)
    # Use the encoding detected from the body so Chinese pages decode
    # correctly instead of requests' ISO-8859-1 fallback.
    html_data.encoding = html_data.apparent_encoding
    return etree.HTML(html_data.text)
# Get the link to the next page.
def get_next_url(url):
    """Return the relative href of the last pagination link on ``url``.

    Only called after ``next_page`` confirmed that link is "下一页", so
    indexing the first match is expected to succeed.
    """
    hrefs = comp(url).xpath('//*[@class="pg"]/a[last()]/@href')
    return hrefs[0]
# Save scraped entries to disk.
def save_data(data,savepath,strr):
    """Write the [title, url] rows in ``data`` to an .xls workbook at
    ``savepath``; ``strr`` becomes the worksheet name.

    A header row ("标题", "链接") is written first; data rows follow.
    """
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    worksheet = workbook.add_sheet(strr, cell_overwrite_ok=True)
    # Header row: column captions.
    for col_idx, caption in enumerate(("标题","链接")):
        worksheet.write(0, col_idx, caption)
    # One spreadsheet row per scraped entry, starting below the header.
    for row_idx, row in enumerate(data):
        print("第%d条数据写入完毕!" % (row_idx + 1))
        for col_idx in range(2):
            worksheet.write(row_idx + 1, col_idx, row[col_idx])
    workbook.save(savepath)
def find_tag(datalist,savepath,strr):
    """Filter ``datalist`` ([title, url] pairs) to entries whose title
    contains the keyword ``strr`` and export them to ``savepath``."""
    print('寻找{}:'.format(strr))
    # item[0] is the thread title; keep entries containing the keyword.
    matches = [item for item in datalist if strr in item[0]]
    save_data(matches, savepath, strr)
# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    main()