# "Life is short, I use Python" (人生苦短, 我用Python)
# NOTE: this line was previously a bare expression of two undefined
# identifiers, which raised NameError as soon as the script ran.
# 數據挖掘
import requests
# 數據清洗
from lxml import etree
# 其他
import random
import time
''' 分析翻頁規律
https://www.telnote.cn/xiaohua/baoxiao/list_1.htm
https://www.telnote.cn/xiaohua/baoxiao/list_2.htm
https://www.telnote.cn/xiaohua/baoxiao/list_3.htm
# <meta name="description" content="笑話內容.......">
'''
# Base URL of the "hilarious jokes" listing; the page number is appended
# to form list_1.htm, list_2.htm, ... (see the pagination note above).
url = "https://www.telnote.cn/xiaohua/baoxiao/list_"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}

# Running count of downloaded jokes.  Renamed from `sum`, which shadowed
# the builtin of the same name.
joke_count = 0

# Open the output file ONCE and let the context manager close it, instead
# of re-opening (and leaking) a file handle for every single joke.  An
# explicit encoding avoids UnicodeEncodeError on a non-GBK locale.
with open(r"E:\測試python效果\爬取的笑話.txt", "a", encoding="utf-8") as out_file:
    for page in range(1, 4):  # crawl listing pages 1-3
        list_url = url + str(page) + ".htm"
        # The site serves GBK-encoded HTML; decode defensively so a stray
        # byte cannot abort the whole crawl.
        resp = requests.get(list_url, headers=headers)
        list_tree = etree.HTML(resp.content.decode("gbk", errors="replace"))
        # Each joke link sits in <dd class="content"><h1><a href="...">.
        anchors = list_tree.xpath('//dd[@class="content"]/h1/a')
        # Distinct loop variable: the original reused `i` here, shadowing
        # the outer page index.
        for anchor in anchors:
            detail_url = "https://www.telnote.cn" + anchor.get("href")
            resp = requests.get(detail_url, headers=headers)
            detail_tree = etree.HTML(resp.content.decode("gbk", errors="replace"))
            # The joke text is carried in <meta name="description" content="...">.
            for meta in detail_tree.xpath('//meta[@name="description"]'):
                text = meta.get("content")
                if not text:
                    # A meta tag without content would have crashed the
                    # original `data + "\n"` concatenation with TypeError.
                    continue
                joke_count += 1
                print("正在下載第", joke_count, "個笑話")
                out_file.write(text + "\n")
            # Polite crawl delay — `random` and `time` were imported for
            # this but never actually used in the original.
            time.sleep(random.uniform(0.5, 1.5))