requests库的使用
一、简介
二、发起请求
三、接收响应
四、session对象
五、练习
http://wz.sun0769.com/index.php/question/questionType?type=4
爬取投诉帖子的编号、帖子的url、帖子的标题和帖子里的内容，并将内容写入到json文件中。
import re
import requests
import json
def request(url, headers=None, timeout=10, encoding="gbk"):
    """Send an HTTP GET to *url* and return the response.

    Args:
        url: Address to request.
        headers: Optional mapping of HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up; passed
            straight through to ``requests.get``.
        encoding: Charset assigned to the response so that ``res.text``
            is decoded with it. Defaults to ``"gbk"``, the value that was
            previously hard-coded here.

    Returns:
        The ``requests.Response`` object with its ``encoding`` attribute
        set to *encoding*.

    Raises:
        requests.exceptions.Timeout: If the server does not respond
            within *timeout* seconds.
    """
    # Without an explicit timeout, requests will wait on a silent server
    # indefinitely, hanging the whole script.
    res = requests.get(url, headers=headers, timeout=timeout)
    res.encoding = encoding
    return res
# HTTP headers sent with the listing-page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36',
    'Cookie': 'NSC_wt_xa.tvo0769.dpn=ffffffffc3a0145d45525d5f4f58455e445a4a423660'
}
url_1 = "http://wz.sun0769.com/index.php/question/questionType?type=4"
res = request(url_1, headers=headers)

# Match the complaint numbers on the listing page.
num = re.findall(r'<td width="53" height="30" align="center" bgcolor="#FFFFFF">(\d+?)</td>', res.text)
print(num)

# Match each post's URL and title.
url_titles = re.findall(r'\[投诉\]</a> <a href="(.*?)" title="(.*?)"', res.text, re.S)
print(url_titles)

# zip() pairs every (url, title) with its complaint number and stops at the
# shorter list, so a page where fewer numbers than links were matched no
# longer raises IndexError part-way through.
for i, ((url, title), number) in enumerate(zip(url_titles, num)):
    # NOTE(review): the detail page is fetched without the headers used for
    # the listing page -- confirm that this is intended.
    content = request(url)
    # Match the complaint body from the page's description <meta> tag.
    # re.findall returns a list, and that list is stored as-is below.
    text = re.findall(r'<meta name="description" content="(.+?)"', content.text, re.S)
    complaint = {'投诉': {'编号': number, 'url': url, '标题': title, '内容': text}}
    # ensure_ascii=False writes the Chinese text verbatim, so the file
    # encoding is pinned to UTF-8 instead of the platform default.
    with open(r"D:\Python学院学习环境\pachong\complaints\complaint{}.json".format(i), "w", encoding="utf-8") as f:
        json.dump(complaint, f, ensure_ascii=False, indent=4)
效果图