关于微信公众号文章的评论数,网上的教程是可以用的,这里就不另外讲了。这里要说的是我抓阅读量的过程,太tm坎坷了,足足花了我10个小时,干到半夜12点半有想法了又起来开机!!不过好在最终实现了全代码运行,不需要模拟器或者手机之类的。不过初始的token参数需要用fiddler抓取一下PC端微信。
首先,前面抓评论数的经验是微信安卓端和PC端的请求是不一样的,需要综合两者的请求一起抓。
然后,该带的信息必须全部带上,一个post十几个参数,一个都不能少! 我就是在这个问题上浪费了太多时间。
最后,善用chrome调试,多把中间的页面保存下来分析,上面的js你看不明白就对比下chrome发的请求和你自己拼接出来的请求一不一样。
最后测试结果,还有部分文章没抓出来,估计是我哪里还有bug。然后超过10w的阅读量是只会显示100001的。从搜索到获取信息全部由python独立完成,连selenium都没用。不过如果不追求速度用selenium会非常简单!
参考代码
# coding=utf8
import json
import re
import time
from datetime import datetime
import urllib
import requests
import random
class WxMps(object):
    """Crawler for WeChat Official Account articles, comments and read counts.

    All tickets/tokens are short-lived; capture fresh values from the PC
    WeChat client with a proxy tool (e.g. Fiddler) before each run.
    """

    # User-Agent of the browser embedded in PC WeChat; with it the server
    # treats a plain requests session as the WeChat client.
    _WECHAT_UA = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/3.53.1159.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat'

    def __init__(self, _biz, _pass_ticket, _app_msg_token, _cookie, _offset=0):
        """
        :param _biz: url-encoded ``__biz`` identifier of the official account
        :param _pass_ticket: non-permanent ticket sniffed from PC WeChat
        :param _app_msg_token: non-permanent token sniffed from PC WeChat
        :param _cookie: raw ``Cookie`` header string sniffed from PC WeChat
        :param _offset: article-list paging offset to start from
        """
        self.offset = _offset
        self.biz = _biz
        self.msg_token = _app_msg_token
        self.pass_ticket = _pass_ticket
        self.headers = {'Cookie': _cookie, 'User-Agent': self._WECHAT_UA}
        self.session = requests.session()
        # BUG FIX: the original passed the *headers* dict to
        # cookiejar_from_dict, which created bogus cookies literally named
        # "Cookie"/"User-Agent". Parse the real cookie string instead.
        cookie_pairs = dict(
            part.strip().split('=', 1)
            for part in _cookie.split(';') if '=' in part)
        self.session.cookies = requests.utils.cookiejar_from_dict(
            cookie_pairs, cookiejar=None, overwrite=True)
        self.session.headers.update({'User-Agent': self._WECHAT_UA})

    def start(self):
        """Page through the account's article list and crawl each article."""
        offset = self.offset
        while True:
            api = 'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={0}&f=json&offset={1}' \
                  '&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={2}&wxtoken=&appmsg_token' \
                  '={3}&x5=1&f=json'.format(self.biz, offset, self.pass_ticket, self.msg_token)
            print(api)
            resp = self.session.get(api, verify=False).json()
            ret, status = resp.get('ret'), resp.get('errmsg')  # status info
            if ret != 0 and status != 'ok':
                # token expired or request rejected: stop paging
                print('Before break , Current offset is %d' % offset)
                break
            offset = resp['next_offset']  # offset for the next request
            msg_list = json.loads(resp['general_msg_list'])['list']  # article list
            for msg in msg_list:
                # comm_msg_info is shared by all articles of one push
                comm_msg_info = msg['comm_msg_info']
                msg_id = comm_msg_info['id']  # article id
                post_time = datetime.fromtimestamp(comm_msg_info['datetime'])  # publish time
                app_msg_ext_info = msg.get('app_msg_ext_info')  # raw article data
                if app_msg_ext_info:
                    # head article of this push
                    self._parse_articles(app_msg_ext_info, msg_id, post_time)
                    # remaining articles of the same push
                    for item in app_msg_ext_info.get('multi_app_msg_item_list') or []:
                        item_id = item['fileid']  # article id
                        if item_id == 0:
                            # some items carry fileid == 0; synthesize a unique
                            # id to avoid unique-index collisions downstream
                            item_id = int(time.time() * 1000)
                        self._parse_articles(item, item_id, post_time)
            print('next offset is %d' % offset)

    @staticmethod
    def _normalize_content_url(content_url):
        """Return *content_url* with '&amp;' unescaped and an https scheme.

        BUG FIX: the original ``.replace('http', 'https')`` corrupted an
        already-https URL into ``httpss://``.
        """
        url = content_url.replace('amp;', '')
        if url.startswith('http://'):
            url = 'https://' + url[len('http://'):]
        return url

    def _parse_articles(self, info, msg_id, post_time):
        """Extract one article's fields and crawl its detail page.

        :param info: dict of one article (app_msg_ext_info or a multi-item entry)
        :param msg_id: article id
        :param post_time: publish datetime of the push
        """
        title = info.get('title')  # title
        cover = info.get('cover')  # cover image url
        author = info.get('author')  # author
        digest = info.get('digest')  # summary/keywords
        source_url = info.get('source_url')  # original-source link
        content_url = info.get('content_url')  # WeChat article url
        if content_url:  # guard: some items have no content_url
            self._parse_article_detail(self._normalize_content_url(content_url), 1)

    def _parse_article_detail(self, content_url, article_id):
        """Fetch an article page and extract the parameters required by the
        comment and read-count APIs.

        :param content_url: https url of the article page
        :param article_id: id of the already-saved article
        """
        try:
            html = self.session.get(content_url, verify=False).text
            # keep a copy of the page for offline debugging
            with open('1.html', 'w', encoding="utf-8") as f:
                f.write(html)
        except Exception as e:
            print('获取评论失败' + content_url)
            print(e)
            return
        # Scrape the embedded JS variables and url parameters.
        str_comment = re.search(r'var comment_id = "(.*)" \|\| "(.*)" \* 1;', html)
        str_msg = re.search(r"var appmsgid = '' \|\| '(.*)'\|\|", html)
        str_token = re.search(r'window.appmsg_token = "(.*)";', html)
        m_mid = re.search(r'mid=(\d*)', content_url)
        m_sn = re.search(r'sn=(\w*)', content_url)
        m_ct = re.search(r'ct = "(.*)";', html)
        m_title = re.search(r'var msg_title = "(.*?)"', html)
        m_req_id = re.search(r'var req_id = \'(.*?)\'', html)
        m_devicetype = re.search(r'var devicetype = "(.*?)"', html)
        m_scene = re.search(r'scene=(\d*)', content_url)
        # BUG FIX: the original called .group(1) on possibly-None matches and
        # crashed with AttributeError whenever a page lacked one of these.
        if not all((str_comment, str_msg, str_token, m_mid, m_sn, m_ct,
                    m_title, m_req_id, m_devicetype, m_scene)):
            print('page parameters missing: ' + content_url)
            return
        comment_id = str_comment.group(1)  # comment id (fixed per article)
        app_msg_id = str_msg.group(1)  # ticket id (non-permanent)
        appmsg_token = str_token.group(1)  # ticket token (non-permanent)
        # every one of these is mandatory for the follow-up requests
        if appmsg_token and app_msg_id and comment_id:
            print('Crawl article comments: ' + content_url)
            # comment crawling kept available but disabled, as in the original:
            # self._crawl_comments(app_msg_id, comment_id, appmsg_token, article_id)
            self._crawl_yuedu(self.pass_ticket, appmsg_token, m_mid.group(1),
                              comment_id, m_sn.group(1), m_ct.group(1),
                              m_title.group(1), m_req_id.group(1),
                              m_devicetype.group(1), m_scene.group(1))

    def _crawl_comments(self, app_msg_id, comment_id, appmsg_token, article_id):
        """Fetch the featured ("elected") comments of one article."""
        api = 'https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&scene=0&__biz={0}' \
              '&appmsgid={1}&idx=1&comment_id={2}&offset=0&limit=100&uin=777&key=777' \
              '&pass_ticket={3}&wxtoken=777&devicetype=android-26&clientversion=26060739' \
              '&appmsg_token={4}&x5=1&f=json'.format(self.biz, app_msg_id, comment_id,
                                                     self.pass_ticket, appmsg_token)
        resp = self.session.get(api, verify=False).json()
        ret, status = resp['base_resp']['ret'], resp['base_resp']['errmsg']
        if ret == 0 or status == 'ok':
            # BUG FIX: 'elected_comment' is absent when an article has no
            # comments; the original KeyError'd here.
            for comment in resp.get('elected_comment') or []:
                nick_name = comment.get('nick_name')  # commenter nickname
                logo_url = comment.get('logo_url')  # avatar
                comment_time = datetime.fromtimestamp(comment.get('create_time'))  # comment time
                content = comment.get('content')  # comment text
                content_id = comment.get('content_id')  # comment id
                like_num = comment.get('like_num')  # like count
                # reply_list = comment.get('reply')['reply_list']  # replies
                print(nick_name, like_num)

    def _crawl_yuedu(self, pass_ticket, appmsg_token, mid, comment_id, sn, ct,
                     title, req_id, devicetype, scene):
        """POST getappmsgext to fetch the read/like counts of one article.

        NOTE: the API caps the reported read count — anything above 100k is
        returned as 100001.
        """
        api = 'https://mp.weixin.qq.com/mp/getappmsgext?f=json&mock=&uin=777&key=777&pass_ticket=&wxtoken=777&devicetype={0}&clientversion=62060833&__biz={1}&appmsg_token={2}&f=json&x5=0'.format(
            urllib.parse.quote(devicetype), self.biz, appmsg_token)
        data = {
            'r': random.random(),
            # BUG FIX: was hard-coded to one account's biz; decode the
            # url-encoded value held on self so any account works.
            '__biz': urllib.parse.unquote(self.biz),
            'appmsg_type': 9,
            'mid': mid,
            'sn': sn,
            'idx': 1,
            # NOTE(review): scraped `scene` is ignored; 27 matched the sniffed
            # request — confirm whether the real value matters.
            'scene': 27,
            'title': urllib.parse.quote(title),
            # BUG FIX: was frozen at 1569809116; use the publish timestamp
            # scraped from the article page.
            'ct': ct,
            'abtest_cookie': None,
            # NOTE(review): scraped `devicetype` is ignored here — confirm
            # whether the API accepts the page value.
            'devicetype': 'Windows10',
            'version': 62060833,
            'is_need_ticket': 1,
            'is_need_ad': 0,
            'comment_id': comment_id,
            'is_need_reward': 0,
            'both_ad': 0,
            'reward_uin_count': 0,
            'send_time': None,
            'msg_daily_idx': 1,
            'is_original': 0,
            'is_only_read': 1,
            'req_id': req_id,
            'pass_ticket': None,
            'is_temp_url': 0,
            'item_show_type': 0,
            'tmp_version': 1,
            'more_read_type': 0,
            'appmsg_like_type': 2,
            'related_video_sn': None,
            'vid': None
        }
        time.sleep(2)  # throttle a little to lower the ban risk
        resp = self.session.post(api, verify=False, data=data).json()
        print(resp)
if __name__ == '__main__':
    # All four values below are session-specific: re-capture them with a
    # packet sniffer (e.g. Fiddler against the PC WeChat client) for every
    # account and every run — and close the sniffer before starting, or the
    # requests will be routed through its dead proxy.
    biz = 'MjM5NzI3NDg4MA%3D%3D'  # the "People's Daily Online" (人民网) account
    pass_ticket = 'nu+69QK2jsmCmAKjjqp998SuOZpRzA6cyxmu6F7xHCw/P/IoHxF8WWrrCwyBi8sI'
    app_msg_token = '1028_XeSDhMAkMLG3l5xAQax7E9jA7h-o1-KWlAPWVIuxzgtokiEOFK89u8_U6RsoI9OdXdBBLcgfxvc156Mz'
    cookie = 'pgv_pvid=7670766920; pgv_pvi=6227352576; RK=uBrUka7cTI; ptcz=366b89eceeb512317f19bb9082a8579afc611ad8f3546e3ab65722bbd9f2ece8; wxuin=464351360; lang=zh_CN; rewardsn=; wxtokenkey=777; devicetype=Windows10; version=62060833; pass_ticket=nu+69QK2jsmCmAKjjqp998SuOZpRzA6cyxmu6F7xHCw/P/IoHxF8WWrrCwyBi8sI; wap_sid2=CIDhtd0BElxQRHJSQ1JBV2ZRbEo2djhoWVA4UjRTOVZjai0zR3Z1YUlDYjJ6VURUb1RGY2hJOXgtejlxakQtem94ZGdrb0ItYlZSTl8yY2pWbmVIaE9nNXFHMUpNUVFFQUFBfjCrt8XsBTgNQAE='

    # Crawl articles, comments and read counts for the account.
    crawler = WxMps(biz, pass_ticket, app_msg_token, cookie)
    crawler.start()