python抓取微信公众号文章阅读量

  关于微信公众号文章的评论数,网上的教程是可以用的,这里就不另外讲了。这里要说的是我抓阅读量的过程,太tm坎坷了,足足花了我10个小时,干到半夜12点半有想法了又起来开机!!不过好在最终实现了全代码运行,不需要模拟器或者手机之类的。不过初始的token参数需要fiddler抓取一下pc端微信。

  首先,前面抓评论数的经验是微信安卓端和PC端的请求是不一样的,需要综合两者的请求一起抓。

  然后,该带的信息必须全部带上,一个post十几个参数,一个都不能少! 我就是在这个问题上浪费了太多时间。

  最后,善用chrome调试,多把中间的页面保存下来分析,上面的js你看不明白就对比下chrome发的请求和你自己拼接出来的请求一不一样。

最后测试结果,还有部分文章没抓出来,估计是我哪里还有bug。然后超过10w的阅读量是只会显示100001的。从搜索到获取信息全部由python独立完成,连selenium都没用。不过如果不追求速度用selenium会非常简单!
(图:运行结果截图)

参考代码

# coding=utf8
import json
import random
import re
import time
import urllib.parse  # code below uses urllib.parse; plain "import urllib" does not guarantee it
from datetime import datetime

import requests
class WxMps(object):
    """Crawler for WeChat Official Account articles, read counts and comments.

    The initial tickets (pass_ticket / appmsg_token / cookie) are not fixed;
    capture them from the PC WeChat client with a proxy tool such as Fiddler
    before each run.
    """

    # User-Agent of the built-in WeChat browser on Windows; with this UA,
    # mp.weixin.qq.com treats a plain HTTP client like the WeChat client.
    _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/3.53.1159.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat'

    def __init__(self, _biz, _pass_ticket, _app_msg_token, _cookie, _offset=0):
        """
        :param _biz: URL-encoded __biz identifier of the official account
        :param _pass_ticket: session ticket (non-fixed, from packet capture)
        :param _app_msg_token: appmsg_token ticket (non-fixed, from capture)
        :param _cookie: raw ``Cookie`` request-header string from PC WeChat
        :param _offset: article-list paging offset to start crawling from
        """
        self.offset = _offset
        self.biz = _biz
        self.msg_token = _app_msg_token
        self.pass_ticket = _pass_ticket
        self.session = requests.session()
        self.headers = {
            'Cookie': _cookie,
            'User-Agent': self._USER_AGENT,
        }
        # BUGFIX: the original passed the *headers* dict to
        # requests.utils.cookiejar_from_dict(), which produced bogus cookies
        # literally named "Cookie" and "User-Agent" and never sent the real
        # captured cookies.  Sending the captured cookie string as a genuine
        # Cookie header on the session is what was intended.
        self.session.headers.update(self.headers)

    def start(self):
        """Page through the account's article list via the profile_ext API."""
        offset = self.offset
        while True:
            api = ('https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={0}&f=json&offset={1}'
                   '&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={2}&wxtoken=&appmsg_token'
                   '={3}&x5=1&f=json').format(self.biz, offset, self.pass_ticket, self.msg_token)
            print(api)
            resp = self.session.get(api, verify=False).json()
            ret, status = resp.get('ret'), resp.get('errmsg')  # status info
            if ret == 0 or status == 'ok':
                offset = resp['next_offset']  # offset for the next request
                msg_list = json.loads(resp['general_msg_list'])['list']  # article list
                for msg in msg_list:
                    comm_msg_info = msg['comm_msg_info']  # shared by all articles of one push
                    msg_id = comm_msg_info['id']  # article id
                    post_time = datetime.fromtimestamp(comm_msg_info['datetime'])  # publish time
                    app_msg_ext_info = msg.get('app_msg_ext_info')  # article payload
                    if not app_msg_ext_info:
                        continue
                    # First article of this push.
                    self._parse_articles(app_msg_ext_info, msg_id, post_time)
                    # Remaining articles of the same push, if any.
                    for item in app_msg_ext_info.get('multi_app_msg_item_list') or []:
                        item_id = item['fileid']  # article id
                        if item_id == 0:
                            # Some articles come with fileid == 0; synthesize a
                            # unique id to avoid unique-index collisions.
                            item_id = int(time.time() * 1000)
                        self._parse_articles(item, item_id, post_time)
                print('next offset is %d' % offset)
            else:
                print('Before break , Current offset is %d' % offset)
                break

    def _parse_articles(self, info, msg_id, post_time):
        """Extract article fields from one push item and crawl its detail page."""
        # Fields below are retained for future DB persistence (not used yet).
        title = info.get('title')  # title
        cover = info.get('cover')  # cover image
        author = info.get('author')  # author
        digest = info.get('digest')  # summary/keywords
        source_url = info.get('source_url')  # original ("read more") link
        content_url = info.get('content_url')  # article URL on mp.weixin.qq.com

        if not content_url:  # deleted / unavailable articles carry no URL
            return
        # Unescape '&amp;' and force https.  BUGFIX: the original did
        # content_url.replace('http', 'https'), which turns an already-https
        # URL into 'httpss://...'.
        content_url = content_url.replace('amp;', '')
        if content_url.startswith('http://'):
            content_url = 'https://' + content_url[len('http://'):]

        self._parse_article_detail(content_url, 1)

    def _parse_article_detail(self, content_url, article_id):
        """Fetch the article page and pull the tokens needed by the
        read-count / comment APIs out of its inline JavaScript.

        :param content_url: https URL of the article
        :param article_id: id of the already-saved article row
        """
        try:
            html = self.session.get(content_url, verify=False).text
            with open('1.html', 'w', encoding="utf-8") as f:
                # Keep a copy of the last fetched page for offline debugging.
                f.write(html)
        except Exception as e:
            print('获取评论失败' + content_url)
            print(e)
            return
        # Tokens embedded in the page's inline JS, plus params from the URL.
        str_comment = re.search(r'var comment_id = "(.*)" \|\| "(.*)" \* 1;', html)
        str_msg = re.search(r"var appmsgid = '' \|\| '(.*)'\|\|", html)
        str_token = re.search(r'window.appmsg_token = "(.*)";', html)
        mid = re.search(r'mid=(\d*)', content_url)
        sn = re.search(r'sn=(\w*)', content_url)
        ct = re.search(r'ct = "(.*)";', html)
        title = re.search(r'var msg_title = "(.*?)"', html)
        req_id = re.search(r'var req_id = \'(.*?)\'', html)
        devicetype = re.search(r'var devicetype = "(.*?)"', html)
        scene = re.search(r'scene=(\d*)', content_url)
        # BUGFIX: the original called .group(1) before checking the matches,
        # so any missing token raised AttributeError and killed the crawl.
        if not all((str_comment, str_msg, str_token, mid, sn, ct, title,
                    req_id, devicetype, scene)):
            print('页面参数缺失,跳过: ' + content_url)
            return
        comment_id = str_comment.group(1)  # comment id (fixed per article)
        app_msg_id = str_msg.group(1)  # ticket id (non-fixed)
        appmsg_token = str_token.group(1)  # ticket token (non-fixed)

        # All three are required by the downstream APIs.
        if appmsg_token and app_msg_id and comment_id:
            print('Crawl article comments: ' + content_url)
            # self._crawl_comments(app_msg_id, comment_id, appmsg_token, article_id)
            self._crawl_yuedu(self.pass_ticket, appmsg_token, mid.group(1),
                              comment_id, sn.group(1), ct.group(1),
                              title.group(1), req_id.group(1),
                              devicetype.group(1), scene.group(1))

    def _crawl_comments(self, app_msg_id, comment_id, appmsg_token, article_id):
        """Fetch the featured ("elected") comments of one article."""
        api = ('https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&scene=0&__biz={0}'
               '&appmsgid={1}&idx=1&comment_id={2}&offset=0&limit=100&uin=777&key=777'
               '&pass_ticket={3}&wxtoken=777&devicetype=android-26&clientversion=26060739'
               '&appmsg_token={4}&x5=1&f=json').format(self.biz, app_msg_id, comment_id,
                                                       self.pass_ticket, appmsg_token)
        resp = self.session.get(api, verify=False).json()
        ret, status = resp['base_resp']['ret'], resp['base_resp']['errmsg']
        if ret == 0 or status == 'ok':
            # BUGFIX: 'elected_comment' is absent for articles without
            # comments; default to an empty list instead of raising KeyError.
            for comment in resp.get('elected_comment') or []:
                nick_name = comment.get('nick_name')  # commenter nickname
                logo_url = comment.get('logo_url')  # avatar URL
                comment_time = datetime.fromtimestamp(comment.get('create_time'))  # comment time
                content = comment.get('content')  # comment text
                content_id = comment.get('content_id')  # comment id
                like_num = comment.get('like_num')  # like count
                # reply_list = comment.get('reply')['reply_list']  # replies
                print(nick_name, like_num)

    def _crawl_yuedu(self, pass_ticket, appmsg_token, mid, comment_id, sn, ct,
                     title, req_id, devicetype, scene):
        """POST mp/getappmsgext to obtain the article's read/like counts.

        Note: WeChat caps the reported read count — anything above 100000 is
        returned as 100001.
        """
        api = ('https://mp.weixin.qq.com/mp/getappmsgext?f=json&mock=&uin=777&key=777'
               '&pass_ticket=&wxtoken=777&devicetype={0}&clientversion=62060833&__biz={1}'
               '&appmsg_token={2}&f=json&x5=0').format(
                   urllib.parse.quote(devicetype), self.biz, appmsg_token)
        data = {
            'r': random.random(),
            # BUGFIX: '__biz' and 'ct' were hard-coded to one sample account
            # and one fixed timestamp; use the configured biz (decoded, since
            # requests form-encodes values itself) and the ct scraped from the
            # article page.
            '__biz': urllib.parse.unquote(self.biz),
            'appmsg_type': 9,
            'mid': mid,
            'sn': sn,
            'idx': 1,
            'scene': 27,
            'title': urllib.parse.quote(title),
            'ct': ct,
            'abtest_cookie': None,
            # NOTE(review): the URL uses the scraped devicetype while this
            # field stays 'Windows10' as in the working capture — confirm
            # whether it should use the scraped value too.
            'devicetype': 'Windows10',
            'version': 62060833,
            'is_need_ticket': 1,
            'is_need_ad': 0,
            'comment_id': comment_id,
            'is_need_reward': 0,
            'both_ad': 0,
            'reward_uin_count': 0,
            'send_time': None,
            'msg_daily_idx': 1,
            'is_original': 0,
            'is_only_read': 1,
            'req_id': req_id,
            'pass_ticket': None,
            'is_temp_url': 0,
            'item_show_type': 0,
            'tmp_version': 1,
            'more_read_type': 0,
            'appmsg_like_type': 2,
            'related_video_sn': None,
            'vid': None,
        }
        time.sleep(2)  # throttle requests to reduce the risk of being blocked
        resp = self.session.post(api, verify=False, data=data).json()
        print(resp)



if __name__ == '__main__':
    # All four values below are account- and session-specific: re-capture
    # them with a packet sniffer (e.g. Fiddler) for each target account, and
    # close the sniffer before running this script.
    biz = 'MjM5NzI3NDg4MA%3D%3D'  # "People's Daily Online"
    pass_ticket = 'nu+69QK2jsmCmAKjjqp998SuOZpRzA6cyxmu6F7xHCw/P/IoHxF8WWrrCwyBi8sI'
    app_msg_token = '1028_XeSDhMAkMLG3l5xAQax7E9jA7h-o1-KWlAPWVIuxzgtokiEOFK89u8_U6RsoI9OdXdBBLcgfxvc156Mz'
    cookie = 'pgv_pvid=7670766920; pgv_pvi=6227352576; RK=uBrUka7cTI; ptcz=366b89eceeb512317f19bb9082a8579afc611ad8f3546e3ab65722bbd9f2ece8; wxuin=464351360; lang=zh_CN; rewardsn=; wxtokenkey=777; devicetype=Windows10; version=62060833; pass_ticket=nu+69QK2jsmCmAKjjqp998SuOZpRzA6cyxmu6F7xHCw/P/IoHxF8WWrrCwyBi8sI; wap_sid2=CIDhtd0BElxQRHJSQ1JBV2ZRbEo2djhoWVA4UjRTOVZjai0zR3Z1YUlDYjJ6VURUb1RGY2hJOXgtejlxakQtem94ZGdrb0ItYlZSTl8yY2pWbmVIaE9nNXFHMUpNUVFFQUFBfjCrt8XsBTgNQAE='

    # Start crawling articles and their comments / read counts.
    crawler = WxMps(biz, pass_ticket, app_msg_token, cookie)
    crawler.start()

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章