Crawling Weibo keyword search results with Python

# coding:utf-8

import re
import random
import requests
import time
import hashlib
import json
import MySQLdb
import multiprocessing
from datetime import datetime, timedelta
from urllib.parse import quote as urlquote  # standard-library equivalent of django.utils.http.urlquote

mysql_config = {"host": "*****8",
                    "port": ,
                    'user': "root",
                    "passwd": "***8888",
                    "db": "won",
                    "charset": "utf8"}


# Mobile User-Agent strings (m.weibo.cn expects a mobile client); one is picked at
# random for each request
PC_UAS = [
    'MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',
    'Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10',
    'Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0',
    'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
    'Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+',
    'Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999',
    'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
    'Mozilla/5.0 (Linux; Android 7.1.1; MI 6 Build/NMF26X; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/6.2 TBS/043807 Mobile Safari/537.36 MicroMessenger/6.6.1.1220(0x26060135) NetType/WIFI Language/zh_CN',
    'Mozilla/5.0 (Linux; Android 7.1.1; OD103 Build/NMF26F; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.6.1.1220(0x26060135) NetType/4G Language/zh_CN',
    'Mozilla/5.0 (Linux; Android 6.0.1; SM919 Build/MXB48T; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.6.1.1220(0x26060135) NetType/WIFI Language/zh_CN',
    'Mozilla/5.0 (Linux; Android 5.1.1; vivo X6S A Build/LMY47V; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.6.1.1220(0x26060135) NetType/WIFI Language/zh_CN'
]


TRY_COUNT = 5


# Year to assume when a post date like "01-02" omits it
current_year = datetime.today().strftime("%Y")


def conv_time(t):
    """Convert Weibo's relative or absolute time strings to a Unix timestamp."""
    if "剛剛" in t:
        return int(time.time())

    nums = re.findall(r'\d+', t)
    if not nums:
        return 0
    num = int(nums[0])

    s = None
    if u'秒' in t:
        s = datetime.now() - timedelta(seconds=num)
    elif u'分鐘' in t:
        s = datetime.now() - timedelta(minutes=num)
    elif u'小時' in t:
        s = datetime.now() - timedelta(hours=num)
    elif u'天' in t:
        s = datetime.now() - timedelta(days=num)
    else:
        len_time = len(t.split("-"))
        if len_time == 3:                    # e.g. "2018-01-02"
            s = datetime.strptime(t, "%Y-%m-%d")
        elif len_time == 2:                  # e.g. "01-02": assume the current year
            t += ", " + current_year
            s = datetime.strptime(t, "%m-%d, %Y")

    try:
        timestamp = int(time.mktime(s.timetuple()))
    except Exception:
        return 0
    else:
        return timestamp
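
# Examples (relative strings are measured from "now"):
#   conv_time("剛剛")        -> current Unix timestamp
#   conv_time("5分鐘前")     -> timestamp five minutes ago
#   conv_time("01-02")       -> timestamp for Jan 2 of the current year
#   conv_time("2018-01-02")  -> timestamp for that date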


def parse_mblog(mblog):
    """Extract the stored fields from a single mblog dict."""
    data_showtime = mblog.get("created_at")
    data_showtime = conv_time(data_showtime) if data_showtime else 0

    b_text = mblog.get("text")

    if b_text:
        title = re.sub("<.*?>", "", b_text)        # strip HTML tags from the post text
        imgs_num = int(mblog.get("pic_num", 0))

        if imgs_num == 1:
            data_imgs = mblog.get("original_pic", "")
        else:
            pics = [i.get("url") for i in mblog.get("pics", [])]
            data_imgs = "|||".join(pics) if pics else ""

        bid = mblog.get("id", "")
        bid_str = mblog.get("bid", "")
    else:
        title = ""
        data_imgs = ""
        bid = ""
        bid_str = ""

    user = mblog.get("user") or {}
    author_name = user.get("screen_name", "")
    author_imgs = user.get("profile_image_url", "")
    author_id = user.get("id", "")
    author_gender = user.get("gender", "")         # the user object exposes "gender"
    author_description = user.get("description", "")

    data_json = json.dumps({"bid_num": bid, "text": b_text, "bid_str": bid_str})

    author_json = json.dumps({"gender": author_gender,
                              "description": author_description})

    return (title, data_imgs, data_showtime, author_id, author_imgs,
            author_name, data_json, author_json)
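
# The keys read above imply a response payload shaped roughly like the sketch below
# (only the fields this parser uses; everything else the API returns is omitted, and
# the values are illustrative, not taken from a real response):
#
#     {
#         "id": "...", "bid": "...", "created_at": "5分鐘前",
#         "text": "<a href=...>#keyword#</a> post body ...",
#         "pic_num": 2,
#         "original_pic": "https://.../large.jpg",
#         "pics": [{"url": "https://.../1.jpg"}, {"url": "https://.../2.jpg"}],
#         "user": {"id": 123, "screen_name": "...", "profile_image_url": "...",
#                  "gender": "m", "description": "..."}
#     }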


def dig_weibo(keyword, page):
    """
    Crawl one page of Weibo keyword-search results and store them in MySQL.
    """
    conn = MySQLdb.connect(**mysql_config)
    cursor = conn.cursor()
    proxies = {'http': 'http://**********', 'https': 'http://***********'}
    pc_headers = {
        'User-Agent': random.choice(PC_UAS),
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
        'Referer': 'https://m.weibo.cn',
        'Connection': 'close',
        'Host': 'm.weibo.cn'
    }
    url = "https://m.weibo.cn/api/container/getIndex?containerid=100103"
    count = 0
    insert_infos = []
    resp_data = {}

    # The query fragment is percent-encoded so it travels inside the containerid
    # parameter instead of being parsed as separate query-string arguments
    req_url = url + urlquote("&type=1&q={}".format(keyword)) + "&page_type=searchall&page={}".format(page)

    while count < TRY_COUNT:
        try:
            # timeout keeps a dead proxy from hanging the crawl indefinitely
            resp_data = requests.get(req_url, headers=pc_headers, proxies=proxies, timeout=10).json()
        except Exception as e:
            time.sleep(0.1)
            print(e)
            count += 1
            continue
        else:
            break

    print(req_url)

    if resp_data.get("ok") != 1:
        return False

    cards = resp_data.get("data", {}).get("cards")

    if not cards:
        return False

    mblogs = []
    hot_wenzhang = []
    for card in cards:
        mblog = card.get("mblog")
        if mblog:
            mblogs.append([card, mblog])
        else:
            card_group = card.get("card_group")
            if not card_group:
                continue
            else:
                for gcard in card_group:
                    mblog = gcard.get("mblog")
                    if mblog:
                        mblogs.append([gcard, mblog])
                    else:
                        # cards carrying a "title_sub" are hot-article cards
                        if gcard.get("title_sub"):
                            hot_wenzhang.append(gcard)

    addtime = int(time.time())
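    # Each "info" row below follows the column order of the crawl_result INSERT:
    # target_url, target_url_md5, addtime, data_title, data_imgs, data_content,
    # data_showtime, data_json, source, source_keywords, state, author_name,
    # author_imgs, author_id, author_json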
    if hot_wenzhang:
        for card in hot_wenzhang:
            title = card.get("title_sub")
            try:
                target_url = card.get("scheme", "").split("?", 1)[0]
            except:
                target_url = ""
                target_url_md5 = ""
            else:
                target_url_md5 = hashlib.md5(target_url.encode("utf8")).hexdigest()
            data_imgs = card.get("pic", "")

            try:
                author_name, wb_time = card.get("desc").split(" ")
                data_showtime = conv_time(wb_time)
            except:
                data_showtime = 0
                author_name = ""

            info = [target_url, target_url_md5, addtime, title, data_imgs,
                    title, data_showtime, "{}", 2, keyword, 0, author_name, "", "", "{}"]
            insert_infos.append(info)

    if mblogs:
        for card, mblog in mblogs:
            title, data_imgs, data_showtime, author_id, author_imgs, \
            author_name, data_json, author_json = parse_mblog(mblog)

            try:
                target_url = card.get("scheme", "").split("?", 1)[0]
            except:
                target_url = ""
                target_url_md5 = ""
            else:
                target_url_md5 = hashlib.md5(target_url.encode("utf8")).hexdigest()

            info = [target_url, target_url_md5, addtime, title, data_imgs,
                    title, data_showtime, data_json, 2, keyword, 0, author_name,
                    author_imgs, author_id, author_json]
            insert_infos.append(info)

    if insert_infos:
        for info in insert_infos:

            if info[3] is None or info[0].startswith("sinanews"):
                continue

            target_url_md5 = info[1]
            cursor.execute("select source_keywords from crawl_result where target_url_md5=%s", (target_url_md5,))
            kw_info = cursor.fetchone()
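            # URL seen before: merge this keyword into its stored source_keywords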
            if kw_info:
                kw_list = kw_info[0].split(",")
                kw_list.append(info[9])
                kws = ",".join(list(set(kw_list)))
                cursor.execute("""update crawl_result set source_keywords=%s where target_url_md5=%s""", (kws, target_url_md5))
                conn.commit()
            else:
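                # New URL: insert a fresh row; the ON DUPLICATE KEY clause turns a
                # duplicate target_url_md5 into a no-op instead of an error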
                try:
                    cursor.execute("""insert into crawl_result(target_url,target_url_md5, addtime,data_title,
                                     data_imgs,data_content,data_showtime,data_json,source,source_keywords,
                                     state,author_name,author_imgs,author_id,author_json) 
                                     values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) 
                                     on duplicate key update target_url_md5=values(target_url_md5)""", info)
                    conn.commit()
                except Exception as e:
                    print(e)
    cursor.close()
    conn.close()
    return bool(insert_infos)
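
# A minimal crawl_result table compatible with the INSERT above might look like the
# sketch below; the column types and the unique key on target_url_md5 are assumptions
# (the original post does not show the schema), so adjust as needed:
#
#     CREATE TABLE crawl_result (
#         id              INT AUTO_INCREMENT PRIMARY KEY,
#         target_url      VARCHAR(512),
#         target_url_md5  CHAR(32) UNIQUE,
#         addtime         INT,
#         data_title      VARCHAR(512),
#         data_imgs       TEXT,
#         data_content    TEXT,
#         data_showtime   INT,
#         data_json       TEXT,
#         source          TINYINT,
#         source_keywords VARCHAR(255),
#         state           TINYINT,
#         author_name     VARCHAR(128),
#         author_imgs     VARCHAR(512),
#         author_id       VARCHAR(64),
#         author_json     TEXT
#     ) DEFAULT CHARSET=utf8;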


def get_kws():
    """Load the keyword list from the whool_opinion.keyword table, ordered by kid."""
    conn = MySQLdb.connect(**mysql_config)
    cursor = conn.cursor()
    cursor.execute("select keyword from whool_opinion.keyword order by kid asc")
    kw = cursor.fetchall()
    cursor.close()
    conn.close()
    return kw


if __name__ == "__main__":

    kw = get_kws()
    for k, in kw:
        # Walk through the result pages until a page yields nothing new
        for page in range(1, 100):
            if not dig_weibo(k, str(page)):
                break
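
# "multiprocessing" is imported at the top but never used in the original script; a
# sketch of how the serial keyword loop above could be parallelized with a small
# worker pool (an assumption, not part of the original code) might be:
#
#     def crawl_keyword(k):
#         for page in range(1, 100):
#             if not dig_weibo(k, str(page)):
#                 break
#
#     with multiprocessing.Pool(4) as pool:
#         pool.map(crawl_keyword, [k for k, in get_kws()])
#
# Each dig_weibo call opens its own MySQL connection, so worker processes do not
# share connection state.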
