Scraping Toutiao keyword-search results with Python

Fetch the articles Toutiao (今日頭條) returns for a search keyword, using Python.
A worker pool (multiprocessing.dummy.Pool — despite the module name, a thread pool) processes the keywords in parallel.
The code is as follows:

# -*- coding: utf-8 -*-
import hashlib
import json
import random
import time

import MySQLdb
import requests

from utils.img_to_tencent import img_to_tencent  # project-specific helper (re-hosts images, judging by the name)


def md5(s):
    # Hex MD5 digest of a string; used to dedupe crawled URLs.
    return hashlib.md5(s.encode('utf-8')).hexdigest()


# Desktop User-Agent strings; one is picked at random per request.
PC_UAS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.73.11 (KHTML, like Gecko) Version/7.0.1 Safari/537.73.11',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:26.0) Gecko/20100101 Firefox/26.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.102 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.102 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:26.0) Gecko/20100101 Firefox/26.0',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:26.0) Gecko/20100101 Firefox/26.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:26.0) Gecko/20100101 Firefox/26.0',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36'
]


def crawl_baidu(word):
    # Despite the name, this crawls Toutiao's search API for one keyword.
    pc_headers = {
        'User-Agent': random.choice(PC_UAS),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'Cookie': 'tt_webid=6710713392061285902; WEATHER_CITY=%E5%8C%97%E4%BA%AC; '
                  'tt_webid=6710713392061285902; '
                  'UM_distinctid=16bc9db8a29f6-0417349b599406-516d3e71-13c680-16bc9db8a2d85; '
                  'csrftoken=5eb2a0e00bcbb888f417ef261ee5269a; '
                  'CNZZDATA1259612802=1761938442-1562456487-https%253A%252F%252Fwww.baidu.com%252F%7C1562461887; '
                  's_v_web_id=ddb620b1224506f21ba99de20d4169e3; '
                  '__tasessionId=ned4t635k1562467258609',
    }

    # Fetch the first 3 result pages (20 items per page); retry each page up to 3 times.
    for page_no in range(3):
        offset = page_no * 20
        news = None
        for _ in range(3):
            url = ('https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search'
                   '&offset=%s&format=json&keyword=%s&autoload=true&count=20&cur_tab=1'
                   % (offset, word))
            try:
                # Proxy addresses were redacted in the original post.
                proxies = {'https': 'http://*******8*****8', 'http': 'http://*******8**8****8'}
                resp = requests.get(url, headers=pc_headers, proxies=proxies, timeout=20)
                time.sleep(0.5)
                news = json.loads(resp.text)['data']
                resp.close()
                if not news:
                    continue  # empty page, retry
            except Exception as e:
                print(e)
                time.sleep(0.1)
                continue
            break  # page fetched successfully, stop retrying
        if news:
            for item in news:
                try:
                    data_title = item['title']
                    real_url = item['url']
                    data_showtime = item['display']['info']['time_factor']
                    author_name = item['display']['info']['site_name']
                    author_imgs = item['display']['info']['icon_img'] or ''
                    data_imgs = item['display']['info']['images']
                    if data_imgs:
                        data_imgs = data_imgs[0]
                        img_to_tencent(data_imgs)
                    else:
                        data_imgs = ''
                    data_content = item['display']['summary']['text']
                except Exception:
                    continue  # item is missing an expected field, skip it
                img_to_tencent(author_imgs)
                # Skip items whose title or summary is mojibake ('ä' is a telltale
                # sign of UTF-8 bytes decoded as Latin-1).
                if 'ä' in data_title or 'ä' in data_content:
                    continue
                print(real_url, data_title, data_imgs, data_content, data_showtime, author_name, author_imgs, word)
                # Database credentials were redacted in the original post.
                mysql_config = {"host": "*****888888",
                                "port": *****6,
                                "user": "root",
                                "passwd": "***88",
                                "db": "*********",
                                "charset": "utf8"}
                conn = MySQLdb.connect(**mysql_config)
                cursor = conn.cursor()
                target_url_md5 = md5(real_url)
                # Already crawled? Then just merge the new keyword into source_keywords.
                cursor.execute("select source_keywords from crawl_result where target_url_md5=%s",
                               (target_url_md5,))
                dat = cursor.fetchone()
                if dat:
                    source_keywords = dat[0]
                    if word not in source_keywords.strip().split(","):
                        source_keywords = ','.join(set(source_keywords.split(",") + [word]))
                        cursor.execute("update crawl_result set source_keywords=%s where target_url_md5=%s",
                                       (source_keywords, target_url_md5))
                        conn.commit()
                        print('ok1111')
                elif data_content:
                    cursor.execute(
                        "insert into crawl_result(target_url,target_url_md5,addtime,data_title,data_imgs,data_content,data_showtime,data_json,source,source_keywords,state,author_name,author_imgs,author_id,author_json) "
                        "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                        (real_url, target_url_md5, int(time.time()), data_title, data_imgs, data_content,
                         data_showtime, '', 6, word, 0, author_name, author_imgs, '', ''))
                    conn.commit()
                    print('ok')
                cursor.close()
                conn.close()


if __name__ == '__main__':
    # multiprocessing.dummy.Pool is a *thread* pool with the multiprocessing.Pool API.
    from multiprocessing.dummy import Pool
    kws_list = ['化妝水']  # example keyword list; replace with your own
    pool = Pool(20)
    pool.map(crawl_baidu, kws_list)
    pool.close()
    pool.join()
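The INSERT statement above implies the shape of the crawl_result table, but the post never shows its DDL. The sketch below reconstructs a plausible schema from the column list and the values bound to it; every type and length here is an assumption, not taken from the original project.

# Hypothetical DDL for crawl_result, inferred from the INSERT above.
# Column types/lengths are guesses -- adjust to your data.
import MySQLdb

DDL = """
CREATE TABLE IF NOT EXISTS crawl_result (
    id              INT AUTO_INCREMENT PRIMARY KEY,
    target_url      VARCHAR(1024) NOT NULL,
    target_url_md5  CHAR(32)      NOT NULL,  -- md5(real_url), used for dedup lookups
    addtime         INT           NOT NULL,  -- unix timestamp from int(time.time())
    data_title      VARCHAR(512),
    data_imgs       VARCHAR(1024),
    data_content    TEXT,
    data_showtime   VARCHAR(64),             -- 'time_factor' from the API, type guessed
    data_json       TEXT,
    source          TINYINT,                 -- the crawler passes 6 for Toutiao
    source_keywords VARCHAR(1024),
    state           TINYINT,
    author_name     VARCHAR(256),
    author_imgs     VARCHAR(1024),
    author_id       VARCHAR(64),
    author_json     TEXT,
    KEY idx_target_url_md5 (target_url_md5)  -- the SELECT/UPDATE both filter on this
) DEFAULT CHARSET = utf8;
"""

conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='***', db='test', charset='utf8')
conn.cursor().execute(DDL)
conn.commit()
conn.close()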

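The keyword-merge branch is the subtlest part of the database logic: it splits source_keywords on commas, adds the new word, and dedupes through a set. Note that a bare set re-joins in arbitrary order, so the stored column can reshuffle on every update. Here is a standalone version of that logic with a sorted join for stable output (the sorting is my addition, not in the original):

def merge_keyword(source_keywords, word):
    # Split the stored comma-separated keywords, drop empties, add the new word.
    kws = {k for k in source_keywords.strip().split(',') if k}
    kws.add(word)
    # sorted() keeps the stored column value deterministic across updates.
    return ','.join(sorted(kws))


print(merge_keyword('化妝水,面膜', '口紅'))  # -> 化妝水,口紅,面膜
print(merge_keyword('化妝水,面膜', '面膜'))  # already present -> 化妝水,面膜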

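The three-attempt retry loop inside crawl_baidu can also be factored out so the page loop stays readable. Below is a minimal sketch under the same assumptions (random desktop UA, optional proxies, 20 results per page); the helper name is mine, not from the original post:

import json
import time

import requests


def fetch_search_page(word, offset, headers, proxies=None, attempts=3):
    # Fetch one Toutiao search-API page and return its 'data' list,
    # or None after `attempts` failures (network error, bad JSON, empty page).
    url = ('https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search'
           '&offset=%s&format=json&keyword=%s&autoload=true&count=20&cur_tab=1'
           % (offset, word))
    for _ in range(attempts):
        try:
            resp = requests.get(url, headers=headers, proxies=proxies, timeout=20)
            news = json.loads(resp.text).get('data')
            if news:
                return news
        except Exception as e:
            print(e)
        time.sleep(0.5)
    return None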