Toutiao News Crawling + Storm Stream Processing and Storage (1): The Crawler

Project Overview

The project is split into three parts:

  1. Crawling Toutiao news
  2. Running entity analysis on the crawled article bodies and visualizing the results
  3. Using the Storm framework to store the crawled news data in MySQL

This post covers the Toutiao crawling part; the overall architecture of the project is shown below.
[Figure: overall project architecture]
Since Kafka comes up below, see this post for background on that part: Setting up a stream-processing platform
For the entity-analysis part, see: Entity analysis
For the Storm stream-processing part, see: Storm stream processing
Project download: Toutiao crawling + entity analysis + Storm stream processing

Code Walkthrough

main.py
The entry point of the program. An infinite loop keeps the crawler running round after round, with a five-minute pause between rounds.

# -*- coding: utf-8 -*-

from get_page_index import get_page_index
import time

def main():
    get_page_index()

if __name__ == "__main__":
    a = 1
    while True:
        print('<----------------------Round ' + str(a) + ' started---------------------->\n')
        main()
        print('<----------------------Round ' + str(a) + ' finished---------------------->\n')
        print('<----------------------Sleeping---------------------->\n')
        time.sleep(300)  # wait five minutes before the next round
        a += 1

get_page_index.py
This function is the glue between the other modules: it fetches the list of articles to crawl and the proxy pool, then hands each article URL to get_page_detail with a short random pause between requests.

import time
import random
from paser_page_index import paser_page_index
from get_ip_index import get_ip_index
from get_page_detail import get_page_detail

def get_page_index():
    print('-----------------Fetching the article list-----------------')
    list = paser_page_index()
    print('-----------------Article list fetched-----------------\n')
    print('-----------------Fetching proxy IPs-----------------')
    ip_list = get_ip_index()

    print('-----------------Crawling pages-----------------')
    for i in list:
        url = 'https://www.toutiao.com/a' + str(i)
        a = get_page_detail(url, ip_list)
        time.sleep(random.randint(3, 5))  # random pause between articles to look less like a bot
        # if a == 0:
        #     print('This is an ad, or the page could not be parsed')
        # if a == 1:
        #     print('This article is a Q&A post')
        # if a == 2:
        #     print('This is an image-gallery article')
    print('-----------------Crawling finished-----------------\n')

paser_page_index.py
This module builds the list of articles to crawl: it queries Toutiao's PC feed API for each channel and collects the group_id of every returned entry.

import time
import requests
from bs4 import BeautifulSoup
import hashlib

# Compute Toutiao's 'as' and 'cp' signature parameters from the current timestamp
def get_as_cp_args():
    zz = {}
    now = round(time.time())
    e = hex(int(now)).upper()[2:]  # hex() converts the integer timestamp to a hexadecimal string
    i = hashlib.md5(str(int(now)).encode("utf8")).hexdigest().upper()  # uppercase MD5 hex digest of the timestamp
    if len(e) != 8:
        zz = {'as': "479BB4B7254C150",
              'cp': "7E0AC8874BB0985"}
        return zz
    n = i[:5]
    a = i[-5:]
    r = ""
    s = ""
    for i in range(5):
        s = s + n[i] + e[i]
    for j in range(5):
        r = r + e[j + 3] + a[j]
    zz = {
        'as': "A1" + s + e[-3:],
        'cp': e[0:3] + r + "E1"
    }
    return zz


def paser_page_index():
    url1 = [
           'https://www.toutiao.com/api/pc/feed/?category=news_hot',
           'https://www.toutiao.com/api/pc/feed/?category=news_tech',
           'https://www.toutiao.com/api/pc/feed/?category=news_entertainment',
           'https://www.toutiao.com/api/pc/feed/?category=news_game',
           'https://www.toutiao.com/api/pc/feed/?category=news_sports',
           'https://www.toutiao.com/api/pc/feed/?category=news_car',
           'https://www.toutiao.com/api/pc/feed/?category=news_finance',
           'https://www.toutiao.com/api/pc/feed/?category=funny',
           'https://www.toutiao.com/api/pc/feed/?category=news_military',
           'https://www.toutiao.com/api/pc/feed/?category=news_world',
           'https://www.toutiao.com/api/pc/feed/?category=news_fashion',
           'https://www.toutiao.com/api/pc/feed/?category=news_travel',
           'https://www.toutiao.com/api/pc/feed/?category=news_discovery',
           'https://www.toutiao.com/api/pc/feed/?category=news_baby',
           'https://www.toutiao.com/api/pc/feed/?category=news_regimen',
           'https://www.toutiao.com/api/pc/feed/?category=news_essay',
           'https://www.toutiao.com/api/pc/feed/?category=news_history',
           'https://www.toutiao.com/api/pc/feed/?category=news_food'
            ]
    list=[]
    for i in url1:
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
        cookie = 'tt_webid=6762050087801406989; tt_webid=6762050087801406989; csrftoken=be4be279678742cea85ca2bfc0b308c8; WEATHER_CITY=%E5%8C%97%E4%BA%AC; s_v_web_id=c05bab65d1f25e1c6b72817b6f34f92a; __tasessionId=lgnzbs4ah1578017571495'
        headers = {'user-agent': user_agent, 'cookie': cookie,'referer': i}
        as_cp=get_as_cp_args()
        url2 = '&utm_source=toutiao&widen=1&max_behot_time=0&max_behot_time_tmp=0&tadrequire=true&as='+as_cp['as']+'&cp='+as_cp['cp']
        respond = requests.get(i + url2, headers=headers)
        # print(respond.text)  # uncomment to inspect the raw feed response
        try:
            if respond.status_code == 200:
                dict1 = respond.json()
                for item in dict1['data']:
                    list.append(item['group_id'])
        except:
            pass
    return list
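
For reference, the feed endpoint answers with JSON whose data array holds one entry per article; the only field the crawler keeps is each entry's group_id, which later becomes part of the article URL in get_page_index.py. Roughly, the shape the code relies on looks like this (the id is made up and all other response fields are omitted):

# Illustrative response shape; only 'data' and each entry's 'group_id' are used above.
respond_json = {
    "data": [
        {"group_id": "1234567890123456789"},
    ],
}
group_ids = [item['group_id'] for item in respond_json.get('data', [])]
urls = ['https://www.toutiao.com/a' + str(gid) for gid in group_ids]  # as built later in get_page_index.py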

get_ip_index.py
This module scrapes a list of proxy IPs and returns the usable candidates. Since the crawler runs for long stretches, the local IP would get banned quickly, so requests are routed through proxies instead.

import requests
from bs4 import BeautifulSoup
import random

def get_ip_index():
    randomlist=['/nn/','/wn/','/wt/']
    url='https://www.xicidaili.com'+random.choice(randomlist)+str(random.randint(1,3))
    print('Proxy source page:', url)
    list=[]
    proxies={}
    start=random.randint(1,40)
    end=random.randint(50,90)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
    r=requests.get(url,headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    tag=soup.find_all('tr')
    for j in tag[start:end]:
        tag1=j.find_all('td')
        list.append(tag1[1].text+':'+tag1[2].text)
    # Optional: verify that each proxy actually works before returning the list
    # for i in list:
    #     try:
    #         ip="https://" + i
    #         # print(ip)
    #         proxies['https']=ip
    #         r=requests.get('https://www.baidu.com',headers=headers,proxies=proxies,timeout=(3,7))
    #     except:
    #         list.remove(i)
    print('-----------------Got ' + str(len(list)) + ' proxies-----------------\n')
    return list
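
A note on the commented-out check above: it calls list.remove(i) while iterating over the same list, which silently skips elements. If you do want to validate the proxies, a safer sketch (using the same Baidu probe and timeout as the commented code) is to build a new list instead:

import requests

def filter_alive(ip_list, timeout=(3, 7)):
    """Keep only the proxies that can complete a simple HTTPS request in time."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    alive = []
    for ip in ip_list:
        try:
            requests.get('https://www.baidu.com', headers=headers,
                         proxies={'https': 'https://' + ip}, timeout=timeout)
            alive.append(ip)  # responded in time, keep it
        except requests.RequestException:
            pass  # dead or too slow, drop it
    return alive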

get_page_detail.py
This is the main crawling function: it downloads an article page (switching to proxies when needed), extracts the metadata and body, pushes the result to Kafka, and triggers entity extraction.

import requests
from bs4 import BeautifulSoup
import re
from my_kafka import kafka_produce
from get_ip_index import get_ip_index
from get_article import get_article
from text_grapher import Entity_extraction

def get_page_detail(url,ip_list):
    proxies = {}
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    headers = {'user-agent': user_agent, 'x-requested-with': 'XMLHttpRequest'}
    print('Fetching page:', url)
    while True:
        if proxies:
            # Retry through the current proxy; rotate to the next proxy on any failure.
            try:
                r = requests.get(url, headers=headers, allow_redirects=False, proxies=proxies, timeout=(3, 7))
                if r.status_code == 200:
                    break
            except:
                pass
            proxies['https'] = 'https://' + ip_list[0]
            ip_list.pop(0)
            if ip_list == []:
                ip_list = get_ip_index()
        else:
            # First attempt goes out directly; switch to a proxy if it fails.
            try:
                r = requests.get(url, headers=headers, allow_redirects=False, timeout=(3, 7))
                if r.status_code == 200:
                    break
            except:
                pass
            proxies['https'] = 'https://' + ip_list[0]
            ip_list.pop(0)
            if ip_list == []:
                ip_list = get_ip_index()
    r.encoding = 'utf-8'
    article = {}
    article['url']=url
    soup = BeautifulSoup(r.text, 'html.parser')
    # print(soup.prettify())
    Str = soup.text
    try:
        type = re.findall(re.compile(r'chineseTag: \'(.*?)\'', re.S), Str)
    except:
        return 0
    if type == [] or type == ['']:
        return 0  # an ad, or a page we cannot parse
    if type == ['問答']:
        return 1  # Q&A post, skip it
    if type == ['圖片']:
        return 2  # image-gallery article, skip it
    # category (Toutiao's chineseTag field)
    article['type'] = type[0]
    # title (characters that are illegal in file names are stripped)
    title_result=re.findall(re.compile(r'title: \'(.*?)\'', re.S), Str)[0]
    title=re.sub(r'[\\/:*?"<>|]', '', title_result)
    article['title']=title
    # publish time, comment count, source, cover image, keywords
    article['time'] = re.findall(re.compile(r'time: \'(.*?)\'', re.S), Str)[0]
    article['comments_count']=re.findall(re.compile(r'comments_count: (.*?),', re.S), Str)[0]
    article['source']=re.findall(re.compile(r'source: \'(.*?)\'', re.S), Str)[0]
    article['coverImg']=re.findall(re.compile(r'coverImg: \'(.*?)\'', re.S), Str)[0]
    article['keywords']=re.findall(re.compile(r'{"name":\"(.*?)\"}', re.S), Str)
    keywords=''
    for i in re.findall(re.compile(r'{"name":\"(.*?)\"}', re.S), Str):
        keywords=keywords+i+'\t'
    # article body
    text=get_article(r)
    article['news']=text
    # push the article to Kafka
    kafka_produce(str([article]).replace("&quot;",""),url)
    # entity extraction
    Entity_extraction(text,title.replace("&quot;",""))
    # print(article)
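
Once all the regexes have run, the article dict that gets serialized and sent to Kafka looks roughly like this (the keys are exactly the ones assigned above; every value is a placeholder):

article = {
    'url': 'https://www.toutiao.com/a<group_id>',
    'type': '...',              # Toutiao's chineseTag, i.e. the channel name
    'title': '...',             # title with illegal file-name characters removed
    'time': '...',
    'comments_count': '...',
    'source': '...',
    'coverImg': 'http://...',
    'keywords': ['...', '...'],
    'news': '...',              # body text and image URLs joined line by line by get_article()
}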

get_article.py
This function extracts the article body from a Toutiao page. The body in the page source is wrapped in script tags, escape sequences and HTML markup that all have to be stripped, and the text and inline image URLs must stay in their original order, which makes the extraction a bit fiddly.

from bs4 import BeautifulSoup
import re

def get_article(response):
    string=''
    soup = BeautifulSoup(response.text , features="lxml")
    # .encode('utf-8').decode("unicode-escape")
    body = soup.find('body')
    script4 = body.find_all('script')

    # The articleInfo block inside the page's inline <script> holds the escaped article HTML
    rawMeterial = re.findall("articleInfo:([\s\S]*)tagInfo:", str(script4[3])[23:][:-10])[0]

    pipeiArticle = "content: '([\s\S]*)groupId:"
    Article = re.findall(pipeiArticle, rawMeterial)

    # print(Article)

    a = Article[0].strip()
    b = a.split(r'\u003Cp\u003E')  # split the escaped HTML on '<p>' boundaries
    for each in b:
        # Map the escape sequences back to tag characters; \u002F ('/') is turned into '\'
        # so that closing tags show up as '<\p>' for the check below.
        each2 = each.replace(r'\u003C','<').replace(r'p\u003E','p>').replace(r'\u002F','\\').replace(r'\u003E','>')
        if '<\p>' in each2:
            # print(each2.index('<\p>'))
            each3 = each2[:each2.index('<\p>')].strip()
        # print(re.sub(re.compile("<\\p>(.*?)"), "", each2))
            each4 = re.sub(re.compile("<(.*?)>"), "", each3)
            # print(re.sub(re.compile("<(.*?)>"), "", each3))
            string=string+each4+'\n'
            # print(each4)

        pipeiSource = "<img src([\s\S]*)\&quot; img_width"
        pipeiSource2 = "http:([\s\S]*)"
        source2 = re.findall(pipeiSource, each2)
        # print(each2)
        # print(source2)
        if source2 != []:
            # print(source2)
            source3 = source2[0].split('\&quot; img_width')
            # print(source3)
            for each in source3:
                source4 = re.findall(pipeiSource2, each)
                # print('http:' + source4[0])
                string = string + str('http:' + source4[0]).strip() + '\n'
            # print(source2[0][13:][:-1].strip())
        # print('\n')

        # pipeiSource = "<img src([\s\S]*)\&quot; img_width"
        # source2 = re.findall(pipeiSource, each2)
        # if source2 != []:
        #     string=string+source2[0][13:][:-1].strip()+'\n'
        #     # print(source2[0][13:][:-1].strip())
    return string.replace("&quot;","")
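
The .encode('utf-8').decode("unicode-escape") hint left in the comment above points at a shorter alternative: undo the \uXXXX escaping in one step and let BeautifulSoup strip the tags. The sketch below is only that idea, not what the project uses; it assumes the extracted content string is plain ASCII made of \uXXXX escapes and &quot; entities (the same assumption the replace chain above makes), and it returns the text and the image URLs separately rather than interleaved:

from bs4 import BeautifulSoup

def strip_escaped_html(raw_content):
    """Alternative sketch: decode the escaped HTML, then strip tags with BeautifulSoup."""
    html = raw_content.encode('utf-8').decode('unicode-escape')  # turns '\u003Cp\u003E' into '<p>'
    html = html.replace('&quot;', '"')                           # restore quoted attribute values
    soup = BeautifulSoup(html, 'html.parser')
    text = soup.get_text(separator='\n').strip()                   # tag-free body text
    images = [img.get('src', '') for img in soup.find_all('img')]  # image URLs in document order
    return text, images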

my_kafka.py
This module connects to Kafka and publishes each crawled article, so the Storm topology can consume the stream downstream.

# -*- coding: utf-8 -*-

from kafka import KafkaProducer
from kafka.errors import KafkaError

KAFAKA_HOST = "192.168.161.100"  # Kafka broker host
KAFAKA_PORT = 9092  # Kafka broker port
KAFAKA_TOPIC = "today_news"  # topic the crawler publishes to

class Kafka_producer():
    def __init__(self, kafkahost, kafkaport, kafkatopic):
        self.kafkaHost = kafkahost
        self.kafkaPort = kafkaport
        self.kafkatopic = kafkatopic
        self.producer = KafkaProducer(bootstrap_servers='{kafka_host}:{kafka_port}'.format(
            kafka_host=self.kafkaHost,
            kafka_port=self.kafkaPort)
        )

    def sendjsondata(self, params):
        try:
            parmas_message = params  # the payload is already a string, so no json.dumps is needed here
            producer = self.producer
            producer.send(self.kafkatopic, value=parmas_message.encode('utf-8'))
            producer.flush()
        except KafkaError as e:
            print(e)

def kafka_produce(params,url):
    # producer side
    producer = Kafka_producer(KAFAKA_HOST, KAFAKA_PORT, KAFAKA_TOPIC)
    print("======> producer:", url, '\n')
    producer.sendjsondata(params)
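
To check that the crawled articles actually arrive on the topic before the Storm side is wired up, a throwaway console consumer can help. This is just a debugging sketch that reuses the constants defined above; it is not part of the project:

from kafka import KafkaConsumer

from my_kafka import KAFAKA_HOST, KAFAKA_PORT, KAFAKA_TOPIC

consumer = KafkaConsumer(
    KAFAKA_TOPIC,
    bootstrap_servers='{0}:{1}'.format(KAFAKA_HOST, KAFAKA_PORT),
    auto_offset_reset='earliest',  # read the topic from the beginning
)
for message in consumer:
    print(message.topic, message.offset, message.value.decode('utf-8'))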

The code of the entity-analysis module is not covered here; see the entity analysis post linked above.

Summary

Crawling Toutiao is not particularly hard on the whole. The main wrinkle is that Toutiao recently upgraded its API: feed requests must now carry the as and cp parameters, and the code that computes them here was adapted from someone else's work. The end result works quite well. The next posts will cover the other two parts of the project, so stay tuned if you are interested.
