項目簡介
本項目整體分爲三個部分來進行
- 今日頭條新聞爬取
- 將爬取下來的新聞正文部分進行實體分析,並將結果可視化
- 用storm框架將爬取的新聞數據存入mysql
本文主要介紹今日頭條新聞爬取的部分,下面給出整個項目的框架
由於下面可能涉及到kafka相關的東西,關於這部分內容可以參考這篇文章:流處理平臺的搭建
實體分析部分可以參考:實體分析
storm流處理部分可以參考:storm流處理
項目下載地址:今日頭條爬取+實體分析+storm流處理
代碼介紹
main.py
程序的啓動入口,這裏爲了能夠讓程序不斷的執行下去用了個死循環
# -*- coding: utf-8 -*-
"""Program entry point: run crawl rounds forever, sleeping between rounds.

Each round delegates to get_page_index() (fetch article list, fetch
proxies, crawl every article), then sleeps 300 s before starting over.
"""
from get_page_index import get_page_index
import time


def main():
    """Run a single crawl round."""
    get_page_index()


if __name__ == "__main__":
    round_no = 1  # 1-based round counter, used only in log output
    while True:
        print('<----------------------第' + str(round_no) + '輪啓動---------------------->\n')
        main()
        print('<----------------------第' + str(round_no) + '輪結束---------------------->\n')
        print('<----------------------進入休眠---------------------->\n')
        time.sleep(300)  # pause 5 minutes between rounds to avoid hammering the site
        round_no += 1
get_page_index.py
這個函數主要起到一個承上啓下的中轉的作用
import time
import random
from paser_page_index import paser_page_index
from get_ip_index import get_ip_index
from get_page_detail import get_page_detail
def get_page_index():
    """One crawl round: fetch article ids, fetch proxies, crawl each article.

    For every group id returned by paser_page_index(), builds the article
    URL and hands it to get_page_detail() together with the proxy pool,
    sleeping 3-5 s between articles to reduce the risk of an IP ban.
    """
    print('-----------------開始獲取網頁列表-----------------')
    group_ids = paser_page_index()  # renamed: don't shadow the builtin `list`
    print('-----------------獲取網頁列表結束-----------------\n')
    print('-----------------開始獲取代理IP-----------------')
    ip_list = get_ip_index()
    print('-----------------開始爬取網頁-----------------')
    for group_id in group_ids:
        url = 'https://www.toutiao.com/a' + group_id
        # get_page_detail's return code (0 = ad/unparseable, 1 = Q&A,
        # 2 = image gallery) is informational only and deliberately ignored.
        get_page_detail(url, ip_list)
        time.sleep(random.randint(3, 5))  # random polite delay between pages
    print('-----------------爬取網頁結束-----------------\n')
paser_page_index.py
這個函數的作用是獲取我們需要爬取的網頁的列表,以便於對列表內的網頁進行爬取
import time
import requests
from bs4 import BeautifulSoup
import hashlib
def get_as_cp_args(now=None):
    """Compute Toutiao's `as`/`cp` anti-crawler request parameters.

    Re-implements the site's JS signing routine: the hex digits of the
    unix timestamp are interleaved with digits of the timestamp's MD5.

    :param now: unix timestamp (seconds) to sign; defaults to the current
        time.  Exposed mainly so the routine can be tested deterministically.
    :return: dict with keys 'as' and 'cp'.
    """
    if now is None:
        now = round(time.time())
    e = hex(int(now)).upper()[2:]  # timestamp as upper-case hex, '0x' stripped
    i = hashlib.md5(str(int(now)).encode("utf8")).hexdigest().upper()
    if len(e) != 8:
        # Timestamp hex is not 8 digits wide -> fall back to the known-good
        # constants used by the reference implementation.
        return {'as': "479BB4B7254C150",
                'cp': "7E0AC8874BB0985"}
    n = i[:5]   # first 5 md5 digits
    a = i[-5:]  # last 5 md5 digits
    r = ""
    s = ""
    # NOTE: the original used the loop variable `i`, shadowing the md5
    # string above; `idx` avoids that trap (behavior unchanged).
    for idx in range(5):
        s = s + n[idx] + e[idx]      # interleave md5 prefix with hex prefix
        r = r + e[idx + 3] + a[idx]  # interleave hex tail with md5 suffix
    return {
        'as': "A1" + s + e[-3:],
        'cp': e[0:3] + r + "E1"
    }
def paser_page_index():
    """Collect article group ids from Toutiao's PC feed API.

    Queries one feed endpoint per news category, signing each request with
    the `as`/`cp` parameters from get_as_cp_args(), and accumulates every
    `group_id` found in the JSON responses.  A category whose request
    fails or returns malformed JSON is skipped so one bad endpoint cannot
    abort the whole round.

    :return: list of group-id values (one article per id).
    """
    feed_urls = [
        'https://www.toutiao.com/api/pc/feed/?category=news_hot',
        'https://www.toutiao.com/api/pc/feed/?category=news_tech',
        'https://www.toutiao.com/api/pc/feed/?category=news_entertainment',
        'https://www.toutiao.com/api/pc/feed/?category=news_game',
        'https://www.toutiao.com/api/pc/feed/?category=news_sports',
        'https://www.toutiao.com/api/pc/feed/?category=news_car',
        'https://www.toutiao.com/api/pc/feed/?category=news_finance',
        'https://www.toutiao.com/api/pc/feed/?category=funny',
        'https://www.toutiao.com/api/pc/feed/?category=news_military',
        'https://www.toutiao.com/api/pc/feed/?category=news_world',
        'https://www.toutiao.com/api/pc/feed/?category=news_fashion',
        'https://www.toutiao.com/api/pc/feed/?category=news_travel',
        'https://www.toutiao.com/api/pc/feed/?category=news_discovery',
        'https://www.toutiao.com/api/pc/feed/?category=news_baby',
        'https://www.toutiao.com/api/pc/feed/?category=news_regimen',
        'https://www.toutiao.com/api/pc/feed/?category=news_essay',
        'https://www.toutiao.com/api/pc/feed/?category=news_history',
        'https://www.toutiao.com/api/pc/feed/?category=news_food'
    ]
    # Loop-invariant header material, hoisted out of the request loop.
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    cookie = 'tt_webid=6762050087801406989; tt_webid=6762050087801406989; csrftoken=be4be279678742cea85ca2bfc0b308c8; WEATHER_CITY=%E5%8C%97%E4%BA%AC; s_v_web_id=c05bab65d1f25e1c6b72817b6f34f92a; __tasessionId=lgnzbs4ah1578017571495'
    group_ids = []  # renamed: don't shadow the builtin `list`
    for feed_url in feed_urls:
        headers = {'user-agent': user_agent, 'cookie': cookie, 'referer': feed_url}
        as_cp = get_as_cp_args()  # fresh signature per request
        query = '&utm_source=toutiao&widen=1&max_behot_time=0&max_behot_time_tmp=0&tadrequire=true&as=' + as_cp['as'] + '&cp=' + as_cp['cp']
        try:
            respond = requests.get(feed_url + query, headers=headers, timeout=(3, 7))
            if respond.status_code == 200:
                for item in respond.json()['data']:
                    group_ids.append(item['group_id'])
        except Exception:
            # Best-effort: skip this category and continue with the next.
            continue
    return group_ids
get_ip_index.py
這個程序是用來進行代理IP的爬取的,返回的結果是可用的代理IP,主要是由於我們需要長時間的對新聞進行爬取,一旦爬取時間過長,很容易IP就被封了,所以需要用到代理IP
import requests
from bs4 import BeautifulSoup
import random
def get_ip_index():
    """Scrape one random page of xicidaili.com for proxy addresses.

    Picks one of the site's proxy sections ('/nn/', '/wn/', '/wt/' —
    presumably high-anonymous / https / http lists; verify against the
    site) and a random page number, then extracts a random slice of the
    "ip:port" rows from the listing table so successive calls harvest
    different proxies.

    :return: list of "ip:port" strings (not validated for liveness).
    """
    sections = ['/nn/', '/wn/', '/wt/']
    url = 'https://www.xicidaili.com' + random.choice(sections) + str(random.randint(1, 3))
    print('代理IP來源網址:', url)
    proxy_list = []  # renamed: don't shadow the builtin `list`
    # Random slice bounds -> a different window of table rows each call.
    start = random.randint(1, 40)
    end = random.randint(50, 90)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
    r = requests.get(url, headers=headers, timeout=(3, 7))
    soup = BeautifulSoup(r.text, 'html.parser')
    rows = soup.find_all('tr')
    for row in rows[start:end]:
        cells = row.find_all('td')
        # Column 1 holds the IP, column 2 the port (per the site's table layout).
        proxy_list.append(cells[1].text + ':' + cells[2].text)
    print('-----------------成功獲得代理' + str(len(proxy_list)) + '個-----------------\n')
    return proxy_list
get_page_detail.py
這個函數是爬取今日頭條的主體函數
import requests
from bs4 import BeautifulSoup
import re
from my_kafka import kafka_produce
from get_ip_index import get_ip_index
from get_article import get_article
from text_grapher import Entity_extraction
def get_page_detail(url, ip_list):
    """Fetch one Toutiao article page, extract its metadata, and publish it.

    Retries until the page is fetched with HTTP 200: the first attempt is
    direct, and every failure (exception, redirect, or non-200) rotates to
    the next proxy from `ip_list` (mutated in place; refilled via
    get_ip_index() when exhausted).  Article metadata is scraped from the
    inline JS on the page with regexes, sent to Kafka, and run through
    entity extraction.

    :param url: article URL (https://www.toutiao.com/a<group_id>)
    :param ip_list: pool of "ip:port" proxy strings; consumed on failures
    :return: 0 for an ad / unparseable page, 1 for a Q&A post, 2 for an
        image-gallery post, None after a normal article is processed.
    """
    proxies = {}
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    headers = {'user-agent': user_agent, 'x-requested-with': 'XMLHttpRequest'}
    print('當前獲取網頁:', url)
    while True:
        try:
            if proxies:
                r = requests.get(url, headers=headers, allow_redirects=False,
                                 proxies=proxies, timeout=(3, 7))
            else:
                r = requests.get(url, headers=headers, allow_redirects=False,
                                 timeout=(3, 7))
            if r.status_code == 200:
                break
        except requests.RequestException:
            # Fall through to proxy rotation below.  (The original crashed on
            # a direct-request exception and looped forever on a non-200
            # proxied response because it never rotated in those cases.)
            pass
        if not ip_list:
            ip_list = get_ip_index()  # replenish the pool before popping
        proxies['https'] = 'https://' + ip_list.pop(0)
    r.encoding = 'utf-8'
    article = {'url': url}
    soup = BeautifulSoup(r.text, 'html.parser')
    page_text = soup.text
    # The page embeds article metadata in inline JS; pull fields via regex.
    chinese_tag = re.findall(re.compile(r'chineseTag: \'(.*?)\'', re.S), page_text)
    if chinese_tag == [] or chinese_tag == ['']:
        return 0  # advertisement, or a page layout we cannot parse
    if chinese_tag == ['問答']:
        return 1  # Q&A post: has no regular article body
    if chinese_tag == ['圖片']:
        return 2  # image-gallery post
    # Category
    article['type'] = chinese_tag[0]
    # Title, stripped of characters that are unsafe in file names
    title_raw = re.findall(re.compile(r'title: \'(.*?)\'', re.S), page_text)[0]
    title = re.sub(r'[\\/:*?"<>|]', '', title_raw)
    article['title'] = title
    # Publish time, comment count, source, cover image, keywords
    article['time'] = re.findall(re.compile(r'time: \'(.*?)\'', re.S), page_text)[0]
    article['comments_count'] = re.findall(re.compile(r'comments_count: (.*?),', re.S), page_text)[0]
    article['source'] = re.findall(re.compile(r'source: \'(.*?)\'', re.S), page_text)[0]
    article['coverImg'] = re.findall(re.compile(r'coverImg: \'(.*?)\'', re.S), page_text)[0]
    article['keywords'] = re.findall(re.compile(r'{"name":\"(.*?)\"}', re.S), page_text)
    # Article body
    text = get_article(r)
    article['news'] = text
    # Kafka publish: strip double quotes so the serialized dict survives
    # the downstream (Storm) parsing.
    kafka_produce(str([article]).replace('"', ''), url)
    # Entity extraction / visualization
    Entity_extraction(text, title.replace('"', ''))
get_article.py
這個函數是爬取今日頭條新聞正文部分的函數,主要由於今日頭條的正文部分摻雜了很多網頁的標籤需要去掉,以及我們想要做到整個正文部分的文字連同圖片的順序不能亂,實現起來會有點複雜
from bs4 import BeautifulSoup
import re
def get_article(response):
    """Extract the article body (text plus inline image URLs) from a page.

    The body lives in an inline `articleInfo:` JS blob whose HTML is
    unicode-escaped (`\\u003C` for '<', etc.).  Each `<p>` paragraph is
    unescaped, stripped of tags, and appended to the result; image URLs
    found in a paragraph are appended on their own lines, so the original
    text/image order is preserved.

    :param response: `requests` response for the article page
    :return: plain-text body, one paragraph or image URL per line, with
        double quotes removed.
    """
    result = ''
    soup = BeautifulSoup(response.text, features="lxml")
    scripts = soup.find('body').find_all('script')
    # articleInfo sits in the 4th <script>; the slicing trims the JS
    # wrapper around the blob before the regexes run.
    raw_material = re.findall(r"articleInfo:([\s\S]*)tagInfo:", str(scripts[3])[23:][:-10])[0]
    content = re.findall(r"content: '([\s\S]*)groupId:", raw_material)[0].strip()
    tag_re = re.compile("<(.*?)>")                   # any remaining HTML tag
    img_re = "<img src([\s\S]*)\" img_width"         # image src attribute run
    url_re = "http:([\s\S]*)"                        # URL tail after 'http:'
    for paragraph in content.split(r'\u003Cp\u003E'):
        # Undo the unicode escaping.  '\u002F' (the '/') is mapped to a
        # backslash, so a closing tag comes out as the literal '<\p>'.
        unescaped = (paragraph.replace(r'\u003C', '<')
                              .replace(r'p\u003E', 'p>')
                              .replace(r'\u002F', '\\')
                              .replace(r'\u003E', '>'))
        if '<\p>' in unescaped:
            text_part = unescaped[:unescaped.index('<\p>')].strip()
            result = result + re.sub(tag_re, "", text_part) + '\n'
        img_match = re.findall(img_re, unescaped)
        if img_match != []:
            # NOTE: the original inner loop reused the outer loop variable
            # (`each`); `piece` avoids the shadowing (behavior unchanged).
            for piece in img_match[0].split('\" img_width'):
                src_tail = re.findall(url_re, piece)
                result = result + str('http:' + src_tail[0]).strip() + '\n'
    return result.replace('"', '')
my_kafka.py
這個程序是用來連接kafka的,用來將我們爬取的新聞發送到storm內
# -*- coding: utf-8 -*-
from kafka import KafkaProducer
from kafka.errors import KafkaError
KAFAKA_HOST="192.168.161.100"  # Kafka broker host (the Storm-side machine)
KAFAKA_PORT = 9092  # broker port
KAFAKA_TOPIC = "today_news"  # topic the crawled articles are published to
class Kafka_producer():
    """Thin wrapper around kafka-python's KafkaProducer bound to one topic."""

    def __init__(self, kafkahost, kafkaport, kafkatopic):
        self.kafkaHost = kafkahost
        self.kafkaPort = kafkaport
        self.kafkatopic = kafkatopic
        self.producer = KafkaProducer(bootstrap_servers='{kafka_host}:{kafka_port}'.format(
            kafka_host=self.kafkaHost,
            kafka_port=self.kafkaPort)
        )

    def sendjsondata(self, params):
        """Send an already-serialized string to the topic, UTF-8 encoded.

        Kafka errors are caught and printed rather than raised, so one bad
        message cannot kill the crawl loop.

        :param params: message payload (caller serializes; no json.dumps here)
        """
        try:
            self.producer.send(self.kafkatopic, value=params.encode('utf-8'))
            self.producer.flush()  # block until the broker has the message
        except KafkaError as e:
            print(e)
def kafka_produce(params, url):
    """Publish one serialized article to the configured Kafka topic.

    NOTE(review): a new producer (and broker connection) is created per
    message; fine at this crawl rate, but reuse a single producer if
    throughput ever matters.

    :param params: serialized article payload
    :param url: source article URL, used only for log output
    """
    producer = Kafka_producer(KAFAKA_HOST, KAFAKA_PORT, KAFAKA_TOPIC)
    print("======> producer:", url, '\n')
    producer.sendjsondata(params)
實體分析模塊的代碼這裏不做具體介紹
總結
整個今日頭條的爬取其實整體不算困難,主要是今日頭條近幾天將自己的api又升級了一下,訪問api的時候必須要加上as和cp兩個參數,這部分解密的代碼是參考的別人的代碼。整體實現下來效果還是不錯的,接下來我還會介紹這個項目的另外兩塊內容,感興趣的可以繼續關注一下。