思路
- 在聚合數據申請賬號(https://www.juhe.cn/)
- 在聚合數據申請「微信精選文章」API,取得調用接口所需的 key
- 通過 newspaper 庫提取相應的文本內容,關於 newspaper 庫的使用方法可以參考官方文檔(https://newspaper.readthedocs.io/)
代碼
# -*- coding: utf-8 -*-
# !/usr/bin/env python
# Time: 2019/5/9 18:57
# Author: sty
# File: get_data.py
import json, urllib
from urllib.parse import urlencode
import requests
import json
import re
from newspaper import Article
# Characters stripped by remove_punctuation: whitespace, ASCII punctuation and
# fullwidth (CJK) punctuation. A raw string is used so that the regex escapes
# (\s, \., \!, \/) are not interpreted as invalid Python string escapes.
_PUNCTUATION_RE = re.compile(
    r"[\s+\.\!\/<>“”,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+"
)


def remove_punctuation(strs):
    """Remove whitespace and punctuation from a string.

    Every run of characters matched by ``_PUNCTUATION_RE`` (whitespace plus
    the ASCII and fullwidth punctuation listed in the pattern) is deleted.
    Characters not listed in the pattern, such as an ASCII ``)``, are kept.

    :param strs: text to clean
    :return: ``strs`` with all matched characters removed
    """
    return _PUNCTUATION_RE.sub("", strs.strip())
# Characters stripped by remove_unusual_upunctuation. A raw string is used so
# that the regex escapes (\s, \., \!, \/) are not interpreted as invalid
# Python string escapes.
_UNUSUAL_PUNCTUATION_RE = re.compile(
    r"[\s+\.\!\/<>“”,$%^*(+\"\']+|[+——、~@#¥%……&*()]+"
)


def remove_unusual_upunctuation(strs):
    """Remove whitespace and uncommon punctuation, keeping sentence marks.

    Deletes every run of characters matched by ``_UNUSUAL_PUNCTUATION_RE``.
    The second character class does not list the fullwidth ``!``, ``。`` and
    ``?``, so those marks stay in the output.

    NOTE(review): the fullwidth comma is still listed in the first character
    class and is therefore removed; confirm whether that is intended.

    :param strs: text to clean
    :return: ``strs`` with all matched characters removed
    """
    return _UNUSUAL_PUNCTUATION_RE.sub("", strs.strip())
def request(key="_______", *, pno=1, ps=50, timeout=10):
    """Fetch the featured-article list and print each article's cleaned text.

    Sends a GET request to the Juhe ``weixin/query`` endpoint, then for every
    entry in ``result["list"]`` downloads and parses the article behind
    ``entry["url"]`` with newspaper, and prints the entry's title followed by
    the article text after ``remove_unusual_upunctuation``.

    :param key: API key generated when applying for the API on juhe.cn; the
        default is only a placeholder and must be replaced with a real key
    :param pno: value sent as the ``pno`` query parameter
        (presumably the page number -- confirm against the API docs)
    :param ps: value sent as the ``ps`` query parameter
        (presumably the page size -- confirm against the API docs)
    :param timeout: seconds to wait for the list request before giving up
    :raises requests.HTTPError: if the list request returns an error status
    :raises RuntimeError: if the response JSON has no ``result["list"]``
    """
    import sys

    # Imported locally so this function stays self-contained; newspaper is
    # already a dependency of this file.
    from newspaper.article import ArticleException

    api_url = "http://v.juhe.cn/weixin/query"
    payload = {
        "pno": pno,
        "ps": ps,
        "dtype": "json",
        "key": key,
    }
    # Without a timeout, requests may wait indefinitely on a stalled server.
    response = requests.get(api_url, params=payload, timeout=timeout)
    response.raise_for_status()
    res = json.loads(response.text)

    # Fail with a readable message instead of an opaque TypeError/KeyError
    # when the service answers without an article list.
    result = res.get("result") if isinstance(res, dict) else None
    if not isinstance(result, dict) or "list" not in result:
        raise RuntimeError("unexpected API response: {!r}".format(res))

    for detail in result["list"]:
        article = Article(detail["url"], language='zh')  # Chinese
        try:
            article.download()
            article.parse()
        except ArticleException as exc:
            # One unreachable article should not abort the remaining ones.
            print("Skip article", detail["url"], ":", exc, file=sys.stderr)
            continue
        text_res = article.text.strip().replace("\n\n", "")
        print("Title is :", detail["title"])
        print(remove_unusual_upunctuation(text_res))
if __name__ == '__main__':
    # Run the crawl only when executed as a script, not when imported.
    request()