python爬取知乎精華

原創

2020-07-01 02:58



import json
import csv
import requests
import re
import time


def getchina(str1):
    """Return only the Chinese characters of ``str1``, in their original order.

    A character is kept when its code point lies in U+4E00..U+9FA5; every
    other character (Latin letters, digits, punctuation, whitespace, HTML
    markup) is dropped.
    """
    kept_chars = [ch for ch in str1 if '\u4e00' <= ch <= '\u9fa5']
    return ''.join(kept_chars)
	
def gettime(timeStamp):
    """Convert a Unix timestamp to a ``YYYY-MM-DD`` date string in local time.

    Only the date part is emitted; switch the format to
    ``"%Y-%m-%d %H:%M:%S"`` if the time of day is needed as well.
    """
    return time.strftime("%Y-%m-%d", time.localtime(timeStamp))
	
def writecsv(data, filename='ifo.csv'):
    """Append one CSV row per feed item to ``filename``.

    Each row holds: title, update date, author name, vote-up count,
    comment count, and the Chinese-only text of the content.

    Args:
        data: iterable of feed items. Every item must have a ``'target'``
            dict with ``author.name``, ``voteup_count``, ``comment_count``
            and ``content``. Targets that carry their own ``'title'`` are
            read via ``title`` / ``updated``; all others are read via
            ``question.title`` / ``updated_time``.
            NOTE(review): presumably these are articles and answers
            respectively -- confirm against the API response.
        filename: CSV file to append to. Defaults to ``'ifo.csv'``.

    Raises:
        KeyError: if an item lacks one of the fields listed above.
    """
    if not data:
        return

    # Open once for the whole batch, and force UTF-8 so the rows use the same
    # encoding as the header row regardless of the platform's default locale.
    with open(filename, 'a', newline='', encoding='utf-8') as f:
        csv_writer = csv.writer(f)
        for item in data:
            target = item['target']
            if 'title' in target:
                title = target['title']
                updated = target['updated']
            else:
                title = target['question']['title']
                updated = target['updated_time']
            csv_writer.writerow([
                title,
                gettime(updated),
                target['author']['name'],
                target['voteup_count'],
                target['comment_count'],
                getchina(target['content']),
            ])
	
# Write the CSV header row, but only when the file is new or empty, so that
# re-running the script does not insert extra header rows between data rows.
with open('ifo.csv', 'a', newline='', encoding='utf-8') as f:
    # In append mode the stream starts at end-of-file, so position 0 means
    # the file has no content yet.
    if f.tell() == 0:
        csv_writer = csv.writer(f)
        csv_writer.writerow(["title", "time", "name", "voteup", "comment", "content"])

# First page of the "essence" feed for topic 21238418 (10 items per page,
# starting at offset 0). Later pages are reached via the response's
# paging.next URL.
url = "http://www.zhihu.com/api/v4/topics/21238418/feeds/essence?include=data%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.is_normal%2Ccomment_count%2Cvoteup_count%2Ccontent%2Crelevant_info%2Cexcerpt.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Darticle%29%5D.target.content%2Cvoteup_count%2Ccomment_count%2Cvoting%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Dpeople%29%5D.target.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Canswer_type%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.paid_info%3Bdata%5B%3F%28target.type%3Darticle%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dquestion%29%5D.target.annotation_detail%2Ccomment_count%3B&limit=10&offset=0"
# Browser-like request headers. The standard HTTP header name is "Referer";
# the previous key "Refer" is not a recognised header, so the referrer was
# never actually sent.
headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
    "Referer":"https://www.zhihu.com/"
}
# Walk the paginated feed: fetch a page, append its items to the CSV, then
# follow paging.next until the feed reports no further page.
count = 0
while True:
    # A timeout keeps a stalled connection from hanging the script forever;
    # raise_for_status turns an HTTP error into a clear exception instead of
    # a confusing KeyError on the error body below.
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()
    content = resp.content.decode('utf-8')
    res = json.loads(content)
    data = res['data']

    count = count + 1
    print(count)  # progress indicator: number of pages processed so far

    # Write the page BEFORE testing the stop condition, so the final page is
    # saved too (previously the loop exited without writing it).
    writecsv(data)

    paging = res['paging']
    # Stop when the next link points back at the page just fetched.
    # NOTE(review): the API presumably also sets paging.is_end on the last
    # page; it is honoured when present and ignored when absent -- confirm.
    if paging.get('is_end') or paging['next'] == url:
        break
    url = paging['next']

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

python爬取知乎精華

2024年DataOps趨勢預測：AI不會取代數據工程師

雲原生週刊：K8s 中的服務和網絡｜ 2024.4.29

[轉帖]cpupower

今天，昨天，近七天，近30天，近90天，js封裝

華爲雲雲原生FinOps解決方案，釋放雲原生最大價值

echarts+bmap

安徽省飛機路線圖（echarts+map）

python爬取知乎精華

windows、linux(centos)中php分別讀中文文件名

centos7 apache中php無法寫入文件（權限問題）

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結