# 本文爲爬蟲及數據分析學習文章,網頁解析方法較笨,僅作紀念。
# 百家號爬蟲(獲取各領域創作者appid)
# 由於百度的限制,每個領域最多能獲取760個id
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from urllib.parse import quote
from urllib import request
from bs4 import BeautifulSoup
from urllib import error
from openpyxl import Workbook
import time
#Some User Agents
# Rotating pool of desktop-browser User-Agent headers; get_appid cycles
# through these per request to look less like a bot.
hds = [
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
]
#當遍歷賬號後,百度搜索結果會重新開始;所以要獲取第一個name,作爲停止的判斷標準
def name_first(field):
    """Fetch the name of the FIRST creator listed for *field*.

    Baidu's paginated search wraps around after the last account, so this
    first name is used by get_appid() as the sentinel that tells it to stop.

    :param field: category keyword (e.g. '其它'), URL-quoted into the query
    :return: the first creator's display name, stripped of whitespace
    """
    url = 'https://www.baidu.com/sf?word=%E7%99%BE%E5%AE%B6%E5%8F%B7%2B'\
    +quote(field)+'&pd=cambrian_list&atn=index&title=%E7%99%BE%E5%AE%B6%E5%8F%B7%2B'\
    +quote(field)+'&lid=9080249029523443283&ms=1&frsrcid=206&frorder=1&pn=0&data_type=json%20---------------------%20'
    # Send a browser User-Agent, consistent with get_appid(); the original
    # bare urlopen() exposed Python's default UA, which Baidu may block.
    req = request.Request(url, headers=hds[0])
    response_1 = str(request.urlopen(req).read().decode('utf-8'))
    soup_1 = BeautifulSoup(response_1, 'lxml')
    # NOTE(review): find() returns None when the page layout changes or the
    # request is blocked, which would raise AttributeError here — TODO confirm
    # whether that should be handled more gracefully.
    name_1 = soup_1.find('div', class_=\
    'c-color-link c-font-big sfc-cambrian-list-subscribe-title c-line-clamp1').string.strip()
    print(name_1)
    return name_1
def appid_list_excel(appid_list, field):
    """Save the scraped account rows to '<field>.xlsx'.

    :param appid_list: list of 5-item rows
                       [name, field, appid, smallfont, vip_info]
    :param field: category name, also used as the workbook file name
    """
    wb = Workbook()
    ws = wb.active
    ws.append(['name', 'field', 'appid', 'smallfont', 'vip_info'])
    # Iterate rows directly instead of indexing with range(len(...));
    # row[:5] keeps the exact five columns the header declares.
    for row in appid_list:
        ws.append(row[:5])
    wb.save(field + '.xlsx')
#從百度搜索獲取各領域百家號賬號信息
def get_appid(field, name_1):
    """Scrape Baijiahao account info for one *field* from Baidu search.

    Pages through the XHR endpoint 10 accounts at a time (pn=number) until
    either 10000 results have been paged or the first account's name
    (*name_1*, from name_first()) reappears — Baidu wraps around after the
    last result, so seeing it again means the listing is exhausted.

    :param field: category keyword, URL-quoted into the query
    :param name_1: sentinel name returned by name_first(field)
    :return: list of rows [name, field, appid, smallfont, vip_info]
    """
    number = 0          # pn= offset in the URL; each page returns 10 accounts
    appid_list = []
    name = 'name'       # placeholder so the first loop test can't match name_1
    while number <= 10000 and name != name_1:
        url = 'https://www.baidu.com/sf?word=%E7%99%BE%E5%AE%B6%E5%8F%B7%2B'\
        +quote(field)+'&pd=cambrian_list&atn=index&title=%E7%99%BE%E5%AE%B6%E5%8F%B7%2B'\
        +quote(field)+'&lid=9080249029523443283&ms=1&frsrcid=206&frorder=1&pn='\
        +str(number)+'&data_type=json%20---------------------%20'
        # BUG FIX: initialise before the try — in the original, a failed
        # request left `subsrcibes` undefined (NameError on the first page)
        # or stale from the previous page, silently re-processing it.
        subsrcibes = []
        try:
            req = request.Request(url, headers=hds[number % len(hds)])
            response = str(request.urlopen(req).read().decode('utf-8'))
            soup = BeautifulSoup(response, 'lxml')
            subsrcibes = soup.find_all('div', class_="sfc-cambrian-list-subscribe")
        except error.HTTPError as e:
            print("HTTPError")
            print(e.code)
        except error.URLError as e:
            print("URLError")
            print(e.reason)
        for subsrcibe in subsrcibes:
            smallfont = subsrcibe.find('div', class_='c-font-small c-gray c-line-clamp1').string.strip()
            name = subsrcibe.find('div', class_=\
            'c-color-link c-font-big sfc-cambrian-list-subscribe-title c-line-clamp1').string.strip()
            img_info = subsrcibe.find_all('img')  # appid/vip are embedded in img URLs
            # The appid is the segment between '_' and '.jpeg' in the avatar URL.
            try:
                appid_info = str(img_info[0])
                appid = appid_info[appid_info.find('_') + 1:appid_info.find('.jpeg')]
            except IndexError:      # no avatar img — was a bare except:
                appid = '缺失'
            # The second img (if any) carries a 'vipNN'-style badge marker.
            try:
                vip_src = str(img_info[1])
                vip_pos = vip_src.find('vip')
                vip_info = vip_src[vip_pos:vip_pos + 5]
            except IndexError:      # no badge img — was a bare except:
                vip_info = '暫無'
            # Wrap-around detected mid-page: stop before appending duplicates.
            if number >= 10 and name == name_1:
                break
            appid_list.append([name, field, appid, smallfont, vip_info])
        number += 10
        print('%s==%d' % (field, number))
        time.sleep(1)   # throttle to avoid being rate-limited
    return appid_list
if __name__ == '__main__':
    # Category batches used in the original runs — enable one list at a time:
    # ['娛樂', '體育', '財經']
    # ['人文', '科技', '互聯網', '數碼', '社會']
    # ['汽車', '房產', '旅遊', '女人', '情感', '時尚', '星座', '美食', '生活']
    # ['育兒', '影視', '音樂', '動漫', '搞笑', '教育', '文化', '寵物', '遊戲', '家居']
    # ['悅讀', '藝術', '攝影', '健康', '養生', '科學', '三農', '職場', '綜合', '百科', '學術']
    field_list = ['其它']
    for field in field_list:
        # Grab the first creator's name as the wrap-around sentinel,
        # scrape the whole category, then dump it to <field>.xlsx.
        stop_name = name_first(field)
        rows = get_appid(field, stop_name)
        appid_list_excel(rows, field)
        print('ok')