百家號爬蟲(獲取各領域創作者appid)

本文爲爬蟲及數據分析學習文章,網頁解析方法較笨,僅作紀念。

百家號爬蟲(獲取各領域創作者appid)

由於百度的限制,每個領域最多只能獲取約 760 個 appid。

#!/usr/bin/env python3
# -*- coding: utf-8 -*-


from urllib.parse import quote
from urllib import request
from bs4 import BeautifulSoup
from urllib import error
from openpyxl import Workbook
import time

# Pool of browser User-Agent headers; each entry is a ready-to-use headers dict.
hds = [
    {'User-Agent': agent}
    for agent in (
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)',
    )
]


# Once every account has been listed, Baidu's search results start over from
# the beginning, so the first account name is fetched up front and used as the
# stop marker for the crawl.
def name_first(field):
    """Return the name of the first account listed for ``field``.

    Requests result page ``pn=0`` of the Baidu listing for the given category,
    prints the first account title found on it and returns that title.

    Args:
        field: Category keyword; it is URL-quoted into the query string.

    Returns:
        The whitespace-stripped text of the first title ``div`` on the page.

    Raises:
        ValueError: If the page contains no account title element.
        urllib.error.URLError: If the request itself fails (not caught here).
    """
    keyword = quote(field)
    # NOTE(review): the trailing '%20---------------------%20' looks like
    # copy-paste residue in the query string; kept unchanged -- confirm.
    url = (
        'https://www.baidu.com/sf?word=%E7%99%BE%E5%AE%B6%E5%8F%B7%2B'
        + keyword
        + '&pd=cambrian_list&atn=index&title=%E7%99%BE%E5%AE%B6%E5%8F%B7%2B'
        + keyword
        + '&lid=9080249029523443283&ms=1&frsrcid=206&frorder=1&pn=0'
        + '&data_type=json%20---------------------%20'
    )
    # Send the same header get_appid uses for pn=0 (hds[0 % len(hds)]), so the
    # marker is read from the same kind of response the crawl loop will see.
    req = request.Request(url, headers=hds[0])
    with request.urlopen(req) as resp:  # close the connection deterministically
        page = resp.read().decode('utf-8')

    title = BeautifulSoup(page, 'lxml').find(
        'div',
        class_='c-color-link c-font-big sfc-cambrian-list-subscribe-title c-line-clamp1',
    )
    if title is None:
        raise ValueError(
            'no account title found on the first result page for field %r' % (field,)
        )
    # get_text() also copes with titles that contain nested markup, where
    # ``.string`` would be None.
    name_1 = title.get_text(strip=True)
    print(name_1)
    return name_1
    
def appid_list_excel(appid_list,field):
    """Save the collected account rows to ``<field>.xlsx``.

    The sheet starts with a fixed header row, followed by one row per entry
    of ``appid_list`` built from that entry's first five items.

    Args:
        appid_list: Sequence of indexable records with at least five items.
        field: Category name; used as the workbook file name (plus ``.xlsx``).
    """
    workbook = Workbook()
    sheet = workbook.active
    sheet.append(['name', 'field', 'appid', 'smallfont', 'vip_info'])
    for record in appid_list:
        sheet.append([record[0], record[1], record[2], record[3], record[4]])
    workbook.save(field + '.xlsx')


# Collect the account entries of one category from the Baidu search listing.
def _tag_text(parent, css_class):
    """Return the stripped text of the first matching ``div``, or None if absent."""
    tag = parent.find('div', class_=css_class)
    if tag is None:
        return None
    return tag.get_text(strip=True)


def _appid_from_img(img_html):
    """Cut the appid out of an ``<img>`` tag string (between '_' and '.jpeg')."""
    start = img_html.find('_')
    end = img_html.find('.jpeg')
    if start == -1 or end == -1:
        # Without both markers the slice would be meaningless garbage.
        return '缺失'
    return img_html[start + 1:end]


def _vip_from_img(img_html):
    """Return the 5-character token starting at 'vip' in an ``<img>`` tag string."""
    start = img_html.find('vip')
    if start == -1:
        return '暫無'
    return img_html[start:start + 5]


def get_appid(field,name_1):
    """Crawl the listing for ``field`` and return one row per account.

    Pages are requested with ``pn=0, 10, 20, ...``. The crawl stops when an
    account named ``name_1`` shows up again on a later page (the listing has
    wrapped around) or once ``pn`` exceeds 10000.

    Args:
        field: Category keyword; it is URL-quoted into the query string.
        name_1: Name of the first account of the listing, used as stop marker.

    Returns:
        A list of ``[name, field, appid, smallfont, vip_info]`` lists.
        ``appid`` is '缺失' and ``vip_info`` is '暫無' when the value cannot
        be extracted from the entry's images.
    """
    title_class = 'c-color-link c-font-big sfc-cambrian-list-subscribe-title c-line-clamp1'
    keyword = quote(field)
    number = 0  # value of the ``pn`` query parameter; advanced by 10 per page
    appid_list = []
    # Last account name seen. None (not a string) so it can never collide
    # with a real account name before the first page has been read.
    name = None

    while number <= 10000 and name != name_1:
        url = (
            'https://www.baidu.com/sf?word=%E7%99%BE%E5%AE%B6%E5%8F%B7%2B'
            + keyword
            + '&pd=cambrian_list&atn=index&title=%E7%99%BE%E5%AE%B6%E5%8F%B7%2B'
            + keyword
            + '&lid=9080249029523443283&ms=1&frsrcid=206&frorder=1&pn='
            + str(number)
            + '&data_type=json%20---------------------%20'
        )

        # Reset per page: a failed request must not re-process the previous
        # page's entries (or hit an unbound name on the very first page).
        subscribes = []
        try:
            # Rotate through the User-Agent pool from page to page.
            req = request.Request(url, headers=hds[number % len(hds)])
            with request.urlopen(req) as resp:
                page = resp.read().decode('utf-8')
            subscribes = BeautifulSoup(page, 'lxml').find_all(
                'div', class_="sfc-cambrian-list-subscribe")
        except error.HTTPError as e:
            print("HTTPError")
            print(e.code)
        except error.URLError as e:
            print("URLError")
            print(e.reason)

        for subscribe in subscribes:
            entry_name = _tag_text(subscribe, title_class)
            if entry_name is None:
                # An entry without a title cannot be identified; skip it.
                continue
            name = entry_name
            if number >= 10 and name == name_1:
                # The first account has come round again: listing wrapped.
                break

            smallfont = _tag_text(subscribe, 'c-font-small c-gray c-line-clamp1')
            if smallfont is None:
                smallfont = ''

            # appid and vip marker are cut out of the image tags of the entry.
            img_info = subscribe.find_all('img')
            appid = _appid_from_img(str(img_info[0])) if img_info else '缺失'
            vip_info = _vip_from_img(str(img_info[1])) if len(img_info) > 1 else '暫無'

            appid_list.append([name, field, appid, smallfont, vip_info])

        number += 10
        print('%s==%d'% (field,number))
        time.sleep(1)  # pause between page requests

    return appid_list

if __name__ == '__main__':
    # Alternative category batches; swap one in for ``field_list`` to crawl it.
    #    field_list = ['娛樂','體育','財經']
    #    field_list = ['人文','科技','互聯網','數碼','社會']
    #    field_list = ['汽車','房產','旅遊','女人','情感','時尚','星座','美食','生活']
    #    field_list = ['育兒','影視','音樂','動漫','搞笑','教育','文化','寵物','遊戲','家居']
    #    field_list = ['悅讀','藝術','攝影','健康','養生','科學','三農','職場','綜合','百科','學術']
    field_list = ['其它']
    for field in field_list:
        # First fetch the stop marker, then crawl, then write ``<field>.xlsx``.
        appid_list_excel(get_appid(field, name_first(field)), field)
    print('ok')

     
    
    
    
    
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章