# 導入相關模塊
from time import sleep

import numpy as np
import requests
from bs4 import BeautifulSoup
# 設置一個請求頭列表
# Pool of User-Agent headers, rotated across requests so the crawler looks
# less like a single automated client.
headers = [
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"},
    {"User-Agent": "Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50"},
    {"User-Agent": "Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0"},
]
# 通過book_spider函數傳入的書籍的關鍵詞 獲取書名 作者 出版信息 評論人數 將內容加入至book_list列表
def book_spider(book_tag):
    """Crawl Douban's tag listing for ``book_tag`` and collect book rows.

    Walks the paginated listing (15 books per page), extracting for each
    book its title, rating, number of raters, author/translator info and
    publication info.

    Args:
        book_tag: keyword string inserted into the Douban tag URL.

    Returns:
        List of ``[title, rate_num, people_num, author_info, pub_info]``
        rows (all strings).
    """
    page_num = 0
    book_list = []
    try_times = 0  # consecutive fetches of the current page with no listing
    while True:
        # e.g. https://www.douban.com/tag/%E5%B0%8F%E8%AF%B4/book?start=0
        url = ('https://www.douban.com/tag/' + book_tag
               + '/book?start=' + str(page_num * 15))
        # Random delay in [0, 5) seconds (np.random.rand() is uniform on
        # [0, 1)) to avoid hammering the server.
        sleep(np.random.rand() * 5)
        response = requests.get(url, headers=headers[page_num % len(headers)])
        data = response.text
        soup = BeautifulSoup(data, 'lxml')
        # The book listing lives in <div class="mod book-list">.
        list_soup = soup.find('div', class_='mod book-list')
        try_times += 1
        if list_soup is None and try_times < 200:
            # Listing missing (likely rate-limited or transient failure):
            # retry the same page.
            continue
        elif list_soup is None or len(list_soup) <= 1:
            # Gave up retrying, or reached an empty page — stop crawling.
            break
        for book_info in list_soup.find_all('dd'):
            title = book_info.find('a', class_='title').string
            desc = book_info.find('div', class_='desc').string.strip()
            desc_list = desc.split('/')
            book_url = book_info.find('a', class_='title').get('href').strip()
            # desc looks like "author[/translator]/.../publisher/date/price";
            # the last three '/'-separated fields are publication info.
            try:
                author_info = '作者/譯者:' + '/'.join(desc_list[0:-3])
            except Exception:
                author_info = '作者/譯者: 暫無'
            try:
                pub_info = '出版信息:' + '/'.join(desc_list[-3:])
            except Exception:
                pub_info = '出版信息: 暫無'
            try:
                rate_num = book_info.find('span', class_='rating_nums').string.strip()
            except Exception:
                # No rating element on the page — default to zero.
                rate_num = '0.0'
            try:
                people_num = get_people_num(book_url)
            except Exception:
                # Detail page failed or had no rater count — default to zero.
                people_num = '0'
            book_list.append([title, rate_num, people_num, author_info, pub_info])
        try_times = 0  # page parsed successfully: reset the retry counter
        # BUG FIX: advance to the next page. This increment was commented
        # out in the original, so the loop refetched page 0 forever.
        page_num += 1
    return book_list
# 通過書的url 進去詳細頁面獲取評論人數
def get_people_num(url):
    """Fetch a book's detail page and return its rater-count string.

    A random User-Agent from ``headers`` is used for the request.
    """
    chosen_header = headers[np.random.randint(0, len(headers))]
    resp = requests.get(url, headers=chosen_header)
    page = BeautifulSoup(resp.text, 'lxml')
    count_span = page.find('a', class_='rating_people').find('span')
    return count_span.string.strip()
# 獲取列表中的每一個書名的關鍵詞 進而調用 book_spider
def do_spider(book_tag_list):
    """Crawl every tag in ``book_tag_list`` and return the collected results.

    Each tag's book list is printed (as before) and also accumulated, so
    the caller's ``book_lists = do_spider(...)`` actually receives data —
    the original returned ``None`` despite being assigned at the call site.

    Args:
        book_tag_list: iterable of tag keyword strings.

    Returns:
        List of per-tag book lists, in the same order as the input tags.
    """
    all_book_lists = []
    for book_tag in book_tag_list:
        book_list = book_spider(book_tag)
        print(book_list)
        all_book_lists.append(book_list)
    return all_book_lists
# 測試
if __name__ == "__main__":
    # Crawl a fixed set of Douban tags when run as a script.
    book_tag_lists = ['個人管理', '時間管理', '投資', '文化', '宗教']
    book_lists = do_spider(book_tag_lists)