# This script is for learning purposes only; if it infringes any rights, contact the author for removal.
# The Douban TOP250 books page is laid out as shown below; this script scrapes that page content.
from bs4 import BeautifulSoup
import lxml
import requests
import re
import csv
import random
import time
from lxml import etree
from requests.exceptions import RequestException
# Module-level accumulators: parse_one_page() appends one entry per book to
# each of these, and write_csv() reads them back out row by row.  They grow
# across pages because main() is called once per 25-item page.
book_name_list = []
author_list = []
translator_list = []
publishing_house_list = []
publishing_time_list = []
price_list = []
score_list = []
comment_list = []
sentence_list = []  # one-line quote per book; collected but not written to the CSV
def get_one_page(url):
    """Fetch one listing page and return its HTML text.

    Returns None on any network error or on a non-200 response, so the
    caller must check the result before parsing it.
    """
    try:
        headers = {
            # Bug fix: the header name is "User-Agent" (hyphen).  The original
            # "User_Agent" key was sent as an unrecognized custom header, so
            # the user-agent string was never actually applied.
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
        }
        response = requests.get(url, headers=headers, timeout=5)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
def parse_one_page(html):
    """Parse one TOP250 listing page and append results to the module-level
    accumulator lists.

    The per-book info line has the form
        author / [translator /] publisher / date / price
    The translator segment is optional, and an author or translator name may
    itself contain '/' separators.  The trailing three fields (publisher,
    date, price) are therefore taken from the RIGHT end, and everything
    between author and publisher is treated as the translator, so extra
    slashes no longer shift the columns.
    """
    select = etree.HTML(html)
    # Book titles: strip whitespace and drop the empty strings produced by
    # the line-broken anchor text.
    book_name = [name.strip() for name in select.xpath('//div[@class="pl2"]/a/text()')]
    book_name = list(filter(None, book_name))
    books_info = select.xpath('//p[@class="pl"]//text()')
    score = select.xpath('//span[@class="rating_nums"]//text()')
    comment = select.xpath('//span[@class="pl"]//text()')
    message = select.xpath('//span[@class="inq"]//text()')
    author_sub_list = []
    translator_sub_list = []
    publishing_house_sub_list = []
    publishing_time_sub_list = []
    price_sub_list = []
    for raw in books_info:
        parts = [part.strip() for part in str(raw).split('/')]
        # Pad entries that lack the optional translator field so every row
        # has at least 5 segments.
        while len(parts) <= 4:
            parts.insert(1, 'NA')
        author_sub_list.append(parts[0])
        # Anything between the author and the trailing three fields belongs
        # to the translator (names can contain '/').
        translator_sub_list.append('/'.join(parts[1:-3]))
        publishing_house_sub_list.append(parts[-3])
        publishing_time_sub_list.append(parts[-2])
        price_sub_list.append(parts[-1])
    book_name_list.extend(book_name)
    author_list.extend(author_sub_list)
    translator_list.extend(translator_sub_list)
    publishing_house_list.extend(publishing_house_sub_list)
    publishing_time_list.extend(publishing_time_sub_list)
    price_list.extend(price_sub_list)
    score_list.extend(score)
    comment_list.extend(comment)
    sentence_list.extend(message)
def write_csv(filename):
    """Write the accumulated book data to *filename* as a UTF-8 CSV.

    Rows are produced by zipping the accumulator lists, so a shorter list
    (e.g. a page where some scores or comments were missing) truncates the
    output cleanly instead of raising IndexError as the old index-based
    loop did.  The file is rewritten from scratch on every call.
    """
    fieldnames = ["rank", "book_name", "author", "translator",
                  "publishing_house", "publishing_time", "price",
                  "score", "comment"]
    # 'utf-8-sig' writes a BOM so Excel detects the encoding correctly;
    # plain 'w' replaces the misleading 'w+' (the file is never read here).
    with open(filename, 'w', encoding='utf-8-sig', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        rows = zip(book_name_list, author_list, translator_list,
                   publishing_house_list, publishing_time_list,
                   price_list, score_list, comment_list)
        for rank, row in enumerate(rows, start=1):
            record = dict(zip(fieldnames[1:], row))
            record["rank"] = str(rank)
            writer.writerow(record)
def main(offset):
    """Fetch, parse, and persist one 25-item page of the TOP250 list.

    *offset* is the zero-based index of the first book on the page
    (0, 25, 50, ...).  Progress counts are printed after each page and the
    CSV is rewritten with everything accumulated so far.
    """
    url = 'https://book.douban.com/top250?start=' + str(offset)
    html = get_one_page(url)
    # get_one_page returns None on network failure or a non-200 response;
    # skip parsing in that case instead of handing None to lxml.
    if html is not None:
        parse_one_page(html)
    print("book_name={},author={},translator={},publishing_house={},publishing_time={},price={},score={},comment={},sentence={}".format(len(book_name_list),len(author_list),len(translator_list),len(publishing_house_list),len(publishing_time_list),len(price_list),len(score_list),len(comment_list),len(sentence_list)))
    write_csv("douban_info.csv")
if __name__ == '__main__':
    # Walk the ten result pages of the TOP250 list; each page holds 25 books,
    # so the start offsets are 0, 25, ..., 225.
    for start in range(0, 250, 25):
        main(offset=start)