# This script is for learning purposes only; if it infringes any rights, contact the author for removal.
# The Douban TOP250 books page is laid out as shown below; this script scrapes that page content.
from bs4 import BeautifulSoup
import lxml
import requests
import re
import csv
import random
import time
from lxml import etree
from requests.exceptions import RequestException
# Module-level accumulators: parse_one_page() appends one entry per book to
# each of these, and write_csv() reads them back out row by row.  They grow
# across pages because main() is called once per 25-item page.
book_name_list = []
author_list = []
translator_list = []
publishing_house_list = []
publishing_time_list = []
price_list = []
score_list = []
comment_list = []
sentence_list = []  # one-line quote per book; collected but not written to the CSV
def get_one_page(url):
    """Fetch one listing page and return its HTML text.

    Returns None on any network error or on a non-200 response, so the
    caller must check the result before parsing it.
    """
    try:
        headers = {
            # Bug fix: the header name is "User-Agent" (hyphen).  The original
            # "User_Agent" key was sent as an unrecognized custom header, so
            # the user-agent string was never actually applied.
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
        }
        response = requests.get(url, headers=headers, timeout=5)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
def parse_one_page(html):
    """Parse one TOP250 listing page and append results to the module-level
    accumulator lists.

    The per-book info line has the form
        author / [translator /] publisher / date / price
    The translator segment is optional, and an author or translator name may
    itself contain '/' separators.  The trailing three fields (publisher,
    date, price) are therefore taken from the RIGHT end, and everything
    between author and publisher is treated as the translator, so extra
    slashes no longer shift the columns.
    """
    select = etree.HTML(html)
    # Book titles: strip whitespace and drop the empty strings produced by
    # the line-broken anchor text.
    book_name = [name.strip() for name in select.xpath('//div[@class="pl2"]/a/text()')]
    book_name = list(filter(None, book_name))
    books_info = select.xpath('//p[@class="pl"]//text()')
    score = select.xpath('//span[@class="rating_nums"]//text()')
    comment = select.xpath('//span[@class="pl"]//text()')
    message = select.xpath('//span[@class="inq"]//text()')
    author_sub_list = []
    translator_sub_list = []
    publishing_house_sub_list = []
    publishing_time_sub_list = []
    price_sub_list = []
    for raw in books_info:
        parts = [part.strip() for part in str(raw).split('/')]
        # Pad entries that lack the optional translator field so every row
        # has at least 5 segments.
        while len(parts) <= 4:
            parts.insert(1, 'NA')
        author_sub_list.append(parts[0])
        # Anything between the author and the trailing three fields belongs
        # to the translator (names can contain '/').
        translator_sub_list.append('/'.join(parts[1:-3]))
        publishing_house_sub_list.append(parts[-3])
        publishing_time_sub_list.append(parts[-2])
        price_sub_list.append(parts[-1])
    book_name_list.extend(book_name)
    author_list.extend(author_sub_list)
    translator_list.extend(translator_sub_list)
    publishing_house_list.extend(publishing_house_sub_list)
    publishing_time_list.extend(publishing_time_sub_list)
    price_list.extend(price_sub_list)
    score_list.extend(score)
    comment_list.extend(comment)
    sentence_list.extend(message)
def write_csv(filename):
    """Write the accumulated book data to *filename* as a UTF-8 CSV.

    Rows are produced by zipping the accumulator lists, so a shorter list
    (e.g. a page where some scores or comments were missing) truncates the
    output cleanly instead of raising IndexError as the old index-based
    loop did.  The file is rewritten from scratch on every call.
    """
    fieldnames = ["rank", "book_name", "author", "translator",
                  "publishing_house", "publishing_time", "price",
                  "score", "comment"]
    # 'utf-8-sig' writes a BOM so Excel detects the encoding correctly;
    # plain 'w' replaces the misleading 'w+' (the file is never read here).
    with open(filename, 'w', encoding='utf-8-sig', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        rows = zip(book_name_list, author_list, translator_list,
                   publishing_house_list, publishing_time_list,
                   price_list, score_list, comment_list)
        for rank, row in enumerate(rows, start=1):
            record = dict(zip(fieldnames[1:], row))
            record["rank"] = str(rank)
            writer.writerow(record)
def main(offset):
    """Fetch, parse, and persist one 25-item page of the TOP250 list.

    *offset* is the zero-based index of the first book on the page
    (0, 25, 50, ...).  Progress counts are printed after each page and the
    CSV is rewritten with everything accumulated so far.
    """
    url = 'https://book.douban.com/top250?start=' + str(offset)
    html = get_one_page(url)
    # get_one_page returns None on network failure or a non-200 response;
    # skip parsing in that case instead of handing None to lxml.
    if html is not None:
        parse_one_page(html)
    print("book_name={},author={},translator={},publishing_house={},publishing_time={},price={},score={},comment={},sentence={}".format(len(book_name_list),len(author_list),len(translator_list),len(publishing_house_list),len(publishing_time_list),len(price_list),len(score_list),len(comment_list),len(sentence_list)))
    write_csv("douban_info.csv")
if __name__ == '__main__':
    # Walk the ten result pages of the TOP250 list; each page holds 25 books,
    # so the start offsets are 0, 25, ..., 225.
    for start in range(0, 250, 25):
        main(offset=start)