python_爬蟲_豆瓣TOP250_url

本文僅供學習使用,如有侵權,聯繫刪除。

獲得豆瓣top 250書單的url

import lxml
import requests
import re
import csv
from requests.exceptions import RequestException

# Module-level accumulator for scraped book URLs: get_book_url_list
# appends to it page by page, and write_csv dumps it at the end.
url_lt = []

def get_one_page(url):
    """Download one page and return its HTML text, or None on failure.

    Parameters
    ----------
    url : str
        The page URL to fetch.

    Returns
    -------
    str or None
        The response body when the server answers 200; None for any
        other status code or any requests-level error (timeout, DNS,
        connection refused, ...).
    """
    try:
        headers = {
            # Fix: the HTTP header name is "User-Agent" (hyphen); the
            # original "User_Agent" was sent as an unknown header, so
            # the UA spoofing never took effect.
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
        }
        response = requests.get(url, headers=headers, timeout=5)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # Treat every network-level failure as "no page"; callers
        # already handle a None result.
        return None


def get_book_url_list(html):
    """Extract every book detail-page URL from a Top-250 list page and
    append them to the module-level ``url_lt``.

    Fixes two defects in the original:
    - ``BeautifulSoup`` was used without ever being imported, raising
      ``NameError`` on the first call; this version uses only ``re``.
    - The old regex captured the quotes around the href value, so the
      CSV ended up containing ``"url"`` with literal quote characters.

    Parameters
    ----------
    html : str or None
        Page source from get_one_page. ``None`` (a failed download) is
        silently skipped instead of crashing.
    """
    if html is None:
        # get_one_page returns None on any network failure.
        return
    # Each book title cell carries class "pl2"; its first <a> holds the
    # detail-page URL in the href attribute.
    pattern = re.compile(r'class="pl2"[^>]*>\s*<a[^>]*href="([^"]+)"', re.S)
    for href in pattern.findall(html):
        url_lt.append(href.strip())


def main(offset):
    """Scrape one page of the Top-250 list.

    Builds the paged list URL from *offset*, downloads it, harvests the
    book URLs into the global ``url_lt``, and prints the running total
    as lightweight progress output.
    """
    page_url = f'https://book.douban.com/top250?start={offset}'
    get_book_url_list(get_one_page(page_url))
    print(len(url_lt))


def write_csv(file, url_list):
    """Write the collected URLs to *file* as a two-column CSV.

    Columns are "rank" (1-based position within *url_list*) and
    "book_url".

    Parameters
    ----------
    file : str
        Output CSV path.
    url_list : list of str
        Book URLs in rank order.
    """
    # 'w' instead of the original 'a': the header is written on every
    # call, so append mode would duplicate header rows across runs.
    with open(file, 'w', encoding='utf-8', newline='') as csvfile:
        fieldnames = ["rank", "book_url"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        # enumerate replaces the un-idiomatic range(len(...)) loop.
        for rank, book_url in enumerate(url_list, start=1):
            writer.writerow({"rank": rank, "book_url": book_url})



if __name__ == '__main__':
    # Douban's Top-250 list is paged 25 books per page, addressed by
    # ?start=0,25,...,225.  The original passed start=0..9, which just
    # re-fetched overlapping slices of the first page instead of
    # walking all ten pages.
    for page in range(10):
        main(page * 25)
    write_csv("douban_TOP250_data.csv", url_lt)
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章