Python web scraping in practice: Douban Movies Top 250

Data is scraped from the Douban Movies Top 250 list (https://movie.douban.com/top250).
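The Top 250 list spans ten pages of 25 films each, selected with the start query parameter (0 for page 1, 25 for page 2, and so on up to 225). As a quick illustration of the list-page URLs the script walks through (the actual loop lives in main() below):

base_url = 'https://movie.douban.com/top250?start={}&filter='
page_urls = [base_url.format(x) for x in range(0, 250, 25)]  # ten list pages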


import requests
from bs4 import BeautifulSoup

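# Douban tends to block requests that lack a browser-style User-Agent, so one is sent with every request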
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36'}


def get_detail_urls(url):
    resp = requests.get(url, headers=headers)
    # print(resp.text)

    # collect the detail-page URL of every film on this list page
    html = resp.text
    soup = BeautifulSoup(html, 'lxml')
    lis = soup.find('ol', class_='grid_view').find_all('li')
    detail_urls = []
    for li in lis:
        detail_url = li.find('a')['href']
        # print(detail_url)
        detail_urls.append(detail_url)
    return detail_urls

def parse_detail_url(detail_url, f, page):

    # fetch and parse the content of one film's detail page
    resp = requests.get(detail_url, headers=headers)
    # print(detail_url)
    html = resp.text
    soup = BeautifulSoup(html, 'lxml')
    # movie title (all strings inside the <h1> joined together)
    name = list(soup.find('div', id='content').find('h1').stripped_strings)
    name = ''.join(name)
    # print(name)
    try:
        # director
        director = list(soup.find('div', id='info').find('span').find('span', class_='attrs').stripped_strings)
        director = ''.join(director)
        # print(director)
        # screenwriter (taken from the 4th <span> inside the #info block)
        screenwriter = list(soup.find('div', id='info').find_all('span')[3].find('span', class_='attrs').stripped_strings)
        screenwriter = ''.join(screenwriter)
        # print(screenwriter)
        # main cast
        actor = list(soup.find('span', class_='actor').find('span', class_='attrs').stripped_strings)
        actor = ''.join(actor)
        # print(actor)
        # rating
        score = soup.find('strong', class_='ll rating_num').string
        print(score)
        f.write('{},{},{},{},{}\n'.format(name,director,screenwriter,actor,score))
    except Exception as e:
        print('Page {}: failed to parse {} ({}): {}'.format(page, name, detail_url, e))

def main():
    base_url = 'https://movie.douban.com/top250?start={}&filter='
    # open Top250.csv in append mode to store one CSV row per film
    with open('Top250.csv','a',encoding='utf-8') as f:
        # start is 0 for page 1, 25 for page 2, ... up to 225 for page 10
        for x in range(0, 250, 25):
            # page number as shown on the site
            page = 1 + x // 25
            print('Fetching data for page %d' % page)
            url = base_url.format(x)
            detail_urls = get_detail_urls(url)
            for detail_url in detail_urls:
                parse_detail_url(detail_url, f, page)




if __name__ == '__main__':
    main()
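Each row of Top250.csv holds the title, director, screenwriter, cast and rating in that order; no header row is written, and fields are not quoted, so a comma inside a field will spill into extra columns. A minimal sketch for reading the file back, assuming it was produced by the script above:

import csv

with open('Top250.csv', encoding='utf-8') as f:
    for row in csv.reader(f):
        # expected layout: [name, director, screenwriter, actor, score]
        print(row[0], row[-1])  # title and rating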