python爬虫--实战豆瓣电影TOP250

爬取数据的地址:豆瓣电影TOP250


import  requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36'}


def get_detail_urls(url):
    resp = requests.get(url, headers=headers)
    # print(resp.text)

    # 获取详情页面的url
    html = resp.text
    soup = BeautifulSoup(html, 'lxml')
    lis = soup.find('ol', class_='grid_view').find_all('li')
    detail_urls = []
    for li in lis:
        detail_url = li.find('a')['href']
        # print(detail_url)
        detail_urls.append(detail_url)
    return detail_urls

def parse_detail_url(detail_url,f,page):

    # 解析详情页面内容
    resp = requests.get(detail_url, headers=headers)
    # print(detail_url)
    html = resp.text
    soup = BeautifulSoup(html, 'lxml')
    # 获取电影的名字
    name = list(soup.find('div', id='content').find('h1').stripped_strings)
    name = ''.join(name)
    # print(name)
    try:
        # 导演
        director = list(soup.find('div', id='info').find('span').find('span', class_='attrs').stripped_strings)
        director = ''.join(director)
        # print(director)
        # 编剧
        screenwriter = list(soup.find('div', id='info').find_all('span')[3].find('span', class_='attrs').stripped_strings)
        screenwriter = ''.join(screenwriter)
        # print(screenwriter)
        # 演员
        actor = list(soup.find('span', class_='actor').find('span', class_='attrs').stripped_strings)
        actor = ''.join(actor)
        # print(actor)
        # 评分
        score = soup.find('strong', class_='ll rating_num').string
        print(score)
        f.write('{},{},{},{},{}\n'.format(name,director,screenwriter,actor,score))
    except Exception as e:
        print('第{}页,{},{}获取失败'.format(page,name,detail_url))

def main():
    base_url = 'https://movie.douban.com/top250?start={}&filter='
    # 创建Top250.csv文件,并用来存放数据
    with open('Top250.csv','a',encoding='utf-8') as f:
        # 第一页参数为0,第二页为25.。。。。。
        for x in range(0,226,25):
            # 获取网页上真实的页数
            page = 1+x/25
            print('正在获取第%d页的数据'%page)
            url = base_url.format(x)
            detail_urls = get_detail_urls(url)
            for detail_url in detail_urls:
                parse_detail_url(detail_url,f,page)




if __name__ == '__main__':
    main()
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章