豆瓣電影 TOP250 爬取,並獲得相關類型的推薦

import requests
import random
from bs4 import BeautifulSoup
import lxml
'''
https://movie.douban.com/top250
https://movie.douban.com/top250?start=25
https://movie.douban.com/top250?start=50&filter=
'''
def _make_header(user_agent):
    """Build one request-header profile for movie.douban.com."""
    return {'User-Agent': user_agent, 'Host': "movie.douban.com"}


# Browser-like header profiles; the scraping loop picks one at random for
# every request.
# Chrome 78 on Windows 10
header1 = _make_header(
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/78.0.3904.108 Safari/537.36'
)
# Edge 18 on Windows 10
header2 = _make_header(
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362"
)
# Firefox 71 on Windows 10
header3 = _make_header(
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0"
)
header_list = [header1, header2, header3]

# Every scraped movie, keyed by movie name.
datas = dict()
# One independent dict per recommended genre, in this order:
# comedy, romance, science fiction, thriller, crime, animation.
comedy, love, sci_fi, thriller, crime, animation = ({} for _ in range(6))
# Crawl the ten pages of the Douban Top 250 list, record every movie in
# `datas`, and copy highly rated movies into the per-genre dicts.

# Minimum rating a movie needs to enter a per-genre recommendation list.
_MIN_GENRE_SCORE = 9.0

# (keywords, target dict) pairs. douban.com pages are written in Simplified
# Chinese, so matching only the Traditional spellings left the comedy,
# romance, thriller and animation lists permanently empty; both spellings
# are accepted here.
_GENRE_BUCKETS = (
    (('喜剧', '喜劇'), comedy),
    (('爱情', '愛情'), love),
    (('科幻',), sci_fi),
    (('惊悚', '驚悚'), thriller),
    (('犯罪',), crime),
    (('动画', '動畫'), animation),
)


def _parse_credits(paragraph):
    """Return ``(director, year, genres)`` parsed from an entry's credits tag.

    The tag is expected to hold two text lines split by ``<br/>``: a people
    line that starts with the labelled director field, and a metadata line
    whose first field is the release year and whose last field is the genre
    list. As in the original parser, ordinary spaces are removed first and
    the remaining whitespace is treated as the field separator (presumably
    non-breaking spaces on the live page -- confirm against current markup).

    Working on the tag's text instead of its HTML source removes the need to
    strip markup by hand, and reading the year from the metadata line keeps
    it correct when the people line has no cast field.

    Args:
        paragraph: BeautifulSoup tag of the credits paragraph, or None.

    Returns:
        Tuple of three strings; all empty when the expected two lines are
        not present.
    """
    if paragraph is None:
        return '', '', ''
    lines = list(paragraph.stripped_strings)
    if len(lines) < 2:
        return '', '', ''
    people_fields = lines[0].replace(' ', '').split()
    meta_fields = lines[-1].replace(' ', '').split()
    # Drop the "director:" label in front of the name.
    label, colon, name = people_fields[0].partition(':')
    director = name if colon else label
    return director, meta_fields[0], meta_fields[-1]


for page in range(10):
    # Each page lists 25 movies; `start` is the offset of the first one.
    if page == 0:
        url = "https://movie.douban.com/top250"
    else:
        url = 'https://movie.douban.com/top250?start=%d&filter=' % (page * 25)
    header = random.choice(header_list)
    # A timeout prevents hanging forever on a stalled connection, and an HTTP
    # error status (e.g. a block page) now fails loudly instead of silently
    # producing an empty result.
    req = requests.get(url, headers=header, timeout=10)
    req.raise_for_status()
    bf = BeautifulSoup(req.text, 'lxml')
    for item in bf.find_all('div', class_='info'):
        # str() detaches the name from the parse tree it was read from.
        movie_name = str(item.find('a').find('span').string)
        score_tag = item.find('div', class_='star').find('span', class_='rating_num')
        score = float(score_tag.string)
        director, year, genres = _parse_credits(
            item.find('div', class_='bd').find('p')
        )
        data = {
            'name': movie_name,
            'director': director,
            'type': genres,
            'time': year,
            'score': score,
        }
        datas[movie_name] = data
        if score >= _MIN_GENRE_SCORE:
            # A movie may belong to several genres and is added to each one.
            for keywords, bucket in _GENRE_BUCKETS:
                if any(keyword in genres for keyword in keywords):
                    bucket[movie_name] = data
# Row layout shared by every table: argument 0 (name) is centred and
# argument 1 (score) is left-aligned, each in a 10-character column filled
# with the character passed as argument 2.
tplt = "{0:{2}^10}\t\t\t{1:{2}<10}"


def _print_ranking(title, movies, limit=10):
    """Print the best-rated movies of one category and return the ranking.

    Args:
        title: Heading text; ten '>' characters are appended to it.
        movies: Mapping of movie name to its record dict; each record must
            provide the 'name' and 'score' keys.
        limit: Maximum number of rows to print.

    Returns:
        List of ``(name, record)`` pairs sorted by descending score. Movies
        with equal scores keep their insertion order.
    """
    print(title + '>' * 10)
    ranking = sorted(movies.items(), key=lambda entry: entry[1]['score'], reverse=True)
    # U+3000 (ideographic space) as fill keeps CJK text columns aligned.
    fill = chr(12288)
    print(tplt.format("電影名稱", "評分", fill))
    for _, record in ranking[:limit]:
        print(tplt.format(record["name"], record["score"], fill))
    print()
    return ranking


# Each container is rebound to its sorted list of (name, record) pairs,
# exactly as the original per-section code did.

# Overall top picks
datas = _print_ranking("豆瓣評分最高", datas)

# Comedy
comedy = _print_ranking("喜劇電影推薦", comedy)

# Romance
love = _print_ranking("愛情電影推薦", love)

# Science fiction
sci_fi = _print_ranking("科幻電影推薦", sci_fi)

# Thriller
thriller = _print_ranking("驚悚電影推薦", thriller)

# Crime
crime = _print_ranking("犯罪電影推薦", crime)

# Animation
animation = _print_ranking("動畫電影推薦", animation)

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章