import urllib.request
from bs4 import BeautifulSoup
import re
# 豆瓣電影top250
def __getHtml():
    """Download every page of the Douban Top 250 movie listing.

    The listing is requested 25 entries at a time (``start`` = 0, 25, ...,
    225), i.e. ten requests in total. After each request a progress line
    with the next offset and the next page number is printed.

    Returns:
        list[str]: The UTF-8 decoded HTML of each page, in request order.

    Raises:
        urllib.error.URLError: If a request fails (``HTTPError`` for an
            error status returned by the server).
        UnicodeDecodeError: If a response body is not valid UTF-8.
    """
    # NOTE(review): the site appears to reject urllib's default User-Agent,
    # so a browser-like one is sent with every request -- confirm against
    # the live site.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 '
                      '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    }
    data = []
    pageNum = 1
    for pageSize in range(0, 250, 25):
        url = "https://movie.douban.com/top250?start=" + str(pageSize) + "&filter=" + str(pageNum)
        request = urllib.request.Request(url, headers=headers)
        # The context manager closes the HTTP response even if read/decode fails.
        with urllib.request.urlopen(request) as response:
            data.append(response.read().decode("utf-8"))
        pageNum += 1
        # Same progress output as before: next offset and next page number.
        print(pageSize + 25, pageNum)
    return data
def __getData(html):
    """Extract title, rating and rank for each movie on one listing page.

    The markup is parsed with BeautifulSoup's ``html.parser``; every ``<li>``
    found under the ``<ol class="grid_view">`` element contributes one entry
    to each of the returned lists.

    Args:
        html: HTML text of a single listing page.

    Returns:
        dict: Three parallel lists of strings under the keys ``'title'``
        (text of the first ``span.title``), ``'rating_num'`` (text of
        ``span.rating_num`` inside ``div.star``) and ``'range_num'`` (text of
        the ``<em>`` inside ``div.pic``).

    Raises:
        AttributeError: If any of the expected elements is missing, because
            the lookup then yields ``None``.
    """
    document = BeautifulSoup(html, "html.parser")
    movie_list = document.find("ol", attrs={'class': 'grid_view'})

    result = {'title': [], 'rating_num': [], 'range_num': []}
    for entry in movie_list.find_all("li"):
        title_text = entry.find("span", class_="title").text
        result['title'].append(title_text)

        star_box = entry.find("div", class_='star')
        result['rating_num'].append(star_box.find("span", class_='rating_num').text)

        picture_box = entry.find("div", class_='pic')
        result['range_num'].append(picture_box.find("em").text)
    return result
def __getMovies(data, path='F://1.html'):
    """Render the scraped movies as an HTML table and write it to a file.

    Args:
        data: Iterable of per-page dicts, each holding parallel lists under
            the keys ``'title'``, ``'rating_num'`` and ``'range_num'``.
        path: Destination file. Defaults to the location that was previously
            hard-coded, so existing callers are unaffected.

    Raises:
        KeyError: If a page dict lacks one of the three expected keys.
        OSError: If the destination file cannot be opened or written.

    Changes from the earlier version:
        * Rows come from the ``data`` argument instead of a module-level
          ``datas`` global.
        * Cells follow the header order (movie, rank, rating); previously the
          rating appeared under the rank header and vice versa.
        * The stray second ``</thead>`` after ``</tbody>`` is gone.
        * The file is written as UTF-8 (declared via a ``<meta>`` tag) inside
          a ``with`` block, so it is closed even when writing fails.
        * Cell text is HTML-escaped, and pages with fewer than 25 entries no
          longer raise ``IndexError``.
    """
    # Local import keeps the block self-contained without touching the
    # file-level import list.
    from html import escape

    cell = "<td style='color:%s;text-align:center'>%s</td>"
    parts = [
        "<html>",
        "<head>",
        '<meta charset="utf-8">',
        "</head>",
        "<body>",
        "<table>",
        "<thead>",
        "<tr>",
        "<th>電影</th>",
        "<th>排名</th>",
        "<th>評分</th>",
        "</tr>",
        "</thead>",
        "<tbody>",
    ]
    for page in data:
        # zip stops at the shortest list, so short or uneven pages are safe.
        rows = zip(page['title'], page['range_num'], page['rating_num'])
        for title, rank, rating in rows:
            parts.append("<tr>")
            parts.append(cell % ("orange", escape(str(title))))
            parts.append(cell % ("blue", escape(str(rank))))
            parts.append(cell % ("red", escape(str(rating))))
            parts.append("</tr>")
    parts.extend(["</tbody>", "</table>", "</body>", "</html>"])

    # Build the whole document first so a bad row never leaves a half-written file.
    with open(path, 'w', encoding='utf-8') as f:
        f.write("".join(parts))
if __name__ == '__main__':
    # Script entry point: download all listing pages, parse each one, then
    # write the combined HTML table.
    htmls = __getHtml()
    # Keep the parsed pages bound to the module-level name `datas`; other
    # code in this file may refer to that name.
    datas = [__getData(page) for page in htmls]
    __getMovies(datas)
# 生成的html (the generated HTML output):