個人的第一個 Python 爬蟲程序,如有錯誤,請指正。
程序用於抓取豆瓣電影正在熱播的電影信息,詳細代碼如下所示:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import os,requests
from bs4 import BeautifulSoup
def get_url_page(url, headers):
    """Fetch *url* and return the response body as text.

    Args:
        url: page URL to request.
        headers: dict of HTTP request headers (e.g. a User-Agent).

    Returns:
        The decoded response text on HTTP 200, or ``None`` when the
        request fails at the network level (DNS, timeout, refused...).

    Raises:
        Exception: when the server answers with a non-200 status code.
    """
    try:
        response = requests.get(url, headers=headers)
    # Bug fix: the original caught a bare `RequestException`, a name that
    # was never imported, so every failure path raised NameError instead.
    except requests.RequestException:
        return None
    if response.status_code == 200:
        return response.text
    # Raised outside the try block on purpose: a bad status code is a
    # server-side answer, not a network failure, and must propagate.
    raise Exception('獲取頁面信息異常,response_stauts = %s ' % response.status_code)
if __name__ == '__main__':
    print('採集豆瓣電影信息')
    # Fetch the Douban movies front page.
    url = 'https://movie.douban.com/'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0'}
    page_content = get_url_page(url, headers)
    # Bug fix: get_url_page returns None on network failure; the original
    # passed that straight to file.write() and crashed with TypeError.
    if page_content is None:
        raise SystemExit('獲取頁面信息失敗')
    # Work relative to the script's own directory via explicit paths
    # instead of repeated os.chdir() calls.
    folder_path, file_name = os.path.split(os.path.realpath(__file__))
    # Save a copy of the raw HTML next to the script, for debugging.
    html_file = os.path.join(folder_path, os.path.splitext(file_name)[0] + '.html')
    with open(html_file, 'w', encoding='utf-8') as f:
        f.write(page_content)
    # Parse the in-memory HTML directly; no need to re-read it from disk.
    soup = BeautifulSoup(page_content, features='html.parser')
    for row in soup.body.findAll('div', attrs={"id": "screening"}):
        for col in row.findAll('li', attrs={"class": "ui-slide-item"}):
            # Entries without 'data-director' are layout placeholders.
            if 'data-director' not in col.attrs:
                continue
            movie_name = col.ul.li.a.img['alt']
            img_link = col.ul.li.a.img['src']
            # One sub-directory per movie (idempotent on re-runs).
            movies_path = os.path.join(folder_path, movie_name)
            os.makedirs(movies_path, exist_ok=True)
            # Download the poster image; `with` closes the handle even on
            # error (the original leaked the file object).
            img = requests.get(img_link)
            if img.status_code == 200:
                with open(os.path.join(movies_path, '1.jpg'), 'wb') as img_file:
                    img_file.write(img.content)
            else:
                print('圖片不存在')
            # Write the movie metadata pulled from the data-* attributes.
            with open(os.path.join(movies_path, movie_name + '.txt'), 'w', encoding='utf-8') as fo:
                fo.write('電影標題:' + col['data-title'] + '\n')
                fo.write('導演:' + col['data-director'] + '\n')
                fo.write('時長:' + col['data-duration'] + '\n')
                fo.write('國家:' + col['data-region'] + '\n')
                fo.write('評分:' + col['data-rate'] + '\n')
代碼運行效果如下所示: