Python3爬蟲入門之爬取豆瓣Top250電影名稱
準備工具
- Python3.5
- requests
- BeautifulSoup
- lxml
最終效果
- 首先看一下網站的結構
可以很清楚的看到每個電影對應了一個<li>
標籤,我們只需要一步一步的從<ol>
向下搜索,可以得到電影對應的名稱,即<span class="title">肖申克的救贖</span>
這一行
- 接着看一下網頁內 後頁按鈕對應的代碼結構
可以看出後一頁的URL爲 https://movie.douban.com/top250?start=25&filter=
最後一頁則沒有這個標籤,對應 None
這樣我們就可以進行翻頁了
直接上代碼
- 獲取html代碼
這裏使用requests模塊,獲取很方便
import requests
def download_page(url):
    """Fetch *url* with a browser-like User-Agent and return the raw body bytes."""
    ua = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
          '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')
    response = requests.get(url, headers={'User-Agent': ua})
    return response.content
- 解析html
獲取到html源碼後就要對其進行解析了,這裏使用BeautifulSoup模塊
from bs4 import BeautifulSoup
URL = 'https://movie.douban.com/top250'


def parse_html(html):
    """Parse one Top250 listing page.

    Parameters:
        html: raw HTML (bytes or str) of a Top250 page.

    Returns:
        (movie_name_list, next_url) — the titles found on this page, and the
        absolute URL of the next page, or None when there is no next page.
    """
    soup = BeautifulSoup(html, 'lxml')
    movie_name_list = []
    movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})
    # Guard: on unexpected markup (e.g. an anti-bot page) the <ol> may be
    # missing; the original code would raise AttributeError here.
    if movie_list_soup is not None:
        for movie_li in movie_list_soup.find_all('li'):
            detail = movie_li.find('div', attrs={'class': 'hd'})
            movie_name = detail.find('span', attrs={'class': 'title'}).getText()
            movie_name_list.append(movie_name)
    # Guard: the "next" span itself can be absent, not just its <a> child.
    next_span = soup.find('span', attrs={'class': 'next'})
    next_page = next_span.find('a') if next_span is not None else None
    if next_page:
        return movie_name_list, URL + next_page['href']
    return movie_name_list, None
from bs4 import BeautifulSoup
URL = 'https://movie.douban.com/top250'


def parse_html1(html):
    """Parse one Top250 listing page using CSS selectors.

    Parameters:
        html: raw HTML (bytes or str) of a Top250 page.

    Returns:
        (movie_names, next_url) — the titles on this page and the absolute
        URL of the next page, or None on the last page.
    """
    soup = BeautifulSoup(html, 'lxml')
    movie_names = []
    movie_list = soup.select('ol.grid_view li div.item div.info div.hd a')
    for movie_title in movie_list:
        movie_name = movie_title.find('span', class_='title')
        movie_names.append(movie_name.getText())
    # Guard: on unexpected markup the "next" span may be missing entirely;
    # the original code would raise AttributeError instead of stopping.
    next_span = soup.find('span', class_='next')
    next_page = next_span.find('a') if next_span is not None else None
    if next_page:
        return movie_names, URL + next_page['href']
    return movie_names, None
import codecs

import requests
from bs4 import BeautifulSoup
URL = 'https://movie.douban.com/top250'


def download_page(url):
    """Fetch *url* with a browser-like User-Agent and return the raw body bytes."""
    ua = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
          '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')
    response = requests.get(url, headers={'User-Agent': ua})
    return response.content
def parse_html1(html):
    """Parse one Top250 listing page using CSS selectors.

    Parameters:
        html: raw HTML (bytes or str) of a Top250 page.

    Returns:
        (movie_names, next_url) — the titles on this page and the absolute
        URL of the next page, or None on the last page.
    """
    soup = BeautifulSoup(html, 'lxml')
    movie_names = []
    movie_list = soup.select('ol.grid_view li div.item div.info div.hd a')
    for movie_title in movie_list:
        movie_name = movie_title.find('span', class_='title')
        movie_names.append(movie_name.getText())
    # Guard: on unexpected markup the "next" span may be missing entirely;
    # the original code would raise AttributeError instead of stopping.
    next_span = soup.find('span', class_='next')
    next_page = next_span.find('a') if next_span is not None else None
    if next_page:
        return movie_names, URL + next_page['href']
    return movie_names, None
def main(output_path='e:/movies.txt'):
    """Crawl every Top250 page and write one movie title per line.

    Parameters:
        output_path: destination text file (UTF-8); defaults to the
            original hard-coded Windows path for backward compatibility.

    Bugfix: the original script called ``codecs.open`` without ever
    importing ``codecs``, so it crashed with NameError at runtime
    (the import is added to the script's import block).
    """
    url = URL
    with codecs.open(output_path, 'wb', encoding='utf-8') as fp:
        while url:
            html = download_page(url)
            movies, url = parse_html1(html)
            for movie_name in movies:
                fp.write(movie_name)
                fp.write('\r\n')


if __name__ == '__main__':
    main()