三種方法抓取貓眼電影top100信息

分別使用 BeautifulSoup、XPath 和正則表達式提取貓眼電影 Top 100 的信息。程序很簡單,就不解釋了,直接上程序吧。

# coding:utf-8
import requests
import re
from lxml import html
from bs4 import BeautifulSoup

# Base URL of the board page; the driver loop at the bottom of the script
# appends an 'offset' query parameter to it for each page it requests.
url = 'http://maoyan.com/board/4?' 

def getResponse(url, par=None, timeout=10):
    """Fetch *url* with an HTTP GET and return the response decoded as UTF-8.

    Args:
        url: Address to request.
        par: Optional mapping of query-string parameters, forwarded to
            ``requests.get`` as ``params``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The ``requests`` response object with ``encoding`` forced to
        ``'utf-8'``.

    Raises:
        SystemExit: If the request fails, times out, or the server answers
            with an error status (4xx/5xx).
    """
    try:
        response = requests.get(url, params=par, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as err:
        # Only network/HTTP failures end the script; Ctrl-C and genuine
        # programming errors are no longer swallowed by a bare except.
        raise SystemExit('url 解析失敗') from err
    # Force UTF-8 so response.text does not depend on header-based guessing.
    response.encoding = 'utf-8'
    return response

def bs4_info(response):
    """Extract film names, casts, release times and scores with BeautifulSoup.

    Args:
        response: Object whose ``text`` attribute holds the page HTML.

    Returns:
        A ``(names, stars, times, scores)`` tuple of lists, one entry per
        matching ``<p>`` element found on the page.
    """
    soup = BeautifulSoup(response.text, 'html.parser')

    def paragraphs(css_class):
        # A plain string passed as ``attrs`` is matched against the CSS class
        # (see BeautifulSoup's find_all documentation).
        return soup.find_all(name='p', attrs=css_class)

    names = []
    for tag in paragraphs('name'):
        names.append(tag.a.string)

    stars = []
    for tag in paragraphs('star'):
        stars.append(tag.string.strip())

    times = []
    for tag in paragraphs('releasetime'):
        times.append(tag.string)

    # The score is split over the first two children of the <p>; join them.
    scores = []
    for tag in paragraphs('score'):
        children = tag.contents
        scores.append(children[0].string + children[1].string)

    return names, stars, times, scores

def lxml_info(response):
    """Extract film names, casts, release times and scores with lxml XPath.

    Args:
        response: Object whose ``content`` attribute holds the raw page bytes.

    Returns:
        A ``(names, stars, times, scores)`` tuple of lists.
    """
    # Decode explicitly: the original author notes that Chinese text is
    # displayed incorrectly when the raw bytes are parsed without decoding.
    page_source = response.content.decode('utf-8')
    tree = html.fromstring(page_source)

    # Note the parentheses: text() is an XPath node test, not an attribute.
    names = tree.xpath("//p[@class='name']/a/text()")

    stars = []
    for raw_cast in tree.xpath("//p[@class='star']/text()"):
        stars.append(raw_cast.strip())

    times = list(tree.xpath("//p[@class='releasetime']/text()"))

    # The score is rendered as two <i> elements (integer and fraction part);
    # pair them positionally and join each pair into one string.
    integer_parts = tree.xpath("//i[@class='integer']/text()")
    fraction_parts = tree.xpath("//i[@class='fraction']/text()")
    scores = [whole + frac for whole, frac in zip(integer_parts, fraction_parts)]

    return names, stars, times, scores

# Patterns are compiled once at import time instead of on every call.
# The parenthesised groups are the parts that get extracted.
_NAME_PATTERN = re.compile(r'<p class="name"><a href=".*?title="(.*?)" data-act')
# Accept both the traditional (上映時間) and simplified (上映时间) label and
# both an ASCII and a full-width colon. The original pattern only matched the
# traditional/ASCII form; the live page presumably uses the simplified one
# (NOTE(review): confirm against the current markup).
# Exactly the first 10 characters after the label are captured.
_TIME_PATTERN = re.compile(r'<p class="releasetime">上映[時时][間间][::](.{10}).*?</p>')
# re.S lets '.' span the line breaks inside the <p class="star"> element.
_STAR_PATTERN = re.compile(r'<p class="star">.*?主演[::](.*?)</p>', re.S)
_SCORE_PATTERN = re.compile(
    r'<p class="score"><i class="integer">(.*?)</i><i class="fraction">(\d)</i></p>'
)


def re_info(response):
    """Extract film names, casts, release times and scores with regexes.

    Args:
        response: Object whose ``text`` attribute holds the page HTML.

    Returns:
        A ``(names, stars, times, scores)`` tuple of lists of strings. Each
        list is filled independently from its own pattern, so the lists are
        only aligned when every film entry matches all four patterns.
    """
    text = response.text

    names = _NAME_PATTERN.findall(text)
    times = _TIME_PATTERN.findall(text)
    stars = [cast.strip() for cast in _STAR_PATTERN.findall(text)]
    # Each score match is an (integer part, fraction digit) pair, e.g. ('9.', '6').
    scores = [whole + frac for whole, frac in _SCORE_PATTERN.findall(text)]

    return names, stars, times, scores

# Accumulators for the results gathered across all pages.
names, stars, times, scores = [], [], [], []

# Request ten pages, advancing the 'offset' query parameter by 10 each time
# (0, 10, ..., 90), and collect what the regex extractor finds on each page.
for offset in range(0, 100, 10):
    response = getResponse(url, par={'offset': str(offset)})
    page_names, page_stars, page_times, page_scores = re_info(response)
    names.extend(page_names)
    stars.extend(page_stars)
    times.extend(page_times)
    scores.extend(page_scores)
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章