Scraping Maoyan Movies with Python

This article follows the scraping steps from https://mp.weixin.qq.com/s/rRtb8ATXrVxr3r5uLEhRtA; if you are interested, you can read the original author's article at that link.

My setup:

Browser: Google Chrome

 

 

Scraping Steps

Step 1: Go to the Maoyan site, open the Films section, and look at each film's link.

Pick a film, right-click and choose "Inspect" (Ctrl+Shift+I), and you can find the link to its detail page.

Step 2: A first look at the film detail page

Right-click and inspect: the user score, the number of raters, and the cumulative box office are all protected by anti-scraping measures.

In the developer tools you can see that the digits sit in elements whose class attribute is stonefont, so use Ctrl+F to find the URL that serves the encoded font.

Once you have the encoded text, you need to decode it. The original author scraped with PyCharm; I tried Jupyter first, but it could not install the right third-party packages, so I went with PyCharm too.

Because the original article omits some steps, I was a bit lost at first. After careful reading and some trial and error, I worked out that "download the woff" means downloading the woff file Maoyan serves: simply copy the link that appears before format('woff') in the screenshot above and download it.
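
As a rough sketch of that step in code (the font URL below is a placeholder; every page serves a file with a different hash), downloading the woff and inspecting it with fontTools looks like this:

import requests
from fontTools.ttLib import TTFont

# Placeholder URL: copy the real one from before format('woff') on the page
woff_url = 'http://vfile.meituan.net/colorstone/xxxxxxxx.woff'
with open('maoyan.woff', 'wb') as f:
    f.write(requests.get(woff_url).content)

font = TTFont('maoyan.woff')   # fontTools opens woff files directly
font.saveXML('maoyan.xml')     # dump to XML to inspect the glyph outlines
print(font.getGlyphOrder())    # glyph names such as 'uniF411', 'uniE50C', ...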

 

--------- Update 2019-01-25 ----------

Running the author's original code, I found it no longer works, so I borrowed from it and added some ideas of my own.

Problem 1: BeautifulSoup cannot extract the data inside the stonefont spans; everything comes back empty.

Solution: a regular expression can extract it.
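
A minimal illustration of why BeautifulSoup seems to return nothing (the entity values here are made up): the spans hold numeric character references in the Private Use Area, which decode to characters that render as blank, while a regex over the raw HTML keeps the &#x...; entities intact:

from bs4 import BeautifulSoup

html = '<span class="stonefont">&#xF411;.&#xE50C;</span>'  # made-up code points
text = BeautifulSoup(html, 'html.parser').span.get_text()
print(text)                          # looks empty: the characters have no visible glyphs
print([hex(ord(c)) for c in text])   # ['0xf411', '0x2e', '0xe50c']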

Problem 2: the anti-scraping font's download link differs on every page and changes dynamically, and the glyph mapping of each woff file is different too.

Solution: save the woff link of each page. How to map the differing woff rules back to digits is still unsolved at this point.

 

First pass at the code:

import requests
import time
from fontTools.ttLib import TTFont
from bs4 import BeautifulSoup
import re
import lxml
import pymysql

# 'Classic films' corresponds to showType=3; pages step by offset=30
# Scrape films released in 2018.
# Clicking those filters in the browser produces the URL https://maoyan.com/films?showType=3&sortId=1&yearId=13, and the site shows 195 pages.


# Step 1: walk the listing pages and collect each film's detail link.


head = """
accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
cache-control: max-age=0
Host: maoyan.com
content-type: text/html; charset=utf-8
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36
"""


# Parse the header string into a dict; different functions can build different headers
def str_to_dict(header):
    header_dict = {}
    header = header.split('\n')
    for h in header:
        h = h.strip()
        if h:
            k, v = h.split(':', 1)
            header_dict[k] = v.strip()
    return header_dict


# Collect the films' detail-page URLs
def get_url():
    a = 194 * 30 + 1
    num2 = 0
    for i in range(0, a, 30):
        # time.sleep(5)
        url = 'https://maoyan.com/films?showType=3&sortId=1&yearId=13&offset=' + str(i)
        host = """referer: https://maoyan.com/films?showType=3&sortId=1&yearId=13"""
        header = head + host
        headers = str_to_dict(header)
        response = requests.get(url=url, headers=headers)
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        data_1 = soup.find_all('div', {'class': 'channel-detail movie-item-title'})
        data_2 = soup.find_all('div', {'class': 'channel-detail channel-detail-orange'})
        urls_lib = []
        num = 0
        num2 += 1
        for item in data_1:
            # time.sleep(6)
            num += 1
            url_1 = item.select('a')[0]['href']
            # skip films whose rating column reads '暫無評分' (no rating yet)
            if data_2[num - 1].get_text() != '暫無評分':
                url = 'https://maoyan.com' + url_1
                urls_lib.append(url)
                with open('movies_url.txt', 'a+', encoding='utf-8') as f:
                    f.write(url + '\n')
        print('---- page {} done ----'.format(num2))
    print('Finished')

# Fetch the info from a film's detail page
def get_message(url):
    time.sleep(3)
    data = {}
    headers = str_to_dict(head)
    response = requests.get(url=url, headers=headers)
    u = response.text
    # Grab the download link of the anti-scraping font
    cmp = re.compile(r",\n           url\('(//.*\.woff)'\) format\('woff'\)")
    rst = cmp.findall(u)
    # Extract the obfuscated digits
    cmp2 = re.compile('<span class="stonefont">(.*?)</span>')
    scores = cmp2.findall(u)
    cmp3 = re.compile('<span class="stonefont">(.*?)</span><span class="unit">(.*?)</span>')
    price = cmp3.findall(u)
    # Extract the film info; the pages I sampled all had box-office data, so no guard for missing values yet
    soup = BeautifulSoup(u, "lxml")
    name = soup.find_all('h3', {'class': 'name'})
    ell = soup.find_all('li', {'class': 'ellipsis'})
    # Assemble the film info
    data["name"] = name[0].get_text()
    data["type"] = ell[0].get_text()
    data["country"] = ell[1].get_text().split('/')[0].strip().replace('\n', '')
    data["length"] = ell[1].get_text().split('/')[1].strip().replace('\n', '')
    data["released"] = ell[2].get_text()[:10]
    data["score"] = scores[0]
    data["score_hum"] = scores[1]
    data["price"] = str(price[0]).replace('\'', '')
    data["woff"] = str(rst[0]).replace('\'', '')
    print(data)
    return data


def to_mysql(data):
    db = pymysql.connect(host='localhost', user='root', passwd='password', port=3306, db='maoyan')
    cursor = db.cursor()
    # table = 'films'
    # keys = ''.join(data.keys())
    # values = ''.join(['%s'] * len(data))
    sql = 'insert into films values (%s, %s, %s, %s, %s, %s, %s, %s, %s)'
    try:
        if cursor.execute(sql, (
        data["name"], data["type"], data["country"], data["length"], data["released"], data["score"], data["score_hum"],
        data["price"], data["woff"])):
            print("Successful")
            db.commit()
    except Exception:
        print('Failed')
        db.rollback()
    db.close()


def get_all():
    # get_url()
    with open('movies_url.txt', 'r', encoding='utf-8') as f:
        num = 0
        for url in f:
            num += 1
            data = get_message(url.split('\n')[0])
            to_mysql(data)
            print('Wrote record {}'.format(num))

get_all()
# url = 'https://maoyan.com/films/1229049'
#
# data = get_message(url)
# to_mysql(data)
# to_mysql(data)
# Converting the font rules:
# font = TTFont('maoyan.woff')
# font.saveXML('maoyan.xml')
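
For reference, to_mysql() assumes a films table with nine string columns already exists in the maoyan database. The schema is never shown, so this is my own guess at a minimal version (the column names are hypothetical, chosen to match the dict keys):

import pymysql

db = pymysql.connect(host='localhost', user='root', passwd='password', port=3306, db='maoyan')
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS films (
        name VARCHAR(100), type VARCHAR(100), country VARCHAR(100),
        length VARCHAR(50), released VARCHAR(20), score VARCHAR(100),
        score_hum VARCHAR(100), price VARCHAR(100), woff VARCHAR(255)
    ) DEFAULT CHARSET=utf8mb4
""")
db.commit()
db.close()

The score, score_hum, and price columns are VARCHAR because at this stage they still hold the raw encoded entities. The final version at the end of this post inserts only eight values (it decodes the digits and drops the woff column), so the table needs adjusting before running that one.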

The random pages I had sampled never showed a missing box office, so I ran the code above as-is; it errored while writing record 115, on a film with no box-office figure. So a guard has to go in:

    if price:
        data["price"] = str(price[0]).replace('\'', '')
    else:
        data["price"] = "暫無"

With the box-office guard in place, the crawl got through another 155 records and then broke again: some films have no running time, of all things. So a running-time guard is needed as well:

    if '/' in ell[1].get_text():
        data["length"] = ell[1].get_text().split('/')[1].strip().replace('\n', '')
    else:
        data["length"] = ""

Another 81 records later, it turned out a film can also lack a rater count. At this rate, every field I want will need its own guard...

    if len(scores) == 2:
        data["score_hum"] = scores[1]
    else:
        data["score_hum"] = ""

With that, the crawl finally ran to completion.

That is the detail-page crawl with the anti-scraping text still undecoded. Here is the complete code:

import requests
import time
from fontTools.ttLib import TTFont
from bs4 import BeautifulSoup
import re
import lxml
import pymysql

# 'Classic films' corresponds to showType=3; pages step by offset=30
# Scrape films released in 2018.
# Clicking those filters in the browser produces the URL https://maoyan.com/films?showType=3&sortId=1&yearId=13, and the site shows 195 pages.


# Step 1: walk the listing pages and collect each film's detail link.


head = """
accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
cache-control: max-age=0
Host: maoyan.com
content-type: text/html; charset=utf-8
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36
"""


# Parse the header string into a dict; different functions can build different headers
def str_to_dict(header):
    header_dict = {}
    header = header.split('\n')
    for h in header:
        h = h.strip()
        if h:
            k, v = h.split(':', 1)
            header_dict[k] = v.strip()
    return header_dict


# Collect the films' detail-page URLs
def get_url():
    a = 194 * 30 + 1
    num2 = 0
    for i in range(0, a, 30):
        # time.sleep(5)
        url = 'https://maoyan.com/films?showType=3&sortId=1&yearId=13&offset=' + str(i)
        host = """referer: https://maoyan.com/films?showType=3&sortId=1&yearId=13"""
        header = head + host
        headers = str_to_dict(header)
        response = requests.get(url=url, headers=headers)
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        data_1 = soup.find_all('div', {'class': 'channel-detail movie-item-title'})
        data_2 = soup.find_all('div', {'class': 'channel-detail channel-detail-orange'})
        urls_lib = []
        num = 0
        num2 += 1
        for item in data_1:
            # time.sleep(6)
            num += 1
            url_1 = item.select('a')[0]['href']
            # skip films whose rating column reads '暫無評分' (no rating yet)
            if data_2[num - 1].get_text() != '暫無評分':
                url = 'https://maoyan.com' + url_1
                urls_lib.append(url)
                with open('movies_url.txt', 'a+', encoding='utf-8') as f:
                    f.write(url + '\n')
        print('---- page {} done ----'.format(num2))
    print('Finished')

# Fetch the info from a film's detail page
def get_message(url):
    time.sleep(3)
    data = {}
    headers = str_to_dict(head)
    response = requests.get(url=url, headers=headers)
    u = response.text
    # Grab the download link of the anti-scraping font
    cmp = re.compile(r",\n           url\('(//.*\.woff)'\) format\('woff'\)")
    rst = cmp.findall(u)
    # Extract the obfuscated digits
    cmp2 = re.compile('<span class="stonefont">(.*?)</span>')
    scores = cmp2.findall(u)
    cmp3 = re.compile('<span class="stonefont">(.*?)</span><span class="unit">(.*?)</span>')
    price = cmp3.findall(u)
    # Extract the film info (guards for the missing-field cases are added below)
    soup = BeautifulSoup(u, "lxml")
    name = soup.find_all('h3', {'class': 'name'})
    ell = soup.find_all('li', {'class': 'ellipsis'})
    # Assemble the film info
    data["name"] = name[0].get_text()
    data["type"] = ell[0].get_text()
    data["country"] = ell[1].get_text().split('/')[0].strip().replace('\n', '')
    if '/' in ell[1].get_text():
        data["length"] = ell[1].get_text().split('/')[1].strip().replace('\n', '')
    else:
        data["length"] = ""
    data["released"] = ell[2].get_text()[:10]
    data["score"] = scores[0]
    if len(scores) == 2:
        data["score_hum"] = scores[1]
    else:
        data["score_hum"] = ""
    if price:
        data["price"] = str(price[0]).replace('\'', '')
    else:
        data["price"] = "暫無"
    data["woff"] = str(rst[0]).replace('\'', '')
    print(data)
    return data


def to_mysql(data):
    db = pymysql.connect(host='localhost', user='root', passwd='password', port=3306, db='maoyan')
    cursor = db.cursor()
    sql = 'insert into films values (%s, %s, %s, %s, %s, %s, %s, %s, %s)'
    try:
        if cursor.execute(sql, (
        data["name"], data["type"], data["country"], data["length"], data["released"], data["score"], data["score_hum"],
        data["price"], data["woff"])):
            print("Successful")
            db.commit()
    except Exception:
        print('Failed')
        db.rollback()
    db.close()


def get_all():
    # get_url()
    with open('movies_url.txt', 'r', encoding='utf-8') as f:
        num = 0
        for url in f:
            num += 1
            data = get_message(url.split('\n')[0])
            to_mysql(data)
            print('Wrote record {}'.format(num))

get_all()

 

One difficulty remains: how to use each movie page's own font rules to convert the scraped anti-scraping text.

 

--------- To be continued ----------

 

A SQL query shows the woff download links follow 41 distinct rules in total. One crude option: look up all 41 fonts in the Baidu font editor (百度字體編輯器) and convert the obfuscated text to normal text according to each font's mapping.
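
The check itself is a one-liner, assuming the films table sketched earlier with its hypothetical woff column:

import pymysql

db = pymysql.connect(host='localhost', user='root', passwd='password', port=3306, db='maoyan')
cursor = db.cursor()
cursor.execute('SELECT COUNT(DISTINCT woff) FROM films')
print(cursor.fetchone()[0])   # 41 in my data
db.close()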

 

I had never quite understood the principle behind the text conversion in the original article. After some searching I read another post and roughly got it: you recognize a digit by its glyph shape. The post is 「Python爬蟲實例:爬取貓眼電影——破解字體反爬」 (a Python scraping example that cracks Maoyan's font anti-scraping). After reading it carefully, I realized its approach and steps were what I actually needed; it is easier to understand and follow than the WeChat article.
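
This is also where base.woff and fontdict in the code below come from: download any one woff file as a reference font, open it in a font editor (or dump it with fontTools) to see which glyph name draws which digit, and record that mapping by hand. Fresh fonts can then be decoded by comparing glyph outlines, roughly like this sketch (file names are placeholders; fontdict is the hand-made mapping for my reference font):

from fontTools.ttLib import TTFont

base = TTFont('base.woff')            # reference font, digits labelled by hand
fontdict = {'uniF411': '8', 'uniF043': '4', 'uniE50C': '1', 'uniE3FF': '9',
            'uniEC2B': '2', 'uniF290': '5', 'uniEFC2': '3', 'uniE4B7': '0',
            'uniF675': '6', 'uniF2EE': '7'}

new = TTFont('font/another.woff')     # a freshly downloaded page font
for name in new.getGlyphOrder()[2:]:  # the first two glyphs are placeholders
    for ref_name, digit in fontdict.items():
        # glyph names change per font, but the digit outlines do not
        if new['glyf'][name] == base['glyf'][ref_name]:
            print(name, '->', digit)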

 

Combining the WeChat article and the scraping-example post, I put together the code below; tested, it works:

import os
import time
import re
import requests
from fontTools.ttLib import TTFont
# from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import pymysql


head = """
accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
cache-control: max-age=0
Host: maoyan.com
content-type: text/html; charset=utf-8
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36
"""




os.makedirs('font', exist_ok=True)
regex_woff = re.compile(r"(?<=url\(').*\.woff(?='\))")
regex_text = re.compile(r'(?<=<span class="stonefont">).*?(?=</span>)')
regex_font = re.compile(r'(?<=&#x).{4}(?=;)')

# Reference font downloaded once by hand; the glyph-to-digit mapping below
# was read off in a font editor (see the sketch above)
basefont = TTFont('base.woff')
fontdict = {'uniF411': '8', 'uniF043': '4', 'uniE50C': '1', 'uniE3FF': '9', 'uniEC2B': '2', 'uniF290': '5', 'uniEFC2': '3', 'uniE4B7': '0', 'uniF675': '6', 'uniF2EE': '7'}

def str_to_dict(header):
    header_dict = {}
    header = header.split('\n')
    for h in header:
        h = h.strip()
        if h:
            k, v = h.split(':', 1)
            header_dict[k] = v.strip()
    return header_dict

# Collect the films' detail-page URLs
def get_url():
    a = 194 * 30 + 1
    num2 = 0
    for i in range(0, a, 30):
        # time.sleep(5)
        url = 'https://maoyan.com/films?showType=3&sortId=1&yearId=13&offset=' + str(i)
        host = """referer: https://maoyan.com/films?showType=3&sortId=1&yearId=13"""
        header = head + host
        headers = str_to_dict(header)
        response = requests.get(url=url, headers=headers)
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        data_1 = soup.find_all('div', {'class': 'channel-detail movie-item-title'})
        data_2 = soup.find_all('div', {'class': 'channel-detail channel-detail-orange'})
        urls_lib = []
        num = 0
        num2 += 1
        for item in data_1:
            # time.sleep(6)
            num += 1
            url_1 = item.select('a')[0]['href']
            if data_2[num - 1].get_text() != '暫無評分':
                url = 'https://maoyan.com' + url_1
                urls_lib.append(url)
                with open('movies_url.txt', 'a+', encoding='utf-8') as f:
                    f.write(url + '\n')
        print('---- page {} done ----'.format(num2))
    print('Finished')


def get_moviescore(url):
    headers = str_to_dict(head)
    html = requests.get(url, headers=headers).text
    time.sleep(3)
    msg = {}

    dsoup = BeautifulSoup(html, 'lxml')
    msg['name'] = dsoup.find(class_='name').text
    ell = dsoup.find_all('li', {'class': 'ellipsis'})
    msg['type'] = ell[0].text
    msg['country'] = ell[1].text.split('/')[0].strip()
    if len(ell[1].text.split('/')) != 1:
        msg['length'] = ell[1].text.split('/')[1].strip()
    else:
        msg['length'] = ''
    msg['release-time'] = ell[2].text[:10]

    # Download this page's font file
    woff = regex_woff.search(html).group()
    wofflink = 'http:' + woff
    localname = os.path.join('font', os.path.basename(wofflink))
    if not os.path.exists(localname):
        downloads(wofflink, localname)
    font = TTFont(localname)

    # The spans contain private-use unicode characters that BeautifulSoup cannot
    # render, so extract them from the raw HTML with a regex
    ms = regex_text.findall(html)
    if len(ms) < 3:
        msg['score'] = '0'
        msg['score-num'] = '0'
        msg['box-office'] = '0'
    else:
        msg['score'] = get_fontnumber(font, ms[0])
        msg['score-num'] = get_fontnumber(font, ms[1])
        msg['box-office'] = get_fontnumber(font, ms[2]) + dsoup.find('span', class_='unit').text
    to_mysql(msg)


def to_mysql(msg):
    db = pymysql.connect(host='localhost', user='root', passwd='password', port=3306, db='maoyan')
    cursor = db.cursor()
    sql = 'insert into films values (%s, %s, %s, %s, %s, %s, %s, %s)'
    try:
        if cursor.execute(sql, (
                msg["name"], msg["type"], msg["country"], msg["length"], msg["release-time"], msg["score"],
                msg["score-num"], msg["box-office"]
        )):
            print("Successful")
            db.commit()
    except Exception:
        print('Failed')
        db.rollback()
    db.close()

# Replace each &#x....; entity in text with the digit whose glyph matches
def get_fontnumber(newfont, text):
    ms = regex_font.findall(text)
    for m in ms:
        text = text.replace(f'&#x{m};', get_num(newfont, f'uni{m.upper()}'))
    return text


# Find the digit whose glyph outline in base.woff matches this glyph
def get_num(newfont, name):
    uni = newfont['glyf'][name]
    for k, v in fontdict.items():
        if uni == basefont['glyf'][k]:
            return v


def downloads(url, localfn):
    with open(localfn, 'wb+') as sw:
        sw.write(requests.get(url).content)



def get_all():
    get_url()
    with open('movies_url.txt', 'r', encoding='utf-8') as f:
        num = 0
        for url in f:
            num += 1
            get_moviescore(url.split('\n')[0])
            print('Wrote record {}'.format(num))

get_all()
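
To make the decoding concrete, here is roughly what get_fontnumber does for one encoded span, assuming a page font where &#xF411; happens to draw '8' and &#xE50C; draws '1' (the real code points differ for every font; the file name is a placeholder):

font = TTFont('font/another.woff')  # hypothetical downloaded page font
raw = '&#xF411;.&#xE50C;'           # one match from regex_text
print(get_fontnumber(font, raw))    # -> '8.1'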
