此篇文章是根據https://mp.weixin.qq.com/s/rRtb8ATXrVxr3r5uLEhRtA這個文章的步驟進行爬取的。有興趣的可以直接到該作者的連接查看文章。
介紹一下我的裝備:
瀏覽器:谷歌瀏覽器
爬取步驟
一,到貓眼官網,電影選項,查看每個電影的連接。
選擇一個電影,右擊選擇“檢查(shift+ctrl+i)”,可以找到該電影的跳轉連接。
二、對電影詳情頁進行初步分析
右鍵檢查可以看到用戶評分、評分人數、累計票房都做了反爬蟲的限制。
在開發人員工具中可以看到數字前面的class屬性爲stonefont,所以通過Ctrl+F找到獲取文字編碼的URL
得到了文字的編碼之後,需要對其進行解碼。作者是用pycharm來進行爬取的,我這邊先試一下jupyter,表示無法安裝合適的第三方包,所以我還是選擇用pycharm。
因爲原文中省略了一些步驟,所以剛開始看的時候的還有點懵。後來仔細閱讀和嘗試了一下,知道下載woff的意思是將貓眼上面的woff下載下來。直接複製粘貼上圖中format('woff')前面的連接就可以下載下來了。
---------更新2019.01.25----------
按照作者之前的代碼操作,發現現在是無法運行的,所以借鑑了一下,然後又添加了自己的一些想法。
問題1:用BeautifulSoup無法爬取stonefont中的數據,爬取出來的都是空值。
解法:用正則表達式可以爬取。
問題2:每個網頁中的反爬文字下載鏈接都是不同的,會動態改變。而且每個woff文字對應的規則也不一致。
解法:可以將woff鏈接保存下來。但是怎麼將不同的woff規則對應到數字,暫無解決。
第一遍代碼:
import requests
import time
from fontTools.ttLib import TTFont
from bs4 import BeautifulSoup
import re
import lxml
import pymysql
# Classic films are showType=3, paged in steps of offset=30.
# Crawl films released in 2018.
# Listing URL selected in the browser: https://maoyan.com/films?showType=3&sortId=1&yearId=13 — the site shows 195 pages.
# Step 1: iterate the listing pages to collect each film's detail-page link.
# Raw browser request headers; str_to_dict() below turns this block into a dict for requests.
head = """
accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
cache-control: max-age=0
Host: maoyan.com
content-type: text/html; charset=utf-8
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36
"""
# Build request headers; each function can assemble a different header block.
def str_to_dict(header):
    """Parse a newline-separated "Key: Value" header string into a dict."""
    result = {}
    for line in header.split('\n'):
        line = line.strip()
        if not line:
            continue  # skip blank lines produced by the triple-quoted literal
        key, value = line.split(':', 1)
        result[key] = value.strip()
    return result
# Step 1: walk the paginated listing and save each rated film's detail-page URL.
def get_url():
    last_offset = 194 * 30 + 1  # 195 listing pages, 30 films per page
    page_count = 0
    # Open the output file ONCE; the original re-opened (and redundantly
    # closed inside `with`) the file for every single URL.
    with open('movies_url.txt', 'a+', encoding='utf-8') as f:
        for offset in range(0, last_offset, 30):
            url = 'https://maoyan.com/films?showType=3&sortId=1&yearId=13&offset=' + str(offset)
            host = """referer: https://maoyan.com/films?showType=3&sortId=1&yearId=13"""
            headers = str_to_dict(head + host)
            response = requests.get(url=url, headers=headers)
            soup = BeautifulSoup(response.text, 'html.parser')
            titles = soup.find_all('div', {'class': 'channel-detail movie-item-title'})
            ratings = soup.find_all('div', {'class': 'channel-detail channel-detail-orange'})
            page_count += 1
            # Pair each title div with its rating div; skip unrated films.
            for title, rating in zip(titles, ratings):
                link = title.select('a')[0]['href']
                if rating.get_text() != '暫無評分':
                    f.write('https://maoyan.com' + link + '\n')
            print('----已進行{}頁數據---'.format(page_count))
    print("已結束")
# Scrape one film detail page and return its fields as a dict.
def get_message(url):
    time.sleep(3)  # throttle so the site does not block us
    data = {}
    headers = str_to_dict(head)
    response = requests.get(url=url, headers=headers)
    u = response.text
    # Download link of the per-page anti-scraping webfont (protocol-relative URL).
    cmp = re.compile(",\n url\('(//.*.woff)'\) format\('woff'\)")
    rst = cmp.findall(u)
    # Obfuscated digits are wrapped in <span class="stonefont">.
    cmp2 = re.compile('<span class="stonefont">(.*?)</span>')
    scores = cmp2.findall(u)
    cmp3 = re.compile('<span class="stonefont">(.*?)</span><span class="unit">(.*?)</span>')
    price = cmp3.findall(u)
    soup = BeautifulSoup(u, "lxml")
    name = soup.find_all('h3', {'class': 'name'})
    ell = soup.find_all('li', {'class': 'ellipsis'})
    data["name"] = name[0].get_text()
    data["type"] = ell[0].get_text()
    data["country"] = ell[1].get_text().split('/')[0].strip().replace('\n', '')
    # Guard: the country/runtime line has no '/' when the runtime is missing.
    if '/' in ell[1].get_text():
        data["length"] = ell[1].get_text().split('/')[1].strip().replace('\n', '')
    else:
        data["length"] = ""
    data["released"] = ell[2].get_text()[:10]
    # Guards: some pages omit the rating, rating count, box office or woff link;
    # the unguarded scores[0]/scores[1]/price[0]/rst[0] crashed mid-crawl.
    data["score"] = scores[0] if scores else ""
    data["score_hum"] = scores[1] if len(scores) >= 2 else ""
    data["price"] = str(price[0]).replace('\'', '') if price else "暫無"
    data["woff"] = str(rst[0]).replace('\'', '') if rst else ""
    print(data)
    return data
def to_mysql(data):
    """Insert one film record (9 columns) into the MySQL `films` table."""
    db = pymysql.connect(host='localhost', user='root', passwd='password', port=3306, db='maoyan')
    try:
        cursor = db.cursor()
        sql = 'insert into films values (%s, %s, %s, %s, %s, %s, %s, %s, %s)'
        try:
            if cursor.execute(sql, (
                    data["name"], data["type"], data["country"], data["length"], data["released"],
                    data["score"], data["score_hum"], data["price"], data["woff"])):
                print("Successful")
                db.commit()
        # The original bare `except:` swallowed every error (even KeyboardInterrupt)
        # without saying what went wrong.
        except Exception as exc:
            print('Failed', exc)
            db.rollback()
    finally:
        # Close the connection even if commit/rollback itself raises.
        db.close()
def get_all():
    """Iterate the saved detail-page URLs, scrape each film and store it."""
    # get_url()  # uncomment to (re)build movies_url.txt first
    with open('movies_url.txt', 'r', encoding='utf-8') as f:
        # enumerate replaces the manual counter; `with` already closes the
        # file, so the original's explicit f.close() was redundant.
        for num, line in enumerate(f, start=1):
            data = get_message(line.rstrip('\n'))
            to_mysql(data)
            print('已寫入第{}條數據'.format(num))
get_all()
# url = 'https://maoyan.com/films/1229049'
#
# data = get_message(url)
# to_mysql(data)
# to_mysql(data)
# Convert the font rules: dump the downloaded woff to XML for inspection.
# font = TTFont('maoyan.woff')
# font.saveXML('maoyan.xml')
之前隨機選擇幾頁都未發現暫無票房的情況,所以直接運行了以上代碼,然後寫入到115條數據時報錯,出現了暫無票房的情況。故需要在裏面再添加判斷。
# Guard added after row 115 failed: some films report no box office.
if price:
    data["price"] = str(price[0]).replace('\'', '')
else:
    data["price"] = "暫無"
添加了票房判斷之後,再次爬取了155條數據,然後又出現問題了,居然還有沒有時長的。。。。。所以需要再次添加時長判斷。
# Guard added after row 155 failed: the country/runtime line has no '/'
# when the runtime is missing.
if '/' in ell[1].get_text():
    data["length"] = ell[1].get_text().split('/')[1].strip().replace('\n', '')
else:
    data["length"] = ""
再次爬取了81條數據,然後,居然沒有評價的人數。不會我想要的數據,都要加一次判斷吧。。。。。
# Guard added after row 81 failed: some pages omit the rating-count span.
if len(scores) == 2:
    data["score_hum"] = scores[1]
else:
    data["score_hum"] = ""
終於順利爬完了。
未轉化反爬蟲文字的電影詳情爬取。將完整代碼放一下。
import requests
import time
from fontTools.ttLib import TTFont
from bs4 import BeautifulSoup
import re
import lxml
import pymysql
# Classic films are showType=3, paged in steps of offset=30.
# Crawl films released in 2018.
# Listing URL selected in the browser: https://maoyan.com/films?showType=3&sortId=1&yearId=13 — the site shows 195 pages.
# Step 1: iterate the listing pages to collect each film's detail-page link.
# Raw browser request headers; str_to_dict() below turns this block into a dict for requests.
head = """
accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
cache-control: max-age=0
Host: maoyan.com
content-type: text/html; charset=utf-8
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36
"""
# Build request headers; each function can assemble a different header block.
def str_to_dict(header):
    """Convert a multi-line "Key: Value" header block into a dict for requests."""
    stripped = (line.strip() for line in header.split('\n'))
    pairs = (line.split(':', 1) for line in stripped if line)
    return {key: value.strip() for key, value in pairs}
# Step 1: walk the paginated listing and save each rated film's detail-page URL.
def get_url():
    last_offset = 194 * 30 + 1  # 195 listing pages, 30 films per page
    page_count = 0
    # Open the output file ONCE; the original re-opened (and redundantly
    # closed inside `with`) the file for every single URL.
    with open('movies_url.txt', 'a+', encoding='utf-8') as f:
        for offset in range(0, last_offset, 30):
            url = 'https://maoyan.com/films?showType=3&sortId=1&yearId=13&offset=' + str(offset)
            host = """referer: https://maoyan.com/films?showType=3&sortId=1&yearId=13"""
            headers = str_to_dict(head + host)
            response = requests.get(url=url, headers=headers)
            soup = BeautifulSoup(response.text, 'html.parser')
            titles = soup.find_all('div', {'class': 'channel-detail movie-item-title'})
            ratings = soup.find_all('div', {'class': 'channel-detail channel-detail-orange'})
            page_count += 1
            # Pair each title div with its rating div; skip unrated films.
            for title, rating in zip(titles, ratings):
                link = title.select('a')[0]['href']
                if rating.get_text() != '暫無評分':
                    f.write('https://maoyan.com' + link + '\n')
            print('----已進行{}頁數據---'.format(page_count))
    print("已結束")
# Scrape one film detail page and return its fields as a dict.
def get_message(url):
    time.sleep(3)  # throttle so the site does not block us
    data = {}
    headers = str_to_dict(head)
    response = requests.get(url=url, headers=headers)
    u = response.text
    # Download link of the per-page anti-scraping webfont (protocol-relative URL).
    cmp = re.compile(",\n url\('(//.*.woff)'\) format\('woff'\)")
    rst = cmp.findall(u)
    # Obfuscated digits are wrapped in <span class="stonefont">.
    cmp2 = re.compile('<span class="stonefont">(.*?)</span>')
    scores = cmp2.findall(u)
    cmp3 = re.compile('<span class="stonefont">(.*?)</span><span class="unit">(.*?)</span>')
    price = cmp3.findall(u)
    soup = BeautifulSoup(u, "lxml")
    name = soup.find_all('h3', {'class': 'name'})
    ell = soup.find_all('li', {'class': 'ellipsis'})
    data["name"] = name[0].get_text()
    data["type"] = ell[0].get_text()
    data["country"] = ell[1].get_text().split('/')[0].strip().replace('\n', '')
    # No '/' in the country/runtime line means the runtime is missing.
    if '/' in ell[1].get_text():
        data["length"] = ell[1].get_text().split('/')[1].strip().replace('\n', '')
    else:
        data["length"] = ""
    data["released"] = ell[2].get_text()[:10]
    # Guard: a page with no stonefont spans at all would crash on scores[0].
    data["score"] = scores[0] if scores else ""
    if len(scores) == 2:
        data["score_hum"] = scores[1]
    else:
        data["score_hum"] = ""
    if price:
        data["price"] = str(price[0]).replace('\'', '')
    else:
        data["price"] = "暫無"
    # Guard: some pages may not embed a woff link, which crashed rst[0].
    data["woff"] = str(rst[0]).replace('\'', '') if rst else ""
    print(data)
    return data
def to_mysql(data):
    """Insert one film record (9 columns) into the MySQL `films` table."""
    db = pymysql.connect(host='localhost', user='root', passwd='password', port=3306, db='maoyan')
    try:
        cursor = db.cursor()
        sql = 'insert into films values (%s, %s, %s, %s, %s, %s, %s, %s, %s)'
        try:
            if cursor.execute(sql, (
                    data["name"], data["type"], data["country"], data["length"], data["released"],
                    data["score"], data["score_hum"], data["price"], data["woff"])):
                print("Successful")
                db.commit()
        # The original bare `except:` hid the actual database error.
        except Exception as exc:
            print('Failed', exc)
            db.rollback()
    finally:
        # Always release the connection, even if commit/rollback raises.
        db.close()
def get_all():
    """Iterate the saved detail-page URLs, scrape each film and store it."""
    # get_url()  # uncomment to (re)build movies_url.txt first
    with open('movies_url.txt', 'r', encoding='utf-8') as f:
        # enumerate replaces the manual counter; `with` already closes the
        # file, so the original's explicit f.close() was redundant.
        for num, line in enumerate(f, start=1):
            data = get_message(line.rstrip('\n'))
            to_mysql(data)
            print('已寫入第{}條數據'.format(num))
get_all()
現在還有一個難點,就是怎麼用每個電影頁的文字規則,去替換爬取的反爬文字。
---------未完待續----------
用SQL查詢了一下,woff的下載鏈接一共有41條不同的規則。現在有一個笨辦法,就是在百度字體編輯器中,把41條規則都查詢一遍,然後按照對應的規則,將反爬文字轉化成正常文字。
之前一直未理解作者文章的中的文字轉化的原理和步驟,然後搜索了一下,看了這篇文字之後大致理解了一下,就是通過圖形判斷數字。Python爬蟲實例:爬取貓眼電影——破解字體反爬。仔細閱讀了Python爬蟲實例文章後,才發現這個裏面的思路,和操作纔是我實際需要的。這個比微信文章裏面的更容易理解和操作一些。
結合微信文章和爬蟲實例文章,綜合成一下代碼,親測可用成功。
import os
import time
import re
import requests
from fontTools.ttLib import TTFont
# from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import pymysql
# Raw browser request headers; str_to_dict() below turns this block into a dict.
head = """
accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
cache-control: max-age=0
Host: maoyan.com
content-type: text/html; charset=utf-8
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36
"""
# Local cache directory for the per-page woff font files.
os.makedirs('font', exist_ok=True)
# Extracts the protocol-relative .woff URL from the page's @font-face CSS.
regex_woff = re.compile("(?<=url\(').*\.woff(?='\))")
# Captures the obfuscated text inside <span class="stonefont">...</span>.
regex_text = re.compile('(?<=<span class="stonefont">).*?(?=</span>)')
# Captures the 4 hex digits of an &#xXXXX; character reference.
regex_font = re.compile('(?<=&#x).{4}(?=;)')
# Reference font whose glyphs were mapped to digits by hand — assumes a local
# base.woff downloaded beforehand; TODO confirm it exists before running.
basefont = TTFont('base.woff')
# glyph name in base.woff -> the digit that glyph draws.
fontdict = {'uniF411': '8', 'uniF043': '4', 'uniE50C': '1', 'uniE3FF': '9', 'uniEC2B': '2', 'uniF290': '5', 'uniEFC2': '3', 'uniE4B7': '0', 'uniF675': '6', 'uniF2EE': '7'}
def str_to_dict(header):
    """Turn a raw header string (one "Key: Value" per line) into a dict."""
    headers = {}
    for raw_line in header.split('\n'):
        entry = raw_line.strip()
        if entry:
            # split only on the first ':' so values may contain colons
            name, value = entry.split(':', 1)
            headers[name] = value.strip()
    return headers
# Step 1: walk the paginated listing and save each rated film's detail-page URL.
def get_url():
    last_offset = 194 * 30 + 1  # 195 listing pages, 30 films per page
    page_count = 0
    # Open the output file ONCE; the original re-opened (and redundantly
    # closed inside `with`) the file for every single URL.
    with open('movies_url.txt', 'a+', encoding='utf-8') as f:
        for offset in range(0, last_offset, 30):
            url = 'https://maoyan.com/films?showType=3&sortId=1&yearId=13&offset=' + str(offset)
            host = """referer: https://maoyan.com/films?showType=3&sortId=1&yearId=13"""
            headers = str_to_dict(head + host)
            response = requests.get(url=url, headers=headers)
            soup = BeautifulSoup(response.text, 'html.parser')
            titles = soup.find_all('div', {'class': 'channel-detail movie-item-title'})
            ratings = soup.find_all('div', {'class': 'channel-detail channel-detail-orange'})
            page_count += 1
            # Pair each title div with its rating div; skip unrated films.
            for title, rating in zip(titles, ratings):
                link = title.select('a')[0]['href']
                if rating.get_text() != '暫無評分':
                    f.write('https://maoyan.com' + link + '\n')
            print('----已進行{}頁數據---'.format(page_count))
    print("已結束")
def get_moviescore(url):
    """Scrape one film detail page, decode the font-obfuscated digits and store the row."""
    headers = str_to_dict(head)
    html = requests.get(url, headers=headers).text
    time.sleep(3)  # throttle between page fetches
    msg = {}
    # Parse once — the original built two identical BeautifulSoup objects
    # (`soup` and `dsoup`) from the same HTML.
    dsoup = BeautifulSoup(html, 'lxml')
    msg['name'] = dsoup.find(class_='name').text
    ell = dsoup.find_all('li', {'class': 'ellipsis'})
    msg['type'] = ell[0].text
    parts = ell[1].text.split('/')
    msg['country'] = parts[0].strip()
    # The country/runtime line has no '/' when the runtime is missing.
    msg['length'] = parts[1].strip() if len(parts) != 1 else ''
    msg['release-time'] = ell[2].text[:10]
    # Download this page's font file; every page may serve a different woff.
    match = regex_woff.search(html)
    if match is None:
        # Guard: .search() returning None crashed the original with AttributeError.
        print('no woff link found for', url)
        return
    wofflink = 'http:' + match.group()
    # os.path.join instead of the original Windows-only 'font\\' concatenation.
    localname = os.path.join('font', os.path.basename(wofflink))
    if not os.path.exists(localname):
        downloads(wofflink, localname)
    font = TTFont(localname)
    # The spans contain private-use unicode chars BeautifulSoup cannot render,
    # so grab them from the raw HTML with a regex.
    ms = regex_text.findall(html)
    if len(ms) < 3:
        msg['score'] = '0'
        msg['score-num'] = '0'
        msg['box-office'] = '0'
    else:
        msg['score'] = get_fontnumber(font, ms[0])
        msg['score-num'] = get_fontnumber(font, ms[1])
        msg['box-office'] = get_fontnumber(font, ms[2]) + dsoup.find('span', class_='unit').text
    to_mysql(msg)
def to_mysql(msg):
    """Insert one decoded film record (8 columns) into the MySQL `films` table."""
    db = pymysql.connect(host='localhost', user='root', passwd='password', port=3306, db='maoyan')
    try:
        cursor = db.cursor()
        sql = 'insert into films values (%s, %s, %s, %s, %s, %s, %s, %s)'
        try:
            if cursor.execute(sql, (
                    msg["name"], msg["type"], msg["country"], msg["length"], msg["release-time"],
                    msg["score"], msg["score-num"], msg["box-office"])):
                print("Successful")
                db.commit()
        # The original bare `except:` hid the actual database error.
        except Exception as exc:
            print('Failed', exc)
            db.rollback()
    finally:
        # Always release the connection, even if commit/rollback raises.
        db.close()
def get_fontnumber(newfont, text):
    """Replace each &#xXXXX; font reference in *text* with its decoded digit."""
    for code in regex_font.findall(text):
        decoded = get_num(newfont, 'uni' + code.upper())
        text = text.replace('&#x{};'.format(code), decoded)
    return text
def get_num(newfont, name):
    """Map glyph *name* from this page's font to a digit by matching its
    outline against the hand-labelled reference font.

    NOTE(review): falls through to an implicit None when no reference glyph
    matches — presumably never happens for maoyan's digit fonts; verify.
    """
    target = newfont['glyf'][name]
    for base_name, digit in fontdict.items():
        if basefont['glyf'][base_name] == target:
            return digit
def downloads(url, localfn):
    """Fetch *url* and write the raw bytes to *localfn*."""
    # timeout keeps a dead connection from hanging the whole crawl forever
    content = requests.get(url, timeout=30).content
    # 'wb' suffices — the original 'wb+' read-back mode was never used
    with open(localfn, 'wb') as fh:
        fh.write(content)
def get_all():
    """Build the URL list, then scrape and store every film in it."""
    get_url()
    with open('movies_url.txt', 'r', encoding='utf-8') as f:
        # enumerate replaces the manual counter; `with` already closes the
        # file, so the original's explicit f.close() was redundant. The
        # original also bound get_moviescore's None return to an unused local.
        for num, line in enumerate(f, start=1):
            get_moviescore(line.rstrip('\n'))
            print('已寫入第{}條數據'.format(num))
get_all()