題記
本文記錄爬取貓眼電影國內票房榜單的過程,並說明腳本中用於處理字體文件反爬的函數。
環境
系統: Windows 10
Python版本: Python 3.7
爬取時間: 2019.3.19
難點說明
爬取貓眼電影的過程中,發現票房數據出現了亂碼;經百度搜索相關信息、閱讀多篇文章後,才知道這是字體文件反爬所致,也體會到爬蟲的博大精深。解決思路如下:
- 下載一個基本字體文件,找到其中各個數字及其對應的編碼
- 每一次爬取網頁時,都要先下載該網頁的字體文件,然後與基本字體文件作對比,獲得爬取網頁的數字對應編碼。
本次主要參考了: Python爬蟲雜記 - 字體文件反爬(二)
代碼說明
from fontTools.ttLib import TTFont
def get_maoyan_dict(self, url):
    """Build the glyph-code -> digit mapping for the page's current font.

    Downloads the font at ``url`` to ``maoyan.woff`` and compares its glyphs
    with the digit glyphs of the local reference font ``basefont.woff``.

    Args:
        url: Download link of the font file used by the page being scraped.

    Returns:
        dict mapping each matched glyph's name, with its first three
        characters removed and lower-cased, to the digit character whose
        reference glyph it equals.

    Raises:
        requests.HTTPError: if the font download returns an error status.
    """
    # Download the font file served with the current page. The status is
    # checked so an error page is never written to disk and parsed as a font.
    font_woff = requests.get(url, timeout=10)
    font_woff.raise_for_status()
    with open('maoyan.woff', 'wb') as w:
        w.write(font_woff.content)

    # Parse the reference font with fontTools; base_num[k] is the digit drawn
    # by the reference glyph named base_code[k].
    base_font = TTFont('basefont.woff')
    base_num = ['8', '7', '9', '0', '1', '5', '4', '6', '3', '2']
    base_code = ['uniF860', 'uniF408', 'uniEF2B', 'uniF875', 'uniE03A', 'uniEA55', 'uniEE0E', 'uniF7A4', 'uniE3B1', 'uniF813']
    # Look the reference glyphs up once instead of once per comparison.
    base_glyphs = [(base_font['glyf'][code], num)
                   for code, num in zip(base_code, base_num)]

    # Compare the freshly downloaded font against the reference font.
    onlineFonts = TTFont('maoyan.woff')
    # The first and last glyph names are skipped; presumably they are
    # non-digit placeholder glyphs -- verify against the actual font.
    uni_list = onlineFonts.getGlyphNames()[1:-1]
    temp = {}
    # Iterate over whatever glyphs exist rather than assuming exactly ten.
    for glyph_name in uni_list:
        online_glyph = onlineFonts['glyf'][glyph_name]
        for base_glyph, digit in base_glyphs:
            if online_glyph == base_glyph:
                temp[glyph_name[3:].lower()] = digit
                break
    return temp
源碼
該源碼經實際測試可行,但待優化的地方極多
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import struct
import zlib
import re
from fontTools.ttLib import TTFont
class get_maoyan():
    """Scraper for the Maoyan box-office board at ``self.url``.

    Box-office figures on the page are drawn with the page's own font file,
    so their text has to be translated back to digits.  The translation table
    is rebuilt on every run by comparing the page's font with the local
    reference font ``basefont.woff``.
    """

    # Seconds to wait for each HTTP request before giving up, so a stalled
    # connection cannot hang the script forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Headers sent with the board page request (includes a hard-coded
        # Cookie value).
        self.header = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                       "Accept-Encoding": "gzip, deflate, br",
                       "Accept-Language": "zh-CN,zh;q=0.9",
                       "Cache-Control": "max-age=0",
                       "Connection": "keep-alive",
                       "Cookie": "__mta=250379193.1530718965618.1552724185734.1552724186874.62; _"
                                 "lxsdk_cuid=16465f5cf2dc8-0994698fdc2ac7-16386950-fa000-16465f5cf2dc8; "
                                 "uuid_n_v=v1; uuid=7C57E28047C311E991B9E9D2BD010A06F94D2F7FE0214E68BD819AD9CE2300A3; _"
                                 "csrf=c2cda66b44c7a9967419cf9f1380acb41f32261f8158b5d047a141f9beefb3a3; _"
                                 "lxsdk=7C57E28047C311E991B9E9D2BD010A06F94D2F7FE0214E68BD819AD9CE2300A3; __"
                                 "mta=250379193.1530718965618.1552724158451.1552724183308.61; _"
                                 "lxsdk_s=1698590d4c2-02c-2b-16a%7C%7C45",
                       "Host": "maoyan.com",
                       "Referer": "https://maoyan.com/board/6",
                       "Upgrade-Insecure-Requests": "1",
                       "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 "
                                     "(KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36"}
        self.url = 'https://maoyan.com/board/1'
        # Glyph hex code -> digit; filled in by get_page_info().
        self.maoyan_dict = dict()

    def get_page_info(self):
        """Fetch the board page and print every listed movie's details.

        Returns:
            ``-1`` if the page request does not return HTTP 200; otherwise
            ``None`` after all entries have been printed.

        Raises:
            ValueError: if the page has no ``<style>`` tag, no ``.woff`` font
                URL in it, or no ``.board-wrapper`` element.
        """
        repon = requests.get(self.url, headers=self.header,
                             timeout=self.REQUEST_TIMEOUT)
        if repon.status_code != 200:
            return -1
        soup = BeautifulSoup(repon.text, 'lxml')

        # The font file path is read from the first <style> tag of the page.
        styles = soup.select('style')
        if not styles:
            raise ValueError('page has no <style> tag; cannot locate the font file')
        font_url = self._extract_font_url(styles[0].text)
        self.maoyan_dict = self.get_maoyan_dict(font_url)

        boards = soup.select('.board-wrapper')
        if not boards:
            raise ValueError('no .board-wrapper element found in the page')
        for movie in boards[0].select('dd'):
            movie_name = movie.select('.movie-item-info')[0].select('.name')[0].text
            # Drops the first three characters, presumably a fixed label
            # prefix -- verify against the page markup.
            movie_star = movie.select('.star')[0].text[3:]
            movie_date = movie.select('.releasetime')[0].text
            realtime = movie.select('.realtime')[0]
            real_money = realtime.select('.stonefont')[0].text
            # The unit is taken as the second-to-last character of the
            # element text (presumably followed by one trailing character).
            real_unit = realtime.text[-2]
            total = movie.select('.total-boxoffice')[0]
            total_money = total.select('.stonefont')[0].text
            total_unit = total.text[-2]
            print('電影: ', movie_name)
            print('主演: ', movie_star)
            print('上映時間: ', movie_date)
            print('實時票房: ', self.convert_boxoffice(real_money), real_unit)
            print('總票房: ', self.convert_boxoffice(total_money), total_unit)

    @staticmethod
    def _extract_font_url(style_text):
        """Return the ``http:`` URL of the .woff font referenced in ``style_text``.

        When several lines reference a .woff file, the last one wins.

        Raises:
            ValueError: if no line contains a ``//vfile...woff`` reference.
        """
        font_url = None
        for line in style_text.split('\n'):
            match = re.search(r'//vfile.*?woff', line)
            if match:
                font_url = 'http:' + match.group(0)
        if font_url is None:
            raise ValueError('no .woff font URL found in the page <style> block')
        return font_url

    def get_maoyan_dict(self, url):
        """Build the glyph-code -> digit mapping for the page's current font.

        Downloads the font at ``url`` to ``maoyan.woff`` and compares its
        glyphs with the digit glyphs of the local reference font
        ``basefont.woff``.

        Args:
            url: Download link of the font file used by the scraped page.

        Returns:
            dict mapping each matched glyph's name, with its first three
            characters removed and lower-cased, to the digit character whose
            reference glyph it equals.

        Raises:
            requests.HTTPError: if the font download returns an error status.
        """
        # Check the status so an error page is never saved and parsed as a font.
        font_woff = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        font_woff.raise_for_status()
        with open('maoyan.woff', 'wb') as w:
            w.write(font_woff.content)

        # base_num[k] is the digit drawn by the reference glyph base_code[k].
        base_font = TTFont('basefont.woff')
        base_num = ['8', '7', '9', '0', '1', '5', '4', '6', '3', '2']
        base_code = ['uniF860', 'uniF408', 'uniEF2B', 'uniF875', 'uniE03A', 'uniEA55', 'uniEE0E', 'uniF7A4', 'uniE3B1', 'uniF813']
        # Look the reference glyphs up once instead of once per comparison.
        base_glyphs = [(base_font['glyf'][code], num)
                       for code, num in zip(base_code, base_num)]

        onlineFonts = TTFont('maoyan.woff')
        # The first and last glyph names are skipped; presumably they are
        # non-digit placeholder glyphs -- verify against the actual font.
        uni_list = onlineFonts.getGlyphNames()[1:-1]
        temp = {}
        # Iterate over whatever glyphs exist rather than assuming exactly ten.
        for glyph_name in uni_list:
            online_glyph = onlineFonts['glyf'][glyph_name]
            for base_glyph, digit in base_glyphs:
                if online_glyph == base_glyph:
                    temp[glyph_name[3:].lower()] = digit
                    break
        return temp

    def convert_boxoffice(self, money):
        """Translate obfuscated box-office text into readable digits.

        Each character whose four-digit lower-case hex code point is a key of
        ``self.maoyan_dict`` is replaced by the mapped digit; every other
        character (decimal point, ordinary text, unmapped glyphs) is kept
        unchanged.

        Args:
            money: Text of a ``.stonefont`` element.

        Returns:
            The text with all mapped characters replaced by their digits.
        """
        # Work on code points directly instead of parsing repr() output, so
        # quotes, backslashes and control characters pass through untouched.
        return ''.join(
            self.maoyan_dict.get(f'{ord(char):04x}', char) for char in money
        )
if __name__ == '__main__':
    # Script entry point: create the scraper and print the current board.
    get_maoyan().get_page_info()