十三、學習分佈式爬蟲之字體反爬

字體反爬

字體反爬原理

  1. 網頁開發者自己創造一種字體。字體中的每個文字都有對應的代號(編碼),網頁源代碼中不直接寫出文字本身,而是寫它的代號,再由瀏覽器依照這個字體把代號渲染成文字。因此即使爬蟲獲取到了網頁中的文本內容,得到的也只是文字的代號,而不是文字本身。
  2. 因爲創造字體費時費力,並且如果把中國3000多個常用漢字全部實現,字體文件將達到幾十兆,會拖慢網頁的加載,所以一般情況下爲了反爬蟲,只會針對0-9以及少數漢字單獨創建字體,其他文字仍然使用用戶系統中自帶的字體。
    在這裏插入圖片描述
    在這裏插入圖片描述
    在這裏插入圖片描述
    58同城租房字體反爬實例
import io
import re
import base64
from fontTools.ttLib import TTFont
#pip install fontTools安裝模塊
import requests

#這裏的操作只是爲了將字體保存爲xml文件和ttf文件,用來做文字和形狀的映射關係
# font_face = "AAEAAAALAIAAAwAwR1NVQiCLJXoAAAE4AAAAVE9TLzL4XQjtAAABjAAAAFZjbWFwq8R/YwAAAhAAAAIuZ2x5ZuWIN0cAAARYAAADdGhlYWQYboylAAAA4AAAADZoaGVhCtADIwAAALwAAAAkaG10eC7qAAAAAAHkAAAALGxvY2ED7gSyAAAEQAAAABhtYXhwARgANgAAARgAAAAgbmFtZTd6VP8AAAfMAAACanBvc3QFRAYqAAAKOAAAAEUAAQAABmb+ZgAABLEAAAAABGgAAQAAAAAAAAAAAAAAAAAAAAsAAQAAAAEAAOIAgERfDzz1AAsIAAAAAADac6DKAAAAANpzoMoAAP/mBGgGLgAAAAgAAgAAAAAAAAABAAAACwAqAAMAAAAAAAIAAAAKAAoAAAD/AAAAAAAAAAEAAAAKADAAPgACREZMVAAObGF0bgAaAAQAAAAAAAAAAQAAAAQAAAAAAAAAAQAAAAFsaWdhAAgAAAABAAAAAQAEAAQAAAABAAgAAQAGAAAAAQAAAAEERAGQAAUAAAUTBZkAAAEeBRMFmQAAA9cAZAIQAAACAAUDAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFBmRWQAQJR2n6UGZv5mALgGZgGaAAAAAQAAAAAAAAAAAAAEsQAABLEAAASxAAAEsQAABLEAAASxAAAEsQAABLEAAASxAAAEsQAAAAAABQAAAAMAAAAsAAAABAAAAaYAAQAAAAAAoAADAAEAAAAsAAMACgAAAaYABAB0AAAAFAAQAAMABJR2lY+ZPJpLnjqeo59kn5Kfpf//AACUdpWPmTyaS546nqOfZJ+Sn6T//wAAAAAAAAAAAAAAAAAAAAAAAAABABQAFAAUABQAFAAUABQAFAAUAAAABwAFAAYABAAIAAMACgACAAEACQAAAQYAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADAAAAAAAiAAAAAAAAAAKAACUdgAAlHYAAAAHAACVjwAAlY8AAAAFAACZPAAAmTwAAAAGAACaSwAAmksAAAAEAACeOgAAnjoAAAAIAACeowAAnqMAAAADAACfZAAAn2QAAAAKAACfkgAAn5IAAAACAACfpAAAn6QAAAABAACfpQAAn6UAAAAJAAAAAAAAACgAPgBmAJoAvgDoASQBOAF+AboAAgAA/+YEWQYnAAoAEgAAExAAISAREAAjIgATECEgERAhIFsBEAECAez+6/rs/v3IATkBNP7S/sEC6AGaAaX85v54/mEBigGB/ZcCcwKJAAABAAAAAAQ1Bi4ACQAAKQE1IREFNSURIQQ1/IgBW/6cAicBWqkEmGe0oPp7AAEAAAAABCYGJwAXAAApATUBPgE1NCYjIgc1NjMyFhUUAgcBFSEEGPxSAcK6fpSMz7y389Hym9j+nwLGqgHButl0hI2wx43iv5D+69b+pwQAAQAA/+YEGQYnACEAABMWMzI2NRAhIzUzIBE0ISIHNTYzMhYVEAUVHgEVFAAjIiePn8igu/5bgXsBdf7jo5CYy8bw/sqow/7T+tyHAQN7nYQBJqIBFP9uuVjPpf7QVwQSyZbR/wBSAAACAAAAAARoBg0ACgASAAABIxEjESE1ATMRMyERNDcjBgcBBGjGvv0uAq3jxv58BAQOLf4zAZL+bgGSfwP8/CACiUVaJ
lH9TwABAAD/5gQhBg0AGAAANxYzMjYQJiMiBxEhFSERNjMyBBUUACEiJ7GcqaDEx71bmgL6/bxXLPUBEv7a/v3Zbu5mswEppA4DE63+SgX42uH+6kAAAAACAAD/5gRbBicAFgAiAAABJiMiAgMzNjMyEhUUACMiABEQACEyFwEUFjMyNjU0JiMiBgP6eYTJ9AIFbvHJ8P7r1+z+8wFhASClXv1Qo4eAoJeLhKQFRj7+ov7R1f762eP+3AFxAVMBmgHjLfwBmdq8lKCytAAAAAABAAAAAARNBg0ABgAACQEjASE1IQRN/aLLAkD8+gPvBcn6NwVgrQAAAwAA/+YESgYnABUAHwApAAABJDU0JDMyFhUQBRUEERQEIyIkNRAlATQmIyIGFRQXNgEEFRQWMzI2NTQBtv7rAQTKufD+3wFT/un6zf7+AUwBnIJvaJLz+P78/uGoh4OkAy+B9avXyqD+/osEev7aweXitAEohwF7aHh9YcJlZ/7qdNhwkI9r4QAAAAACAAD/5gRGBicAFwAjAAA3FjMyEhEGJwYjIgA1NAAzMgAREAAhIicTFBYzMjY1NCYjIga5gJTQ5QICZvHD/wABGN/nAQT+sP7Xo3FxoI16pqWHfaTSSgFIAS4CAsIBDNbkASX+lf6l/lP+MjUEHJy3p3en274AAAAAABAAxgABAAAAAAABAA8AAAABAAAAAAACAAcADwABAAAAAAADAA8AFgABAAAAAAAEAA8AJQABAAAAAAAFAAsANAABAAAAAAAGAA8APwABAAAAAAAKACsATgABAAAAAAALABMAeQADAAEECQABAB4AjAADAAEECQACAA4AqgADAAEECQADAB4AuAADAAEECQAEAB4A1gADAAEECQAFABYA9AADAAEECQAGAB4BCgADAAEECQAKAFYBKAADAAEECQALACYBfmZhbmdjaGFuLXNlY3JldFJlZ3VsYXJmYW5nY2hhbi1zZWNyZXRmYW5nY2hhbi1zZWNyZXRWZXJzaW9uIDEuMGZhbmdjaGFuLXNlY3JldEdlbmVyYXRlZCBieSBzdmcydHRmIGZyb20gRm9udGVsbG8gcHJvamVjdC5odHRwOi8vZm9udGVsbG8uY29tAGYAYQBuAGcAYwBoAGEAbgAtAHMAZQBjAHIAZQB0AFIAZQBnAHUAbABhAHIAZgBhAG4AZwBjAGgAYQBuAC0AcwBlAGMAcgBlAHQAZgBhAG4AZwBjAGgAYQBuAC0AcwBlAGMAcgBlAHQAVgBlAHIAcwBpAG8AbgAgADEALgAwAGYAYQBuAGcAYwBoAGEAbgAtAHMAZQBjAHIAZQB0AEcAZQBuAGUAcgBhAHQAZQBkACAAYgB5ACAAcwB2AGcAMgB0AHQAZgAgAGYAcgBvAG0AIABGAG8AbgB0AGUAbABsAG8AIABwAHIAbwBqAGUAYwB0AC4AaAB0AHQAcAA6AC8ALwBmAG8AbgB0AGUAbABsAG8ALgBjAG8AbQAAAAIAAAAAAAAAFAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACwECAQMBBAEFAQYBBwEIAQkBCgELAQwAAAAAAAAAAAAAAAAAAAAA"
# b = base64.b64decode(font_face) #將字體進行base64解析
# with open('58sourse.ttf','wb') as f:
#     f.write(b) #保存字體
# TTFont('58sourse.ttf').saveXML('58sourse.xml')  #讀取剛保存的ttf字體並保存成xml文件

# Load the reference font that was saved to disk beforehand; its glyph
# outlines act as the known-good shapes for the digits 0-9.
base_font = TTFont('58sourse.ttf')
# The 'glyf' table holds the outline (shape) of every glyph in the font.
glyf = base_font['glyf']

# Digit -> reference outline. In this font the digits 0..9 are stored under
# the glyph names glyph00001..glyph00010, i.e. digit n lives in glyph n + 1.
base_font_map = {
    digit: glyf['glyph{:05d}'.format(digit + 1)]
    for digit in range(10)
}

# Download the listing page and extract the font embedded in it; that font
# provides the code -> glyph name -> outline mapping used for this response.
url = 'https://haikou.58.com/chuzu/'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36',
}
# Without a timeout requests waits indefinitely on a stalled connection.
resp = requests.get(url, headers=headers, timeout=10)
text = resp.text  # raw page source

# The font is inlined in the page CSS as a base64 payload in @font-face.
font_match = re.search(r"@font-face.+?base64,(.+?)'\)", text)
if font_match is None:
    # Fail with a readable message instead of AttributeError on None.group().
    raise RuntimeError(
        'no base64 @font-face font found in the response '
        '(HTTP status {})'.format(resp.status_code)
    )
font_face = font_match.group(1)
# Parse the decoded font from an in-memory buffer; this avoids writing it to
# disk and reading it back just to hand it to TTFont.
font_bytes = io.BytesIO(base64.b64decode(font_face))
current_font = TTFont(font_bytes)

# Translate every obfuscated character reference in the page back to a digit.
# getBestCmap() yields a {code point: glyph name} dict and the 'glyf' table
# yields the outline for a glyph name; an outline equal to one of the
# reference outlines in base_font_map identifies the digit.
code_name_map = current_font.getBestCmap()
current_glyf = current_font['glyf']
for code, name in code_name_map.items():
    current_shape = current_glyf[name]
    for number, shape in base_font_map.items():
        if current_shape == shape:
            # hex(0x9476) is '0x9476'; dropping the leading '0' and adding
            # '&#' / ';' produces the token as written in the HTML: '&#x9476;'.
            webcode = '&#' + hex(code)[1:] + ';'
            # The token is a fixed string, so a literal replace is sufficient
            # and avoids treating data as a regular expression.
            text = text.replace(webcode, str(number))
            # Once replaced, no occurrences remain for a later match.
            break

# Save the decoded page.
with open('58.html', 'w', encoding='utf-8') as f:
    f.write(text)

實習僧字體反爬實例

import base64
import io
from fontTools.ttLib import TTFont
import requests
import re

# Base64 payload of the reference font.
# NOTE(review): the payload is empty in this listing; the real font data
# presumably has to be pasted in before the font can be parsed - confirm.
font_face = ""

# Decode the payload and parse the font straight from memory.
# To inspect the font offline, write the decoded bytes to 'shixi.ttf', or
# call base_font.saveXML('shixi.xml') once the font is loaded.
font_bytes = io.BytesIO(base64.b64decode(font_face))
base_font = TTFont(font_bytes)
# The 'glyf' table holds the outline (shape) of every glyph in the font.
glyf = base_font['glyf']

# Digit -> reference outline; the digits 0..9 are stored under the glyph
# names uni30..uni39.
base_font_map = {
    digit: glyf['uni3{}'.format(digit)]
    for digit in range(10)
}

# Download the job posting page and extract the font embedded in it.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36',
}
url = 'https://www.shixiseng.com/intern/inn_njckxepejl1q'

# Without a timeout requests waits indefinitely on a stalled connection.
resp = requests.get(url, headers=headers, timeout=10)
text = resp.text
print(text)  # dump the raw page source for inspection

# The font is inlined in the CSS rule for the 'myFont' family as base64.
current_font_match = re.search(r'font-family:myFont.+?base64,(.+?)"\)', text)
if current_font_match is None:
    # Fail with a readable message instead of AttributeError on None.group().
    raise RuntimeError(
        'no base64 myFont font found in the response '
        '(HTTP status {})'.format(resp.status_code)
    )
current_font_face = current_font_match.group(1)
# Parse the decoded font from an in-memory buffer.
current_font_bytes = io.BytesIO(base64.b64decode(current_font_face))
current_font = TTFont(current_font_bytes)
# Translate every obfuscated character reference in the page back to a digit.
# getBestCmap() yields a {code point: glyph name} dict and the 'glyf' table
# yields the outline for a glyph name; an outline equal to one of the
# reference outlines in base_font_map identifies the digit.
code_name_map = current_font.getBestCmap()
current_glyf = current_font['glyf']
for code, name in code_name_map.items():
    current_shape = current_glyf[name]
    for number, shape in base_font_map.items():
        if current_shape == shape:
            # hex(0xe066) is '0xe066'; the token replaced is '&#xe066'.
            # NOTE(review): no trailing ';' is included, presumably because
            # this site writes its references without one - confirm.
            webcode = '&#' + hex(code)[1:]
            # The token is a fixed string, so a literal replace is sufficient
            # and avoids treating data as a regular expression.
            text = text.replace(webcode, str(number))
            # Once replaced, no occurrences remain for a later match.
            break

# Save the decoded page.
with open('shixi1.html', 'w', encoding='utf-8') as f:
    f.write(text)

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章