import re

import pytesseract
import requests
from bs4 import BeautifulSoup
from PIL import Image
def get_data():
    """Fetch the listing page and extract the price-sprite data of the first listing.

    Requests the hard-coded listing URL, finds the first ``div.item`` inside
    ``div.Z_list-box`` and reads the inline ``style`` attribute of each of
    its ``span.num`` elements.

    Returns:
        tuple: ``(positions, uri)`` where ``positions`` is a list of floats
        (one per ``span.num``, taken from the last ``:``-separated field of
        the style with its final two characters removed -- presumably a
        ``px`` suffix of a background offset; verify against the live page)
        and ``uri`` is the text found between parentheses in the second
        ``:``-separated field of the last span's style (presumably the
        ``url(...)`` of the sprite image).

    Raises:
        requests.RequestException: if the page cannot be downloaded or the
            server answers with an error status.
        ValueError: if the expected elements or style fields are missing,
            or an offset is not a valid number.
    """
    # Request the page first to obtain the raw data.
    url = 'http://wh.ziroom.com/z/'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36'}
    # ``headers`` must be passed by keyword: the second positional parameter
    # of requests.get() is ``params``, which would send the User-Agent as a
    # query string instead of an HTTP header.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    html = BeautifulSoup(response.text, features='lxml')

    # Locate the container that holds the listings.
    list_box = html.find('div', attrs={'class': 'Z_list-box'})
    if list_box is None:
        raise ValueError('listing container "Z_list-box" not found in page')

    # Only the first listing is used here.
    item = list_box.find('div', attrs={'class': 'item'})
    if item is None:
        raise ValueError('no listing "item" found in page')

    # Collect the offsets and the image URI from the price digit spans.
    nums = item.find_all('span', attrs={'class': 'num'})
    if not nums:
        raise ValueError('no price digit spans ("num") found in first listing')

    positions = []
    uri = None
    for num in nums:
        style = num.get('style')
        if not style:
            raise ValueError('price digit span has no style attribute')
        fields = style.split(':')
        match = re.search(r'\((.*?)\)', fields[1]) if len(fields) > 1 else None
        if match is None:
            raise ValueError('no image URI found in style: {!r}'.format(style))
        uri = match.group(1)
        # Drop the trailing two characters (the unit suffix) before parsing.
        positions.append(float(fields[-1][:-2].strip()))
    return positions, uri
def parse_data(positions, uri, *, tesseract_cmd=r'D:\Tesseract-OCR\tesseract.exe',
               tessdata_dir=r'D:\Tesseract-OCR\tessdata', digit_width=21.4,
               image_path='img.png'):
    """Download the digit sprite, OCR it and decode the price from the offsets.

    Args:
        positions: iterable of numeric offsets, one per price digit.
        uri: location of the sprite image. A protocol-relative value
            (starting with ``//``) is prefixed with ``http:``.
        tesseract_cmd: path of the Tesseract executable handed to
            pytesseract (defaults to the original hard-coded location).
        tessdata_dir: directory passed to Tesseract via ``--tessdata-dir``.
        digit_width: divisor that converts an offset into an index into the
            recognised characters (presumably the pixel width of one digit
            cell in the sprite -- confirm against the actual image).
        image_path: file the downloaded image is written to.

    Returns:
        str: the decoded price. It is also printed, as is the list of
        recognised characters.

    Raises:
        requests.RequestException: if the image cannot be downloaded.
        IndexError: if an offset maps beyond the recognised characters.
    """
    # Download the PNG and store it on disk.
    image_url = 'http:' + uri if uri.startswith('//') else uri
    response = requests.get(image_url, timeout=10)
    response.raise_for_status()
    with open(image_path, 'wb') as f:
        f.write(response.content)

    # Point pytesseract at the Tesseract installation.
    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
    config = '--tessdata-dir "{}"'.format(tessdata_dir)

    # Read the image back, display it and run the recognition. The context
    # manager makes sure the underlying file handle is released.
    with Image.open(image_path) as img:
        img.show()
        text = pytesseract.image_to_string(img, lang='eng', config=config)

    # OCR output may contain extra whitespace (spaces, newlines); drop all
    # of it so that list indices line up with the characters in the image.
    n = [ch for ch in text if not ch.isspace()]
    print(n)

    # Map every offset to the index of the character it points at.
    indices = [int(abs(position / digit_width)) for position in positions]
    for index in indices:
        if index >= len(n):
            raise IndexError(
                'offset maps to character {} but OCR only recognised {}'.format(index, len(n))
            )
    price = ''.join(n[index] for index in indices)
    print(price)
    return price
if __name__ == '__main__':
    # Scrape the digit offsets and sprite URI, then decode them into a price.
    offsets, sprite_uri = get_data()
    parse_data(offsets, sprite_uri)
準確率很高,試了很多次,結果都對得上。運行效果:
不想寫怎麼分析,人!懶!該!打!具體分析可參見下面的鏈接,是前輩寫的分析,很全~~