ISBN識別
學校三級項目需要批量識別ISBN中的數字
實現的大致思路如下:
對原始圖片按尺寸自動調整大小,高斯濾波去噪,灰度化,二值化,邊緣檢測後做閉運算,查找最大輪廓,獲取最小外接矩形及旋轉角度,旋轉擺正圖片,水平投影,提取字符區域,最後用pytesseract識別字符。
項目代碼存放在三個文件中
1.工具類ocr_tool.py
import re
from difflib import SequenceMatcher
# Pull the digits out of a string.
def obtain_digit(data):
    """
    Return every digit found in ``data`` as one string, in original order.

    :param data: text to scan
    :return: the digit runs matched by ``\\d+`` concatenated together ('' if none)
    """
    return ''.join(re.findall(r"\d+", data))
# Score a recognised ISBN against the expected one.
def crct_digit_cnt(crct_isbn, recog_isbn):
    """
    Return the length of the longest contiguous block shared by both strings.

    :param crct_isbn: the expected ISBN digits
    :param recog_isbn: the digits produced by recognition
    :return: size of the longest common contiguous match (0 when nothing matches)
    """
    matcher = SequenceMatcher(None, crct_isbn, recog_isbn)
    longest = matcher.find_longest_match(0, len(crct_isbn), 0, len(recog_isbn))
    return longest.size
2.圖片預處理相關函數img_process_tool.py
from math import fabs, sin, radians, cos
import cv2 as cv
import numpy as np
def img_show_wait(img, window_name, duration=100):
    """
    Show an image in a named window, then call cv.waitKey.

    :param img: image to display
    :param window_name: name of the window the image is shown in
    :param duration: delay handed to cv.waitKey (milliseconds per the OpenCV
        docs). The default is 100, so by default the call does not block
        until a key is pressed.
    """
    cv.imshow(window_name, img)
    cv.waitKey(duration)
def get_projection_list(binary_img, direction='horizontal'):
    """
    Count the white (== 255) pixels along one direction of a binary image.

    :param binary_img: 2-D single-channel binary image
    :param direction: 'horizontal' returns one count per row; any other
        value returns one count per column
    :return: list of int counts (length = rows for 'horizontal', else columns)
    """
    # Vectorised replacement for a per-pixel Python loop; only pixels that are
    # exactly 255 count as white.
    white = np.asarray(binary_img) == 255
    # Summing along axis 1 collapses the columns and leaves one value per row.
    axis = 1 if direction == 'horizontal' else 0
    return np.count_nonzero(white, axis=axis).tolist()
def draw_projection(data_list, rows, cols, direction='horizontal'):
    """
    Render a projection profile as a black-on-white bar image, show it, and return it.

    :param data_list: projection counts to draw (one per row for 'horizontal',
        one per column otherwise)
    :param rows: number of rows of the source image
    :param cols: number of columns of the source image
    :param direction: 'horizontal' draws bars growing from the left edge; any
        other value draws bars growing up from the bottom edge
    :return: the rendered uint8 image of shape (rows, cols)
    """
    img_proj = np.ones(shape=(rows, cols), dtype=np.uint8) * 255
    horizontal = direction == 'horizontal'
    # default=0 covers an empty profile; a zero peak (blank image) would
    # otherwise divide by zero below, so the canvas is simply left blank.
    peak = max(data_list, default=0)
    if peak > 0:
        if horizontal:
            # Scale so that the largest count spans the full image width.
            weight = cols / peak
            for row in range(rows):
                pt1 = (0, row)
                pt2 = (int(weight * data_list[row]), row)
                cv.line(img_proj, pt1, pt2, (0,), 1)
        else:
            # Scale so that the largest count spans the full image height.
            weight = rows / peak
            for col in range(cols):
                pt1 = (col, rows - 1)
                pt2 = (col, rows - 1 - int(weight * data_list[col]))
                cv.line(img_proj, pt1, pt2, (0,), 1)
    img_show_wait(img_proj, 'horizontal projection' if horizontal else 'vertical projection')
    return img_proj
def split_projection_list(proj_list: list, min_val=0):
    """
    Split a projection profile into the intervals where values exceed a threshold.

    Each interval is reported as ``(start, end)`` where ``end`` is the index of
    the last entry above ``min_val`` in the run and ``start`` is the index of
    the last entry at or below ``min_val`` before the run (0 when the run
    begins at the head of the list).

    :param proj_list: projection counts
    :param min_val: entries strictly greater than this belong to a run
    :return: list of ``(start, end)`` index pairs, in order
    """
    start = 0
    end = None
    split_list = []
    for idx, value in enumerate(proj_list):
        if value > min_val:
            end = idx
            continue
        # A below-threshold entry closes the run that was open, if any.
        if end is not None:
            split_list.append((start, end))
            end = None
        start = idx
    # A run that reaches the end of the list has no closing entry; emit it
    # here so the last region is not silently dropped.
    if end is not None:
        split_list.append((start, end))
    return split_list
def img_rotate(img, degree):
    """
    Rotate an image about its centre on a canvas enlarged to fit the result.

    :param img: input image
    :param degree: rotation angle in degrees, passed to cv.getRotationMatrix2D
    :return: tuple ``(rotated image, 2x3 affine matrix that was applied)``;
        uncovered border pixels are filled with white (255, 255, 255)
    """
    src_h, src_w = img.shape[:2]
    theta = radians(degree)
    abs_sin = fabs(sin(theta))
    abs_cos = fabs(cos(theta))
    # Size of the axis-aligned bounding box of the rotated image.
    dst_h = int(src_w * abs_sin + src_h * abs_cos)
    dst_w = int(src_h * abs_sin + src_w * abs_cos)
    rotation = cv.getRotationMatrix2D((src_w // 2, src_h // 2), degree, 1)
    # Shift the result so it is centred on the enlarged canvas.
    rotation[0, 2] += (dst_w - src_w) // 2
    rotation[1, 2] += (dst_h - src_h) // 2
    rotated = cv.warpAffine(img, rotation, (dst_w, dst_h), borderValue=(255, 255, 255))
    return rotated, rotation
def draw_box(img, box):
    """
    Draw a four-sided box on an image in green with thickness 3.

    :param img: image to draw on (modified in place)
    :param box: flat sequence of 8 numbers, read as four (x, y) corner pairs
    :return: the same image object
    """
    # Offsets into ``box`` of the two corners joined by each edge.
    edges = ((0, 2), (2, 4), (0, 6), (4, 6))
    for a, b in edges:
        cv.line(img, (box[a], box[a + 1]), (box[b], box[b + 1]), (0, 255, 0), 3)
    return img
def img_resize(img_original):
    """
    Shrink an image by a factor chosen from its row count.

    More than 1300 rows -> 0.25, 751..1300 -> 0.5, 501..750 -> 0.75;
    images with 500 rows or fewer are returned untouched.

    :param img_original: input image
    :return: the resized image, or the input object itself when no resize applies
    """
    rows, _ = img_original.shape[:2]
    if rows > 1300:
        scale = 0.25
    elif rows > 750:
        scale = 0.5
    elif rows > 500:
        scale = 0.75
    else:
        return img_original
    return cv.resize(img_original, None, fx=scale, fy=scale, interpolation=cv.INTER_CUBIC)
def adaptive_threshold(gray, block_size=5, c=10, inv=False):
    """
    Binarise a grayscale image with cv.adaptiveThreshold (Gaussian method, max value 255).

    :param gray: input grayscale image
    :param block_size: block size argument forwarded to cv.adaptiveThreshold
    :param c: constant argument forwarded to cv.adaptiveThreshold
    :param inv: truthy selects cv.THRESH_BINARY_INV, otherwise cv.THRESH_BINARY
    :return: the binarised image
    """
    mode = cv.THRESH_BINARY_INV if inv else cv.THRESH_BINARY
    # The adaptive method derives the threshold from each pixel's neighbourhood
    # rather than using one global value.
    return cv.adaptiveThreshold(
        gray, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, mode, block_size, c)
def img_preprocess(img, kernel=None):
    """
    Run the preprocessing chain: Gaussian blur, grayscale conversion,
    inverted Otsu binarisation, Canny edge detection and morphological closing.

    The closed image is also displayed through img_show_wait.

    :param img: input BGR image
    :param kernel: structuring element for the closing step; a 5x5 rectangle
        is used when omitted
    :return: the closed edge image
    """
    if kernel is None:
        kernel = cv.getStructuringElement(cv.MORPH_RECT, (5, 5), (-1, -1))
    blurred = cv.GaussianBlur(img, (3, 3), 0)  # denoise
    gray = cv.cvtColor(blurred, cv.COLOR_BGR2GRAY)
    _, binary = cv.threshold(gray, 0, 255, cv.THRESH_BINARY_INV + cv.THRESH_OTSU)
    edges = cv.Canny(binary, 50, 50)
    # Closing = dilate then erode; merges nearby edge fragments into one region.
    closed = cv.morphologyEx(edges, cv.MORPH_CLOSE, kernel)
    img_show_wait(closed, 'original img close')
    return closed
if __name__ == '__main__':
    # Manual smoke test: preprocess one image and display its horizontal projection.
    path_to_img = r'D:\projects_python\workingon\isbnocr\pageocr\xxx.png'
    img_original = cv.imread(path_to_img)
    # cv.imread signals failure by returning None instead of raising, which
    # would otherwise surface later as an obscure error inside OpenCV.
    if img_original is None:
        raise FileNotFoundError(f'cannot read image: {path_to_img}')
    img_pre = img_preprocess(img_original)
    horizontal_projection_list = get_projection_list(img_pre, 'horizontal')
    rows, cols = img_pre.shape[:2]
    draw_projection(horizontal_projection_list, rows, cols, 'horizontal')
3.核心代碼isbnocr.py
import os
from os import listdir
import pytesseract
from img_process_tool import *
from ocr_tool import *
def img_isbn_area(img_original):
    """
    Deskew an image and cut out a horizontal band expected to hold the text.

    Steps: preprocess with a wide (100x5) closing kernel, take the largest
    contour, use the angle of its minimum-area rectangle to rotate the image,
    preprocess again, then split the horizontal projection into row bands.
    The last band that is not taller than half the image is returned; if no
    band qualifies the whole rotated image is returned.

    :param img_original: input BGR image
    :return: the cropped region of the rotated image, or None when the
        preprocessed image contains no contours at all
    """
    kernel = cv.getStructuringElement(cv.MORPH_RECT, (100, 5), (-1, -1))
    img_pre = img_preprocess(img_original, kernel)

    # findContours returns 2 or 3 values depending on the OpenCV version; the
    # contour list is the second-to-last element in both layouts.
    contours = cv.findContours(img_pre, cv.RETR_LIST, cv.CHAIN_APPROX_SIMPLE)[-2]
    if len(contours) == 0:
        # Nothing to measure a skew angle from (e.g. a blank image).
        return None

    # max() keeps the first contour among equals.
    largest = max(contours, key=cv.contourArea)
    angle = cv.minAreaRect(largest)[2]
    # NOTE(review): near-vertical angles are folded back by 90 degrees; the
    # angle range reported by minAreaRect depends on the OpenCV version -
    # confirm the 80 degree cut-off against the version in use.
    if angle > 80:
        angle -= 90
    img_rotated, _ = img_rotate(img_original, angle)
    img_rotated_pre = img_preprocess(img_rotated, kernel)

    row_bands = split_projection_list(get_projection_list(img_rotated_pre), 0)
    img_h, img_w = img_rotated_pre.shape[:2]
    top, height = 0, img_h  # fall back to the whole image
    for start, end in row_bands:
        if end - start > img_h * 0.5:
            continue  # too tall to be a single line of text
        if end <= start:
            continue  # zero-height band: nothing to show or recognise
        top, height = start, end - start
        img_show_wait(img_rotated[top:top + height, 0:img_w], 'roi')
        # NOTE(review): the original also ran a whole-image vertical
        # projection here whose result only fed a trailing `continue`, so it
        # could not influence the returned region; it is dropped. Confirm
        # upstream that the return was not meant to sit inside this loop.
    return img_rotated[top:top + height, 0:img_w]
def split_digits(img_text):
    """
    Cut a text region into column slices using its vertical projection.

    The binarised region and its projection plot are displayed along the way.

    :param img_text: BGR image of the text region
    :return: list of sub-images of ``img_text``, one per interval returned by
        split_projection_list
    """
    gray = cv.cvtColor(img_text, cv.COLOR_BGR2GRAY)
    _, binary = cv.threshold(gray, 0, 255, cv.THRESH_OTSU + cv.THRESH_BINARY_INV)
    img_show_wait(binary, 'isbn area bin')
    column_profile = get_projection_list(binary, 'vertical')
    height, width = img_text.shape[:2]
    draw_projection(column_profile, height, width, 'vertical')
    return [img_text[:, left:right] for left, right in split_projection_list(column_profile)]
def digit_recog(path_to_image):
    """
    Recognise the ISBN in a single image, print it, and display each character slice.

    :param path_to_image: absolute path of the image
    :return: None
    :raises FileNotFoundError: if the image cannot be read
    """
    img_original = cv.imread(path_to_image)
    # cv.imread returns None on failure instead of raising.
    if img_original is None:
        raise FileNotFoundError(f'cannot read image: {path_to_image}')
    isbn_area = img_isbn_area(img_resize(img_original))
    # Check before running OCR: pytesseract cannot take None.
    if isbn_area is None:
        print(f'【圖片{os.path.basename(path_to_image)}】未找到字符區域')
        return
    recog_isbn = obtain_digit(pytesseract.image_to_string(isbn_area))
    print(f'【圖片{os.path.basename(path_to_image)}】的識別結果爲:{recog_isbn}')
    for digit in split_digits(isbn_area):
        img_show_wait(digit, 'digit')
def digit_recog_batch(path_to_images):
    """
    Recognise the ISBN in every image of a folder and print accuracy statistics.

    The expected ISBN of each image is taken from the digits in its file name.
    Only images for which OCR produced at least one digit are counted.

    :param path_to_images: absolute path of the folder holding the images
    :return: None
    """
    img_cnt = 0  # images counted
    digit_cnt = 0  # expected digits over the counted images
    digit_recognized = 0  # digits matched (longest common run per image)
    isbn_recognized = 0  # images whose full expected ISBN was found
    for file in listdir(path_to_images):
        img_abs_path = os.path.join(path_to_images, file)
        img_basename = os.path.basename(img_abs_path)
        original_image = cv.imread(img_abs_path, cv.IMREAD_COLOR)
        # cv.imread returns None for unreadable or non-image entries; skip
        # them rather than aborting the whole batch.
        if original_image is None:
            print(f'【圖片{img_basename}】無法讀取,已跳過')
            continue
        isbn_area = img_isbn_area(img_resize(original_image))
        if isbn_area is None:
            continue
        recog_isbn = obtain_digit(pytesseract.image_to_string(isbn_area))
        print(f'【圖片{img_basename}】的識別結果爲:{recog_isbn}')
        if not recog_isbn:
            continue
        expected_isbn = obtain_digit(img_basename)
        img_cnt += 1
        digit_cnt += len(expected_isbn)
        digit_recognized += crct_digit_cnt(expected_isbn, recog_isbn)
        if expected_isbn in recog_isbn:
            isbn_recognized += 1
    print("正確識別的ISBN個數:" + str(isbn_recognized) + "/" + str(img_cnt))
    print("正確識別的數字個數:" + str(digit_recognized) + "/" + str(digit_cnt))
    # Guard the ratios: an empty folder or a run with no recognised image
    # would otherwise end in ZeroDivisionError.
    isbn_rate = isbn_recognized / img_cnt if img_cnt else 'N/A'
    digit_rate = digit_recognized / digit_cnt if digit_cnt else 'N/A'
    print("識別正確率:" + str(isbn_rate))
    print("識別準確率:" + str(digit_rate))
# TODO: character recognition could use a neural network or template matching.
if __name__ == "__main__":
    import sys

    tesseract_exe = r"D:\software\Tesseract-OCR\tesseract.exe"
    # Only override pytesseract's default command when this machine-specific
    # binary actually exists; otherwise leave the default lookup in place.
    if os.path.isfile(tesseract_exe):
        pytesseract.pytesseract.tesseract_cmd = tesseract_exe
    path_to_image = r'D:\projects_python\workingon\isbnocr\isbn_recognition\ISBN 978-7-5099-1125-9.png'
    path_to_images = r'D:\projects_python\workingon\isbnocr\isbn_recognition\images'
    # An optional first command-line argument replaces the default image folder.
    if len(sys.argv) > 1:
        path_to_images = sys.argv[1]
    # digit_recog(path_to_image)  # recognise a single image
    digit_recog_batch(path_to_images)  # recognise every image in the folder
這個項目,做下來感覺比較有參考價值的還是圖片預處理的思路和相關實現。至於字符識別,由於時間關係沒來得及寫,就用了比較粗陋的方法實現,後面有時間再改成模板匹配或者神經網絡識別字符。
項目的源代碼和數據集都同步到gitee上了,地址在這裏:isbn-ocr: 計算機視覺課程設計 識別ISBN中的數字 (gitee.com)