ISBN數字識別

ISBN識別

學校三級項目需要批量識別ISBN中的數字

實現的大致思路如下:

對原始圖片按尺寸自動調整大小,高斯濾波去噪,灰度化,二值化,邊緣檢測後閉操作,查找最大輪廓,獲取最小外接矩形及旋轉角度,旋轉擺正圖片,水平投影,提取字符區域,用pytesseract識別字符

項目代碼存放在三個文件中

1.工具類ocr_tool.py

import re
from difflib import SequenceMatcher


# Extract the digits contained in a string.
def obtain_digit(data):
    """Return every decimal digit of ``data``, concatenated in original order.

    :param data: input text, e.g. an OCR result or a file name
    :return: string consisting only of the digits found in ``data``
             (empty string when there are none)
    """
    # Dropping every non-digit character leaves exactly the digit runs, joined.
    return re.sub(r"\D", "", data)


# Count the correctly recognised digits.
def crct_digit_cnt(crct_isbn, recog_isbn):
    """Count how many digits of the expected ISBN appear, in order, in the OCR result.

    The count is the total size of all in-order matching blocks between the two
    strings, so a single misread digit in the middle costs one digit instead of
    discarding everything on the shorter side of the error (which is what
    counting only the longest contiguous match did).

    :param crct_isbn: the expected (ground-truth) digit string
    :param recog_isbn: the digit string produced by recognition
    :return: number of matched digits, between 0 and ``len(crct_isbn)``
    """
    # autojunk is disabled so that long OCR outputs (>= 200 characters) do not
    # have their frequently occurring digits ignored by difflib's heuristic.
    matcher = SequenceMatcher(None, crct_isbn, recog_isbn, autojunk=False)
    return sum(block.size for block in matcher.get_matching_blocks())

2.圖片預處理相關函數img_process_tool.py

from math import fabs, sin, radians, cos

import cv2 as cv
import numpy as np


def img_show_wait(img, window_name, duration=100):
    """
    Show an image in a named window, then wait for a key event.

    :param img: image to display
    :param window_name: name of the window the image is shown in
    :param duration: value passed to ``cv.waitKey`` (a delay in milliseconds
        per OpenCV's API, where 0 or less waits for a key press). The default
        is 100, so by default the call does NOT block until a key is pressed.
    """
    cv.imshow(window_name, img)
    cv.waitKey(duration)


def get_projection_list(binary_img, direction='horizontal'):
    """
    Count the white (value 255) pixels of a binary image per row or per column.

    The count is done with a vectorised numpy reduction instead of a per-pixel
    Python loop, and only the requested projection is computed.

    :param binary_img: single-channel (2-D) binary image
    :param direction: 'horizontal' yields one count per row; any other value
        yields one count per column
    :return: list of int counts (length = image height for 'horizontal',
        image width otherwise)
    """
    white_mask = np.asarray(binary_img) == 255  # only pure-white pixels count
    # Reducing along axis 1 collapses the columns (per-row counts);
    # reducing along axis 0 collapses the rows (per-column counts).
    axis = 1 if direction == 'horizontal' else 0
    return white_mask.sum(axis=axis).tolist()


def draw_projection(data_list, rows, cols, direction='horizontal'):
    """
    Draw a projection histogram as black bars on a white canvas, show it and return it.

    :param data_list: projection counts (one per row for 'horizontal',
        one per column otherwise)
    :param rows: number of rows of the source image (canvas height)
    :param cols: number of columns of the source image (canvas width)
    :param direction: 'horizontal' draws bars growing rightwards from the left
        edge; any other value draws bars growing upwards from the bottom edge
    :return: the rendered projection image (uint8, ``rows`` x ``cols``)
    """
    img_proj = np.ones(shape=(rows, cols), dtype=np.uint8) * 255
    # An empty or all-zero projection (e.g. a blank image) has no peak to scale
    # against; fall back to a zero scale instead of dividing by zero, which
    # would produce NaN and make int() raise.
    peak = np.max(data_list) if len(data_list) else 0
    if direction == 'horizontal':
        # Scale so that the largest count spans the full canvas width.
        scale = cols / peak if peak else 0
        for row in range(rows):
            bar_start = (0, row)
            bar_end = (int(scale * data_list[row]), row)
            cv.line(img_proj, bar_start, bar_end, (0,), 1)
        img_show_wait(img_proj, 'horizontal projection')
    else:
        # Scale so that the largest count spans the full canvas height.
        scale = rows / peak if peak else 0
        for col in range(cols):
            bar_start = (col, rows - 1)
            bar_end = (col, rows - 1 - int(scale * data_list[col]))
            cv.line(img_proj, bar_start, bar_end, (0,), 1)
        img_show_wait(img_proj, 'vertical projection')
    return img_proj


def split_projection_list(proj_list: list, min_val=0):
    """
    Split a projection into the index ranges where the counts exceed a threshold.

    Each returned tuple is ``(start, end)`` where ``end`` is the index of the
    last entry of the run and ``start`` is the index of the last entry at or
    below the threshold that precedes the run (or 0 when the run begins at the
    very first entry).

    A run that extends to the end of the list is reported as well; previously
    such a trailing run was silently dropped.

    :param proj_list: projection counts
    :param min_val: threshold; entries strictly greater than it belong to a run
    :return: list of ``(start, end)`` tuples, in order of appearance
    """
    start = 0
    end = None  # index of the last above-threshold entry of the current run
    split_list = []
    for idx, value in enumerate(proj_list):
        if value > min_val:
            end = idx
            continue
        if end is not None:
            # The run has just ended: record it and reset.
            split_list.append((start, end))
            end = None
        start = idx
    if end is not None:
        # Flush the run that was still open when the data ended.
        split_list.append((start, end))
    return split_list


def img_rotate(img, degree):
    """
    Rotate an image about its centre on a canvas enlarged to fit the result.

    :param img: input image
    :param degree: rotation angle in degrees (passed to ``cv.getRotationMatrix2D``)
    :return: tuple ``(rotated_image, rotation_matrix)``; the matrix is the
        affine transform that was applied, including the shift onto the
        enlarged canvas. Uncovered border pixels are filled with white.
    """
    src_h, src_w = img.shape[:2]
    theta = radians(degree)
    abs_sin = fabs(sin(theta))
    abs_cos = fabs(cos(theta))
    # Size of the bounding box of the rotated image.
    dst_h = int(src_w * abs_sin + src_h * abs_cos)
    dst_w = int(src_h * abs_sin + src_w * abs_cos)
    rotation = cv.getRotationMatrix2D((src_w // 2, src_h // 2), degree, 1)
    # Shift the result so it is centred on the enlarged canvas.
    rotation[0, 2] += (dst_w - src_w) // 2
    rotation[1, 2] += (dst_h - src_h) // 2
    rotated = cv.warpAffine(img, rotation, (dst_w, dst_h), borderValue=(255, 255, 255))
    return rotated, rotation


def draw_box(img, box):
    """
    Draw a four-sided box onto an image (in place) and return the image.

    :param img: image to draw on
    :param box: flat sequence of 8 coordinates ``x0, y0, x1, y1, x2, y2, x3, y3``
    :return: the same image, with the box drawn in colour (0, 255, 0), thickness 3
    """
    corners = [(box[i], box[i + 1]) for i in range(0, 8, 2)]
    # Edges as pairs of corner indices: 0-1, 1-2, 0-3, 2-3.
    for a, b in ((0, 1), (1, 2), (0, 3), (2, 3)):
        cv.line(img, corners[a], corners[b], (0, 255, 0), 3)
    return img


def img_resize(img_original):
    """
    Shrink an image by a factor chosen from its height.

    Heights above 1300 are scaled by 0.25, heights in (750, 1300] by 0.5,
    heights in (500, 750] by 0.75; images of height 500 or less are returned
    unchanged.

    :param img_original: input image
    :return: the resized image, or the input itself when no resizing applies
    """
    height, _ = img_original.shape[:2]
    if height > 1300:
        factor = 0.25
    elif height > 750:
        factor = 0.5
    elif height > 500:
        factor = 0.75
    else:
        return img_original
    return cv.resize(img_original, None, fx=factor, fy=factor, interpolation=cv.INTER_CUBIC)


def adaptive_threshold(gray, block_size=5, c=10, inv=False):
    """
    Binarise a grayscale image with OpenCV's Gaussian adaptive threshold.

    The threshold is computed per neighbourhood, so it follows local
    brightness differences across the image.

    :param gray: input grayscale image
    :param block_size: neighbourhood size passed to ``cv.adaptiveThreshold``
    :param c: constant passed to ``cv.adaptiveThreshold``
    :param inv: when True use ``cv.THRESH_BINARY_INV``, otherwise ``cv.THRESH_BINARY``
    :return: the binarised image (maximum value 255)
    """
    mode = cv.THRESH_BINARY_INV if inv else cv.THRESH_BINARY
    return cv.adaptiveThreshold(gray, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, mode, block_size, c)


def img_preprocess(img, kernel=None):
    """
    Pre-process an image: Gaussian blur, grayscale conversion, inverted Otsu
    binarisation, Canny edge detection and a morphological closing.

    The result is also displayed in a window named 'original img close'.

    :param img: input BGR image
    :param kernel: structuring element for the closing; a 5x5 rectangle is
        used when omitted
    :return: the closed edge image
    """
    if kernel is None:
        kernel = cv.getStructuringElement(cv.MORPH_RECT, (5, 5), (-1, -1))
    blurred = cv.GaussianBlur(img, (3, 3), 0)  # denoise
    gray = cv.cvtColor(blurred, cv.COLOR_BGR2GRAY)
    # Otsu picks the global threshold automatically; the result is inverted.
    _, binary = cv.threshold(gray, 0, 255, cv.THRESH_BINARY_INV + cv.THRESH_OTSU)
    edges = cv.Canny(binary, 50, 50)
    # Closing = dilation followed by erosion; merges nearby edge fragments.
    closed = cv.morphologyEx(edges, cv.MORPH_CLOSE, kernel)
    img_show_wait(closed, 'original img close')
    return closed


if __name__ == '__main__':
    # Manual check: pre-process one sample image and display its horizontal projection.
    sample_path = r'D:\projects_python\workingon\isbnocr\pageocr\xxx.png'
    preprocessed = img_preprocess(cv.imread(sample_path))
    row_counts = get_projection_list(preprocessed, 'horizontal')
    height, width = preprocessed.shape[:2]
    draw_projection(row_counts, height, width, 'horizontal')

3.核心代碼isbnocr.py

import os
from os import listdir

import pytesseract

from img_process_tool import *
from ocr_tool import *


def img_isbn_area(img_original):
    """
    Deskew an image and crop the horizontal band taken to be the ISBN text.

    Steps: pre-process the image with a wide (100x5) closing kernel, take the
    largest contour, use the angle of its minimum-area rectangle to rotate the
    image upright, then split the horizontal projection of the rotated image
    into bands. Every band whose height is at most half the image height is
    shown in a 'roi' window; the last such band is returned.

    :param img_original: input BGR image (may be None, e.g. a failed ``cv.imread``)
    :return: the cropped band of the rotated image; the whole rotated image
        when no band qualifies; None when the input is None or no contour is found
    """
    if img_original is None:
        return None
    kernel = cv.getStructuringElement(cv.MORPH_RECT, (100, 5), (-1, -1))
    img_pre = img_preprocess(img_original, kernel)
    # findContours returns (contours, hierarchy) on OpenCV 4.x and
    # (image, contours, hierarchy) on 3.x; index -2 is the contour list in both.
    contours = cv.findContours(img_pre, cv.RETR_LIST, cv.CHAIN_APPROX_SIMPLE)[-2]
    if not contours:
        # Nothing detected (e.g. a blank image): there is no region to extract.
        return None
    # Use the largest contour to estimate the skew angle.
    largest = max(contours, key=cv.contourArea)
    angle = cv.minAreaRect(largest)[2]
    if angle > 80:
        angle -= 90  # bring a near-90-degree reading back to a small correction
    img_rotated, _ = img_rotate(img_original, angle)
    img_rotated_pre = img_preprocess(img_rotated, kernel)
    # Horizontal projection -> candidate text bands.
    proj_list = get_projection_list(img_rotated_pre)
    split_list = split_projection_list(proj_list, 0)
    img_h, img_w = img_rotated_pre.shape[:2]
    y, h = 0, img_h  # default: the full rotated image
    for start, end in split_list:
        if end - start > img_h * 0.5:
            continue  # too tall to be a single text line
        y, h = start, end - start
        img_show_wait(img_rotated[y:y + h, 0:img_w], 'roi')
        # NOTE(review): the previous version also computed the vertical
        # projection of the WHOLE image here and issued `continue` when it had
        # fewer than 12 segments. As the last statement of the loop body that
        # `continue` had no effect, so the costly computation was removed
        # without changing the result. If a "band must contain >= 12 character
        # columns" filter is wanted, it needs to be implemented on the band itself.
    return img_rotated[y:y + h, 0:img_w]


def split_digits(img_text):
    """
    Cut a text-line image into per-character slices using its vertical projection.

    The binarised image and its vertical projection are displayed along the way.

    :param img_text: BGR image of the extracted text area
    :return: list of column slices of ``img_text``, one per projection segment
    """
    gray = cv.cvtColor(img_text, cv.COLOR_BGR2GRAY)
    _, binary = cv.threshold(gray, 0, 255, cv.THRESH_OTSU + cv.THRESH_BINARY_INV)
    img_show_wait(binary, 'isbn area bin')
    column_counts = get_projection_list(binary, 'vertical')
    height, width = img_text.shape[:2]
    draw_projection(column_counts, height, width, 'vertical')
    # Each segment is a (left, right) column pair; slice all rows between them.
    return [img_text[:, left:right] for left, right in split_projection_list(column_counts)]


def digit_recog(path_to_image):
    """
    Recognise the ISBN digits in a single image, print them, and display each
    character slice of the extracted text area.

    :param path_to_image: absolute path of the image
    :return: None
    :raises OSError: when the image cannot be read
    """
    img_original = cv.imread(path_to_image)
    if img_original is None:
        # cv.imread signals a missing or undecodable file by returning None.
        raise OSError(f'cannot read image: {path_to_image}')
    isbn_area = img_isbn_area(img_resize(img_original))
    if isbn_area is None:
        # No text area found: check this BEFORE handing the image to the OCR engine.
        return
    recog_isbn = obtain_digit(pytesseract.image_to_string(isbn_area))
    print(f'【圖片{os.path.basename(path_to_image)}】的識別結果爲:{recog_isbn}')
    for digit_img in split_digits(isbn_area):
        img_show_wait(digit_img, 'digit')

def digit_recog_batch(path_to_images):
    """
    Recognise the ISBN in every image of a folder and print accuracy statistics.

    The expected ISBN of each image is taken from the digits in its file name.
    Files that cannot be read as images are skipped, and the final ratios are
    reported as 0.0 instead of raising when nothing was recognised.

    :param path_to_images: absolute path of the folder containing the images
    :return: None
    """
    img_cnt = 0  # images that produced a non-empty recognition result
    digit_cnt = 0  # total expected digits over those images
    digit_recognized = 0  # correctly recognised digits
    isbn_recognized = 0  # images whose full expected ISBN was recognised
    for file in listdir(path_to_images):
        img_abs_path = os.path.join(path_to_images, file)
        img_basename = os.path.basename(img_abs_path)
        original_image = cv.imread(img_abs_path, cv.IMREAD_COLOR)
        if original_image is None:
            # Not an image (or unreadable): skip instead of crashing the batch.
            continue
        isbn_area = img_isbn_area(img_resize(original_image))
        if isbn_area is None:
            continue
        recog_isbn = obtain_digit(pytesseract.image_to_string(isbn_area))
        print(f'【圖片{img_basename}】的識別結果爲:{recog_isbn}')
        if not recog_isbn:
            # NOTE(review): images with an empty result are left out of every
            # counter, so they do not lower the reported rates - confirm intended.
            continue
        expected_isbn = obtain_digit(img_basename)
        img_cnt += 1
        digit_cnt += len(expected_isbn)
        digit_recognized += crct_digit_cnt(expected_isbn, recog_isbn)
        if expected_isbn in recog_isbn:
            isbn_recognized += 1

    # Guard the ratios: an empty folder or a run with no recognised digits
    # would otherwise raise ZeroDivisionError.
    isbn_rate = isbn_recognized / img_cnt if img_cnt else 0.0
    digit_rate = digit_recognized / digit_cnt if digit_cnt else 0.0
    print("正確識別的ISBN個數:" + str(isbn_recognized) + "/" + str(img_cnt))
    print("正確識別的數字個數:" + str(digit_recognized) + "/" + str(digit_cnt))
    print("識別正確率:" + str(isbn_rate))
    print("識別準確率:" + str(digit_rate))


# TODO: character recognition could be done with a neural network or template matching
if __name__ == "__main__":
    # Location of the Tesseract executable used by pytesseract (machine-specific path).
    pytesseract.pytesseract.tesseract_cmd = r"D:\software\Tesseract-OCR\tesseract.exe"
    path_to_image = r'D:\projects_python\workingon\isbnocr\isbn_recognition\ISBN 978-7-5099-1125-9.png'
    path_to_images = r'D:\projects_python\workingon\isbnocr\isbn_recognition\images'
    # digit_recog(path_to_image)  # recognise a single image
    digit_recog_batch(path_to_images)  # recognise the ISBN of every image in the folder

這個項目做下來,感覺比較有參考價值的還是圖片預處理的思路和相關實現。至於字符識別,由於時間關係沒來得及自己實現,就先用了比較粗陋的方法(直接調用pytesseract),後面有時間再改成模板匹配或者神經網絡來識別字符。

項目的源代碼和數據集都同步到gitee上了,地址在這裏:isbn-ocr: 計算機視覺課程設計 識別ISBN中的數字 (gitee.com)

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章