Tesseract-ocr圖片文字識別

原創

ChihPingM

2020-06-03 04:42

1.下載安裝tesseract-ocr-setup-4.00.00dev.exe

http://digi.bib.uni-mannheim.de/tesseract/tesseract-ocr-setup-4.00.00dev.exe

2.下載簡體中文語言包chi_sim.traineddata

https://github.com/tesseract-ocr/tessdata/find/master

3.圖片銳化與二值化處理

package com.daorigin.AI.ocr;

import java.awt.image.BufferedImage;
import java.awt.image.ConvolveOp;
import java.awt.image.Kernel;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;

import javax.imageio.ImageIO;

/**
 * Image pre-processing helpers intended to be run before OCR: a 3x3
 * sharpening convolution and a conversion to a 1-bit black/white image.
 */
public class ImgSharper{

	/** 3x3 sharpening kernel (centre weight 9, neighbours -1; the weights sum to 1). */
	private static final float[] SHARPEN_KERNEL = {
		-1.0f, -1.0f, -1.0f,
		-1.0f,  9.0f, -1.0f,
		-1.0f, -1.0f, -1.0f
	};

	/**
	 * Demo entry point: binarizes one hard-coded sample image.
	 *
	 * @param args unused
	 */
	public static void main(String[] args){
		try {
			// NOTE(review): the input name suggests it is the output of a previous
			// getSharperPicture(...) run on the original scan - confirm.
			binaryImage("E://民事起訴狀2.JPG","E://民事起訴狀3.JPG","jpg");
		} catch (IOException e) {
			// FileNotFoundException is an IOException, so one handler covers both.
			e.printStackTrace();
		}
	}

	/**
	 * Sharpens an image with a 3x3 convolution and writes the result to disk.
	 * Border pixels are copied unchanged ({@link ConvolveOp#EDGE_NO_OP}).
	 *
	 * @param in   path of the image to read
	 * @param out  path of the image to write
	 * @param type informal format name understood by {@link ImageIO}, e.g. {@code "jpg"}
	 * @throws FileNotFoundException if {@code in} cannot be opened
	 * @throws IOException if the input cannot be decoded, no writer exists for
	 *                     {@code type}, or an I/O error occurs
	 */
	public static void getSharperPicture(String in,String out,String type) throws FileNotFoundException, IOException{
		BufferedImage originalPic = readImage(in);

		BufferedImage newPic = new BufferedImage(originalPic.getWidth(), originalPic.getHeight(),
				BufferedImage.TYPE_3BYTE_BGR);

		Kernel kernel = new Kernel(3, 3, SHARPEN_KERNEL);
		ConvolveOp co = new ConvolveOp(kernel, ConvolveOp.EDGE_NO_OP, null);
		co.filter(originalPic, newPic);

		writeImage(newPic, type, out);
	}

	/**
	 * Converts an image to black and white by copying every pixel into a
	 * {@link BufferedImage#TYPE_BYTE_BINARY} image, which maps each colour to
	 * one of the two entries of its palette.
	 *
	 * @param in   path of the image to read
	 * @param out  path of the image to write
	 * @param type informal format name understood by {@link ImageIO}, e.g. {@code "jpg"}
	 * @throws IOException if the input cannot be opened or decoded, no writer
	 *                     exists for {@code type}, or an I/O error occurs
	 */
	public static void binaryImage(String in,String out,String type) throws IOException{
		BufferedImage image = readImage(in);

		int width = image.getWidth();
		int height = image.getHeight();

		// The 1-bit target type is what performs the thresholding: setRGB picks
		// the closest palette colour for every pixel written into it.
		BufferedImage binary = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_BINARY);
		for (int y = 0; y < height; y++) {
			for (int x = 0; x < width; x++) {
				binary.setRGB(x, y, image.getRGB(x, y));
			}
		}

		writeImage(binary, type, out);
	}

	/**
	 * Reads an image file, closing the stream afterwards and rejecting files
	 * that {@link ImageIO} cannot decode instead of returning {@code null}.
	 */
	private static BufferedImage readImage(String in) throws FileNotFoundException, IOException {
		try (FileInputStream stream = new FileInputStream(in)) {
			BufferedImage image = ImageIO.read(stream);
			if (image == null) {
				throw new IOException("Unsupported or corrupt image file: " + in);
			}
			return image;
		}
	}

	/**
	 * Writes an image and fails loudly when no writer is registered for the
	 * requested format ({@link ImageIO#write} only signals that by returning false).
	 */
	private static void writeImage(BufferedImage image, String type, String out) throws IOException {
		if (!ImageIO.write(image, type, new File(out))) {
			throw new IOException("No image writer available for format \"" + type + "\": " + out);
		}
	}
}

4.程序調用

package com.daorigin.AI.ocr;

import java.io.BufferedReader;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

import org.jdesktop.swingx.util.OS;

public class OCRHelper {
	private final String LANG_OPTION = "-l";
	private final String EOL = System.getProperty("line.separator");
	/**
	 * 文件位置我防止在，項目同一路徑
	 */
	private String tessPath = new File("C:\\Program Files (x86)\\Tesseract-OCR").getAbsolutePath();

	/**
	 * @param imageFile
	 *            傳入的圖像文件
	 * @param imageFormat
	 *            傳入的圖像格式
	 * @return 識別後的字符串
	 */
	public String recognizeText(File imageFile) throws Exception {
		/**
		 * 設置輸出文件的保存的文件目錄
		 */
		File outputFile = new File(imageFile.getParentFile(), "output");

		StringBuffer strB = new StringBuffer();
		List<String> cmd = new ArrayList<String>();
		if (OS.isWindowsXP()) {
			cmd.add(tessPath + "\\tesseract");
		} else if (OS.isLinux()) {
			cmd.add("tesseract");
		} else {
			cmd.add(tessPath + "\\tesseract");
		}
		cmd.add("");
		cmd.add(outputFile.getName());
		cmd.add(LANG_OPTION);
		cmd.add("chi_sim");
		// cmd.add("eng");

		ProcessBuilder pb = new ProcessBuilder();
		/**
		 * Sets this process builder's working directory.
		 */
		pb.directory(imageFile.getParentFile());
		cmd.set(1, imageFile.getName());
		pb.command(cmd);
		pb.redirectErrorStream(true);
		Process process = pb.start();
		// tesseract.exe 1.jpg 1 -l chi_sim
		// Runtime.getRuntime().exec("tesseract.exe 1.jpg 1 -l chi_sim");
		/**
		 * the exit value of the process. By convention, 0 indicates normal
		 * termination.
		 */
		// System.out.println(cmd.toString());
		int w = process.waitFor();
		if (w == 0)// 0代表正常退出
		{
			BufferedReader in = new BufferedReader(
					new InputStreamReader(new FileInputStream(outputFile.getAbsolutePath() + ".txt"), "UTF-8"));
			String str;

			while ((str = in.readLine()) != null) {
				strB.append(str).append(EOL);
			}
			in.close();
		} else {
			String msg;
			switch (w) {
			case 1:
				msg = "Errors accessing files. There may be spaces in your image's filename.";
				break;
			case 29:
				msg = "Cannot recognize the image or its selected region.";
				break;
			case 31:
				msg = "Unsupported image format.";
				break;
			default:
				msg = "Errors occurred.";
			}
			throw new RuntimeException(msg);
		}
		new File(outputFile.getAbsolutePath() + ".txt").delete();
		return strB.toString().replaceAll("\\s*", "");
	}
}

package com.daorigin.AI.ocr;

import java.io.File;

/**
 * Demo entry point: binarizes a sample scan, runs OCR on the binarized copy
 * and prints the recognized text.
 */
public class Test {
	/**
	 * @param args unused
	 */
	public static void main(String[] args) {
		try {
			String binarizedPath = "E:/民事起訴狀第二頁0.JPG";
			// binaryImage is static, so call it on the class instead of creating
			// a throwaway ImgSharper instance.
			ImgSharper.binaryImage("E:/民事起訴狀第二頁.JPG", binarizedPath, "jpg");
			String recognizeText = new OCRHelper().recognizeText(new File(binarizedPath));
			System.out.println(recognizeText + "\t");
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}

經過二值化和銳化處理的圖片，識別準確率基本可以達到95%以上。

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

Tesseract-ocr圖片文字識別

如何使用 JS 判斷用戶是否處於活躍狀態

lightdb秒級增加列和刪除列（not null帶默認值）

lightdb數據庫超時相關控制參數

通過HPA+CronHPA組合應對業務複雜彈性伸縮場景

❤️‍🔥 Solon Cloud Event 新的事務特性與應用

lightdb mysql 8.0兼容之不可見主鍵

使用 JS 實現在瀏覽器控制檯打印圖片 console.image()

基於Ubuntu-22.04安裝K8s-v1.28.2實驗（四）使用域名訪問網站應用

docx4j添加批註

Tesseract-ocr圖片文字識別

docx4j文檔差異比較

openCV圖片傾斜矯正（java版）

eclipse中junit test或者run main方法報錯

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結