package com.test;
import java.awt.Rectangle;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.pdfbox.text.PDFTextStripperByArea;
/**
 * Helpers for extracting text from PDF documents with Apache PDFBox.
 *
 * <p>The text of every page is read from one fixed rectangular region using
 * {@link PDFTextStripperByArea}, with text sorted by position.
 */
public class PDFUtils {

    /** Result-map key holding a {@link Boolean}: {@code true} only when extraction succeeded. */
    private static final String KEY_OK = "ok";

    /** Result-map key holding a {@code List<String>} with one entry per page. */
    private static final String KEY_PDF_LINES = "pdfLines";

    /** Name under which the extraction region is registered on the stripper. */
    private static final String REGION_NAME = "base";

    /** Input used by {@link #main(String[])} when no path is given on the command line. */
    private static final String DEFAULT_PDF_PATH = "d:/12345.pdf";

    /**
     * Reads the text of every page of a PDF file.
     *
     * <p>The returned map always contains the key {@code "ok"}. On success it is
     * {@code true} and the key {@code "pdfLines"} holds a {@code List<String>} with the
     * extracted text of each page, in page order. On failure (null file, unreadable or
     * password-protected document) {@code "ok"} is {@code false} and {@code "pdfLines"}
     * is absent; the cause is printed to standard error.
     *
     * @param file the PDF file to read
     * @return a mutable result map as described above; never {@code null}
     */
    public static Map<String, Object> readPdf(File file) {
        Map<String, Object> map = new HashMap<>();
        // Publish the failure state up front so callers can always unbox "ok" safely.
        map.put(KEY_OK, false);
        if (file == null) {
            return map;
        }

        List<String> pdfLines = new ArrayList<>();
        // try-with-resources guarantees the document (and its file handle) is released.
        try (PDDocument document = PDDocument.load(file)) {
            int pages = document.getNumberOfPages();
            // Hand-picked rectangle intended to cover the whole content of a page.
            Rectangle rectBase = new Rectangle(0, 0, 682, 800);
            for (int i = 0; i < pages; i++) {
                PDPage page = document.getPage(i);
                PDFTextStripperByArea stripper = new PDFTextStripperByArea();
                stripper.setSortByPosition(true);
                stripper.addRegion(REGION_NAME, rectBase);
                stripper.extractRegions(page);
                // Collect the text found inside the region on this page.
                pdfLines.add(stripper.getTextForRegion(REGION_NAME));
            }
            map.put(KEY_OK, true);
            map.put(KEY_PDF_LINES, pdfLines);
        } catch (InvalidPasswordException e) {
            // The document is encrypted and cannot be opened without a password.
            e.printStackTrace();
        } catch (IOException e) {
            // The file is missing, unreadable, or not a valid PDF.
            e.printStackTrace();
        }
        return map;
    }

    /**
     * Prints the text of a PDF to standard output, one text line per output line.
     *
     * @param args optional; {@code args[0]} is the path of the PDF to read, defaulting to
     *             {@code d:/12345.pdf} when absent
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PDF_PATH;
        Map<String, Object> result = readPdf(new File(path));
        // Null-safe check: never unbox a possibly missing value.
        if (!Boolean.TRUE.equals(result.get(KEY_OK))) {
            return;
        }

        // readPdf only ever stores a List<String> under this key.
        @SuppressWarnings("unchecked")
        List<String> pdfLines = (List<String>) result.get(KEY_PDF_LINES);
        for (String pageText : pdfLines) {
            // Split on either "\r\n" or "\n" so that no stray '\r' is left on a line,
            // whichever line separator the extracted text uses.
            for (String line : pageText.split("\\r?\\n")) {
                System.out.println(line);
            }
        }
    }
}
// jar包下載鏈接:https://pan.baidu.com/s/1GdEAtYGK0b9qgx_SvuegGA 密碼:evmt
// 另外,若本地解析成功但在Linux上報錯,可以參考系統換行符(\r\n)的問題。