一、使用 itextpdf(推薦)
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itextpdf</artifactId>
<version>5.5.13.1</version>
</dependency>
PdfKeyWordPosition.java
package com.util;
import com.itextpdf.awt.geom.Rectangle2D;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Locates occurrences of a keyword in a PDF using iText 5 and reports, for each occurrence,
 * the coordinates of its first character.
 */
public class PdfKeyWordPosition {
    private static final Logger log = LoggerFactory.getLogger(PdfKeyWordPosition.class);

    /**
     * Finds every occurrence of {@code keyWord} on every page of the given PDF.
     *
     * <p>Each returned map describes the first character of one occurrence and contains the keys
     * {@code "x"}, {@code "y"}, {@code "pageNum"}, {@code "fontWidth"} and {@code "fontHeight"}.
     * Parsing failures are logged and whatever was collected so far is returned; this method does
     * not throw.
     *
     * @param pdfData raw bytes of the PDF document
     * @param keyWord text to search for; matching is done per page on the concatenated page text
     * @return one entry per occurrence, in page order; empty when nothing matches, when either
     *         argument is {@code null}, or when {@code keyWord} is empty
     */
    public static List<Map<String, Object>> getWordsPcoordinate(byte[] pdfData, String keyWord) {
        List<Map<String, Object>> result = new ArrayList<>();
        if (pdfData == null || keyWord == null || keyWord.isEmpty()) {
            return result;
        }
        PdfReader reader = null;
        try {
            reader = new PdfReader(pdfData);
            int pages = reader.getNumberOfPages();
            for (int pageNum = 1; pageNum <= pages; pageNum++) {
                float width = reader.getPageSize(pageNum).getWidth();
                float height = reader.getPageSize(pageNum).getHeight();
                RenderListenerHelper listener = new RenderListenerHelper(pageNum, width, height);
                // Run the page's content stream through the listener to collect text and positions.
                PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
                PdfDictionary pageDic = reader.getPageN(pageNum);
                PdfDictionary resourcesDic = pageDic.getAsDict(PdfName.RESOURCES);
                processor.processContent(
                        ContentByteUtils.getContentBytesForPage(reader, pageNum), resourcesDic);
                collectMatches(listener.getContent(), listener.getCharPositions(), keyWord, result);
            }
        } catch (Exception e) {
            // The throwable is passed as the last argument so SLF4J logs its stack trace.
            log.error("獲取pdf關鍵字座標失敗", e);
        } finally {
            // reader stays null when the PdfReader constructor itself failed.
            if (reader != null) {
                reader.close();
            }
        }
        return result;
    }

    /**
     * Appends to {@code result} the position entry of the first character of every occurrence
     * of {@code keyWord} in {@code content}. Index {@code i} of {@code charPositions} is the
     * position of character {@code i} of {@code content}.
     */
    private static void collectMatches(String content, List<Map<String, Object>> charPositions,
                                       String keyWord, List<Map<String, Object>> result) {
        int from = 0;
        int keyIndex;
        while ((keyIndex = content.indexOf(keyWord, from)) != -1) {
            result.add(charPositions.get(keyIndex));
            // Resume right after the start of this match so adjacent matches are not skipped.
            from = keyIndex + 1;
        }
    }

    /**
     * iText {@link RenderListener} that records, for one page, the page text and a position entry
     * for every recorded character (one entry per character, in the same order).
     */
    private static class RenderListenerHelper implements RenderListener {
        private final int pageNum;
        // Page dimensions are not used by the current output; they allow deriving page-relative
        // percentages if needed, e.g. x / pageWidth and 1 - y / pageHeight.
        private final float pageWidth;
        private final float pageHeight;
        private final StringBuilder contentBuilder = new StringBuilder();
        private final List<Map<String, Object>> charPositions = new ArrayList<>();

        public RenderListenerHelper(int pageNum, float pageWidth, float pageHeight) {
            this.pageNum = pageNum;
            this.pageWidth = pageWidth;
            this.pageHeight = pageHeight;
        }

        /** Returns the text recorded so far. */
        public String getContent() {
            return contentBuilder.toString();
        }

        /** Returns the position entries recorded so far, index-aligned with {@link #getContent()}. */
        public List<Map<String, Object>> getCharPositions() {
            return charPositions;
        }

        @Override
        public void beginTextBlock() {
            // Nothing to do at the start of a text block.
        }

        /**
         * Records one character and one position entry per character-level render info.
         * Positions are taken from the baseline's bounding rectangle; the ascent or descent line
         * (getAscentLine / getDescentLine) could be used instead for the upper or lower edge.
         */
        @Override
        public void renderText(TextRenderInfo renderInfo) {
            List<TextRenderInfo> characterRenderInfos = renderInfo.getCharacterRenderInfos();
            for (TextRenderInfo textRenderInfo : characterRenderInfos) {
                String word = textRenderInfo.getText();
                if (word == null || word.isEmpty()) {
                    // No character to record; adding a position would break index alignment.
                    continue;
                }
                if (word.length() > 1) {
                    // Keep exactly one character per position entry.
                    word = word.substring(word.length() - 1);
                }
                Rectangle2D.Float boundingRectange = textRenderInfo.getBaseline().getBoundingRectange();

                Map<String, Object> coordinate = new HashMap<>();
                coordinate.put("x", boundingRectange.x);
                coordinate.put("y", boundingRectange.y);
                coordinate.put("pageNum", pageNum);
                coordinate.put("fontWidth", boundingRectange.width);
                coordinate.put("fontHeight", boundingRectange.height);
                charPositions.add(coordinate);
                contentBuilder.append(word);
            }
        }

        @Override
        public void endTextBlock() {
            // Nothing to do at the end of a text block.
        }

        @Override
        public void renderImage(ImageRenderInfo renderInfo) {
            // Images are ignored.
        }
    }

    /** Reads the whole file at {@code path} into a byte array. */
    private static byte[] readFile(String path) throws IOException {
        try (InputStream in = new FileInputStream(path);
             ByteArrayOutputStream out = new ByteArrayOutputStream()) {
            // Fixed-size buffer: available() may return 0, which would make read() spin forever.
            byte[] buffer = new byte[8192];
            int n;
            while ((n = in.read(buffer)) != -1) {
                out.write(buffer, 0, n);
            }
            return out.toByteArray();
        }
    }

    /** Demo: prints the position of every occurrence of a sample keyword in a sample file. */
    public static void main(String[] args) {
        String path = "D:\\test.pdf";
        byte[] bytes;
        try {
            bytes = readFile(path);
        } catch (IOException e) {
            log.error("Failed to read PDF file {}", path, e);
            return;
        }
        List<Map<String, Object>> wordsPcoordinates = getWordsPcoordinate(bytes, "日期");
        for (Map<String, Object> map : wordsPcoordinates) {
            System.out.println("x座標 -> " + map.get("x"));
            System.out.println("y座標 -> " + map.get("y"));
            System.out.println("頁數 -> " + map.get("pageNum"));
            System.out.println("字體長度 -> " + map.get("fontWidth"));
            System.out.println("字段高度 -> " + map.get("fontHeight"));
            System.out.println("");
        }
    }
}
二、使用 pdfbox
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.20</version>
</dependency>
PdfBoxKeyWordPosition.java
package com.util;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Extends PDFBox's {@link PDFTextStripper} to locate occurrences of a keyword in a PDF and
 * report a coordinate for each occurrence.
 *
 * <p>The document is supplied as a byte array. To read from a file instead, the document can be
 * loaded with {@code PDDocument.load(new File(path))}.
 */
public class PdfBoxKeyWordPosition extends PDFTextStripper {
    private static final Logger log = LoggerFactory.getLogger(PdfBoxKeyWordPosition.class);

    // Keyword as individual characters; each is compared against one TextPosition.
    private final char[] key;
    // Raw bytes of the PDF document.
    private final byte[] bytes;
    // Hits collected by writeString for the page currently being processed.
    private final List<Map<String, Object>> pageList = new ArrayList<>();

    /**
     * Creates a locator for {@code keyWords} over the PDF held in {@code bytes}.
     *
     * @param keyWords text to search for
     * @param bytes raw bytes of the PDF document
     * @throws IOException if the underlying {@link PDFTextStripper} cannot be initialised
     */
    public PdfBoxKeyWordPosition(String keyWords, byte[] bytes) throws IOException {
        super();
        super.setSortByPosition(true);
        this.bytes = bytes;
        this.key = keyWords.toCharArray();
    }

    /**
     * Scans every page and returns one entry per keyword occurrence with the keys {@code "x"},
     * {@code "y"} and {@code "pageNum"}. Failures are logged and whatever was collected so far is
     * returned. Each call starts from an empty result, so repeated calls do not accumulate.
     *
     * @return the occurrences found, in page order
     */
    public List<Map<String, Object>> getCoordinate() {
        List<Map<String, Object>> found = new ArrayList<>();
        pageList.clear();
        PDDocument doc = null;
        try {
            doc = PDDocument.load(bytes);
            int pages = doc.getNumberOfPages();
            for (int page = 1; page <= pages; page++) {
                // Process one page at a time so each hit can be tagged with its page number.
                super.setSortByPosition(true);
                super.setStartPage(page);
                super.setEndPage(page);
                // The extracted text itself is not needed; writeString collects the hits.
                Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
                super.writeText(doc, dummy);
                for (Map<String, Object> hit : pageList) {
                    hit.put("pageNum", page);
                }
                found.addAll(pageList);
                pageList.clear();
            }
        } catch (Exception e) {
            // The throwable is passed as the last argument so SLF4J logs its stack trace.
            log.error("獲取pdf關鍵字座標失敗", e);
        } finally {
            pageList.clear();
            if (doc != null) {
                try {
                    doc.close();
                } catch (IOException e) {
                    log.error("關閉文件失敗", e);
                }
            }
        }
        return found;
    }

    /**
     * Checks every start index in {@code textPositions} for the keyword and records a coordinate
     * for each match. The keyword must match consecutive positions within this single call.
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
        if (key.length == 0) {
            return;
        }
        for (int i = 0; i < textPositions.size(); i++) {
            if (!matchesAt(textPositions, i)) {
                continue;
            }
            TextPosition tp = textPositions.get(i);
            // x is shifted right by one font size; use tp.getX() alone for the raw position.
            float x = tp.getX() + tp.getFontSize();
            // y is flipped to a bottom-left origin and lowered by four font sizes;
            // use tp.getPageHeight() - tp.getY() alone for the raw position.
            float y = tp.getPageHeight() - tp.getY() - 4 * tp.getFontSize();
            Map<String, Object> coordinate = new HashMap<>();
            coordinate.put("x", x);
            coordinate.put("y", y);
            pageList.add(coordinate);
        }
    }

    /** Returns whether the keyword matches the positions starting at {@code start}, one character each. */
    private boolean matchesAt(List<TextPosition> textPositions, int start) {
        if (start + key.length > textPositions.size()) {
            return false;
        }
        for (int j = 0; j < key.length; j++) {
            String unicode = textPositions.get(start + j).getUnicode();
            if (!String.valueOf(key[j]).equals(unicode)) {
                return false;
            }
        }
        return true;
    }

    /** Reads the whole file at {@code path} into a byte array. */
    private static byte[] readFile(String path) throws IOException {
        try (InputStream in = new FileInputStream(path);
             ByteArrayOutputStream out = new ByteArrayOutputStream()) {
            // Fixed-size buffer: available() may return 0, which would make read() spin forever.
            byte[] buffer = new byte[8192];
            int n;
            while ((n = in.read(buffer)) != -1) {
                out.write(buffer, 0, n);
            }
            return out.toByteArray();
        }
    }

    /** Demo: prints the position of every occurrence of a sample keyword in a sample file. */
    public static void main(String[] args) {
        String path = "D:\\test.pdf";
        try {
            byte[] bytes = readFile(path);
            PdfBoxKeyWordPosition pdf = new PdfBoxKeyWordPosition("日期", bytes);
            List<Map<String, Object>> wordsPcoordinates = pdf.getCoordinate();
            for (Map<String, Object> map : wordsPcoordinates) {
                System.out.println("x座標 -> " + map.get("x"));
                System.out.println("y座標 -> " + map.get("y"));
                System.out.println("頁面 -> " + map.get("pageNum"));
                System.out.println("");
            }
        } catch (IOException e) {
            log.error("Failed to process PDF file {}", path, e);
        }
    }
}