Java 獲取PDF關鍵字座標

一、使用 itextpdf  推薦使用

<dependency>
    <groupId>com.itextpdf</groupId>
    <artifactId>itextpdf</artifactId>
    <version>5.5.13.1</version>
</dependency>

PdfKeyWordPosition.java

package com.util;

import com.itextpdf.awt.geom.Rectangle2D;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Finds the page coordinates of a keyword in a PDF document using iText 5.
 *
 * <p>Each page is fed through a {@code PdfContentStreamProcessor}. The attached listener records
 * exactly one coordinate entry per extracted character, so an index into the extracted page text
 * maps directly to an entry in the coordinate list.
 */
public class PdfKeyWordPosition {

    private static final Logger log = LoggerFactory.getLogger(PdfKeyWordPosition.class);

    /** Chunk size used by {@link #main(String[])} when reading the sample file. */
    private static final int READ_BUFFER_SIZE = 8192;

    /**
     * Returns the coordinates of every (non-overlapping) occurrence of a keyword.
     *
     * <p>Text is matched per page, in the order the listener receives it; a keyword split across
     * two pages is therefore never found. For each hit the entry describing the keyword's first
     * character is returned.
     *
     * @param pdfData the PDF document as raw bytes
     * @param keyWord the text to look for
     * @return one map per hit with the keys {@code x}, {@code y}, {@code pageNum},
     *         {@code fontWidth} and {@code fontHeight}; empty when either argument is
     *         {@code null}, the keyword is empty, or nothing matches. If parsing fails part-way
     *         the error is logged and the hits collected so far are returned.
     */
    public static List<Map<String, Object>> getWordsPcoordinate(byte[] pdfData, String keyWord) {

        List<Map<String, Object>> result = new ArrayList<>();

        // An empty keyword "matches" at every index and would yield meaningless positions.
        if (pdfData == null || keyWord == null || keyWord.isEmpty()) {
            return result;
        }

        PdfReader reader = null;

        try {
            reader = new PdfReader(pdfData);

            int pages = reader.getNumberOfPages();

            for (int pageNum = 1; pageNum <= pages; pageNum++) {

                float width = reader.getPageSize(pageNum).getWidth();
                float height = reader.getPageSize(pageNum).getHeight();

                RenderListenerHelper listener = new RenderListenerHelper(pageNum, width, height);

                // Parse the page content stream; the listener collects text and positions.
                PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
                PdfDictionary pageDic = reader.getPageN(pageNum);
                PdfDictionary resourcesDic = pageDic.getAsDict(PdfName.RESOURCES);

                processor.processContent(ContentByteUtils.getContentBytesForPage(reader, pageNum), resourcesDic);

                String content = listener.getContent();
                List<Map<String, Object>> charPositions = listener.getCharPositions();

                // Resume right after each hit so that adjacent occurrences are not skipped.
                int from = 0;
                int keyIndex;
                while ((keyIndex = content.indexOf(keyWord, from)) != -1) {
                    result.add(charPositions.get(keyIndex));
                    from = keyIndex + keyWord.length();
                }
            }

        } catch (Exception e) {
            // Pass the message for the placeholder and the exception itself for the stack trace.
            log.error("獲取pdf關鍵字座標失敗:{}", e.getMessage(), e);
        } finally {
            // The reader is still null when its constructor threw.
            if (reader != null) {
                reader.close();
            }
        }

        return result;
    }

    /**
     * Render listener that records the page text together with one coordinate map per character.
     */
    private static class RenderListenerHelper implements RenderListener {

        private final int pageNum;

        // Page dimensions are only needed for the optional percentage coordinates (see renderText).
        private final float pageWidth;

        private final float pageHeight;

        private final StringBuilder contentBuilder = new StringBuilder();

        private final List<Map<String, Object>> charPositions = new ArrayList<>();

        public RenderListenerHelper(int pageNum, float pageWidth, float pageHeight) {
            this.pageNum = pageNum;
            this.pageWidth = pageWidth;
            this.pageHeight = pageHeight;
        }

        /** @return the text collected so far; index {@code i} corresponds to {@code getCharPositions().get(i)} */
        public String getContent() {
            return contentBuilder.toString();
        }

        /** @return the live list of per-character coordinate maps */
        public List<Map<String, Object>> getCharPositions() {
            return charPositions;
        }

        // Called at the start of a text block ("BT"); nothing to do.
        @Override
        public void beginTextBlock() {

        }

        // Main text callback: records one character and one coordinate entry per glyph.
        @Override
        public void renderText(TextRenderInfo renderInfo) {

            List<TextRenderInfo> characterRenderInfos = renderInfo.getCharacterRenderInfos();

            for (TextRenderInfo textRenderInfo : characterRenderInfos) {

                String word = textRenderInfo.getText();

                // Skip glyphs without text: adding a coordinate without a character would shift
                // every later index and make hits point at the wrong position.
                if (word == null || word.isEmpty()) {
                    continue;
                }

                // Keep exactly one character per entry so text and coordinates stay aligned.
                if (word.length() > 1) {
                    word = word.substring(word.length() - 1);
                }

                // Baseline rectangle of the glyph. Alternatives: getAscentLine() for the upper
                // edge, getDescentLine() for the lower edge.
                Rectangle2D.Float boundingRectange = textRenderInfo.getBaseline().getBoundingRectange();

                // Other coordinate flavours that can be derived from the rectangle:
                //   centre:      getCenterX() / getCenterY()
                //   extremes:    getMinX() / getMaxY()
                //   percentages: Math.round(x / pageWidth * 10000) / 10000f
                //                Math.round((1 - y / pageHeight) * 10000) / 10000f
                float x = boundingRectange.x;
                float y = boundingRectange.y;

                Map<String, Object> coordinate = new HashMap<>();
                coordinate.put("x", x);
                coordinate.put("y", y);
                coordinate.put("pageNum", pageNum);
                coordinate.put("fontWidth", boundingRectange.width);
                coordinate.put("fontHeight", boundingRectange.height);

                charPositions.add(coordinate);
                contentBuilder.append(word);
            }
        }

        // Called at the end of a text block ("ET"); nothing to do.
        @Override
        public void endTextBlock() {

        }

        // Images are not relevant for keyword lookup.
        @Override
        public void renderImage(ImageRenderInfo renderInfo) {

        }
    }

    /**
     * Demo: reads {@code D:\test.pdf} and prints the position of every occurrence of "日期".
     */
    public static void main(String[] args) {

        ByteArrayOutputStream bos = new ByteArrayOutputStream();

        try (InputStream is = new FileInputStream("D:\\test.pdf")) {

            // Fixed-size buffer: sizing it with available() can give a zero-length array, and
            // read() on a zero-length array returns 0 forever instead of -1.
            byte[] buffer = new byte[READ_BUFFER_SIZE];

            int n;

            while ((n = is.read(buffer)) != -1) {
                bos.write(buffer, 0, n);
            }

        } catch (IOException e) {
            log.error("Failed to read the PDF file", e);
            return;
        }

        List<Map<String, Object>> wordsPcoordinates = getWordsPcoordinate(bos.toByteArray(), "日期");

        for (Map<String, Object> map : wordsPcoordinates) {

            System.out.println("x座標 -> " + map.get("x"));
            System.out.println("y座標 -> " + map.get("y"));
            System.out.println("頁數 -> " + map.get("pageNum"));
            System.out.println("字體長度 -> " + map.get("fontWidth"));
            System.out.println("字段高度 -> " + map.get("fontHeight"));

            System.out.println("");
        }
    }
}

二、使用 pdfbox

<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.20</version>
</dependency>

PdfBoxKeyWordPosition.java

package com.util;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Extends PDFBox's {@code PDFTextStripper} to collect the coordinates of a keyword.
 *
 * <p>The keyword is matched inside each chunk of text positions passed to
 * {@link #writeString(String, List)}; a keyword split across two such chunks is not found.
 * Instances keep per-scan state and are not meant to be shared between threads.
 */
public class PdfBoxKeyWordPosition extends PDFTextStripper {

    private static final Logger log = LoggerFactory.getLogger(PdfBoxKeyWordPosition.class);

    /** Chunk size used by {@link #main(String[])} when reading the sample file. */
    private static final int READ_BUFFER_SIZE = 8192;

    // The keyword, one element per UTF-16 code unit.
    private final char[] key;

    // The PDF document as raw bytes.
    // (To read from a path instead, load it with PDDocument.load(new File(path)).)
    private final byte[] bytes;

    // Hits found on the page currently being processed; filled by writeString.
    private final List<Map<String, Object>> pageList = new ArrayList<>();

    /**
     * Creates a stripper that searches the given PDF bytes for a keyword.
     *
     * @param keyWords the text to look for
     * @param bytes    the PDF document as raw bytes
     * @throws IOException if the underlying {@code PDFTextStripper} cannot be initialised
     */
    public PdfBoxKeyWordPosition(String keyWords, byte[] bytes) throws IOException {
        super();
        super.setSortByPosition(true);
        this.bytes = bytes;
        this.key = keyWords.toCharArray();
    }

    /**
     * Scans the document page by page and returns the position of every keyword occurrence.
     *
     * @return a fresh list with one map per hit (keys {@code x}, {@code y}, {@code pageNum});
     *         empty when the keyword is empty, the data is {@code null}, or nothing matches.
     *         If processing fails part-way the error is logged and the hits of the pages
     *         completed so far are returned.
     */
    public List<Map<String, Object>> getCoordinate() {

        // A new list per call, so repeated calls do not accumulate duplicate hits.
        List<Map<String, Object>> coordinates = new ArrayList<>();

        pageList.clear();

        if (bytes == null || key.length == 0) {
            return coordinates;
        }

        PDDocument doc = null;

        try {

            doc = PDDocument.load(bytes);

            int pages = doc.getNumberOfPages();

            for (int i = 1; i <= pages; i++) {

                super.setSortByPosition(true);

                // Restrict the stripper to a single page so hits can be tagged with its number.
                super.setStartPage(i);

                super.setEndPage(i);

                // The stripper requires a writer; its output is discarded.
                Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());

                super.writeText(doc, dummy);

                for (Map<String, Object> li : pageList) {
                    li.put("pageNum", i);
                }

                coordinates.addAll(pageList);

                pageList.clear();
            }

        } catch (Exception e) {
            // Pass the message for the placeholder and the exception itself for the stack trace.
            log.error("獲取pdf關鍵字座標失敗:{}", e.getMessage(), e);
        } finally {

            pageList.clear();

            try {
                if (doc != null) {
                    doc.close();
                }
            } catch (IOException e) {
                log.error("關閉文件失敗:{}", e.getMessage(), e);
            }
        }

        return coordinates;
    }

    /**
     * Receives a chunk of text from the stripper and records a hit for every position at which
     * the keyword starts.
     *
     * @param string        the chunk's text (unused; matching works on the individual positions)
     * @param textPositions the positioned characters that make up the chunk
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException {

        if (key.length == 0) {
            return;
        }

        // Starting later than this leaves too few characters for a full match.
        int lastStart = textPositions.size() - key.length;

        for (int i = 0; i <= lastStart; i++) {

            if (!matchesKeyAt(textPositions, i)) {
                continue;
            }

            TextPosition tp = textPositions.get(i);

            // X is shifted right by one font size; use tp.getX() for the raw value.
            Float x = tp.getX() + tp.getFontSize();

            // Y is shifted down by four font sizes; use tp.getPageHeight() - tp.getY() for
            // the unshifted value.
            Float y = tp.getPageHeight() - tp.getY() - 4 * tp.getFontSize();

            Map<String, Object> coordinate = new HashMap<>();
            coordinate.put("x", x);
            coordinate.put("y", y);

            pageList.add(coordinate);
        }
    }

    // Returns true when the keyword's characters equal the text positions starting at 'start'.
    private boolean matchesKeyAt(List<TextPosition> textPositions, int start) {

        for (int j = 0; j < key.length; j++) {

            String unicode = textPositions.get(start + j).getUnicode();

            // Comparing from the keyword side also treats a null unicode value as a mismatch.
            if (!String.valueOf(key[j]).equals(unicode)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Demo: reads {@code D:\test.pdf} and prints the position of every occurrence of "日期".
     */
    public static void main(String[] args) {

        ByteArrayOutputStream bos = new ByteArrayOutputStream();

        try (InputStream is = new FileInputStream("D:\\test.pdf")) {

            // Fixed-size buffer: sizing it with available() can give a zero-length array, and
            // read() on a zero-length array returns 0 forever instead of -1.
            byte[] buffer = new byte[READ_BUFFER_SIZE];

            int n;

            while ((n = is.read(buffer)) != -1) {
                bos.write(buffer, 0, n);
            }

        } catch (IOException e) {
            log.error("Failed to read the PDF file", e);
            return;
        }

        try {

            PdfBoxKeyWordPosition pdf = new PdfBoxKeyWordPosition("日期", bos.toByteArray());

            List<Map<String, Object>> wordsPcoordinates = pdf.getCoordinate();

            for (Map<String, Object> map : wordsPcoordinates) {

                System.out.println("x座標 -> " + map.get("x"));
                System.out.println("y座標 -> " + map.get("y"));
                System.out.println("頁面 -> " + map.get("pageNum"));

                System.out.println("");
            }

        } catch (IOException e) {
            log.error("Failed to initialise the PDF text stripper", e);
        }
    }
}

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章