使用POI簡單讀取word內容

1、添加依賴

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.15</version>
</dependency>

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.15</version>
</dependency>

2、工具類


import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.*;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xwpf.usermodel.*;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * Reads content out of Word documents with Apache POI.
 *
 * <p>Table extraction ({@link #tableInWord(String, Integer)}) supports both the
 * binary {@code doc} format (HWPF) and the OOXML {@code docx} format (XWPF).
 * The remaining helpers open the file with {@code XWPFDocument} and therefore
 * work on {@code docx} files only.
 *
 * @author Fise19
 */
public class ReadWordUtil {

    /** Sample {@code doc} file read by {@link #main(String[])} when no argument is given. */
    private static final String DEFAULT_DOC_PATH = "C:/Users/00/Desktop/檢測報告/123.doc";

    /** Sample {@code docx} file read by the no-argument {@link #getWordText()}. */
    private static final String DEFAULT_DOCX_PATH = "C:/Users/00/Desktop/檢測報告/123.docx";

    /**
     * Demo entry point: prints every cell of the first table of a document.
     *
     * @param args optional; {@code args[0]} overrides the built-in sample path
     */
    public static void main(String[] args) {
        String filePath = (args != null && args.length > 0) ? args[0] : DEFAULT_DOC_PATH;
        List<List<String>> rows = tableInWord(filePath, 1);
        if (rows == null) {
            // tableInWord already printed the failure; nothing to show.
            return;
        }
        for (List<String> row : rows) {
            for (String cell : row) {
                System.out.println(">>>>>>>>" + cell);
            }
        }
    }

    /**
     * Reads one table from a Word document.
     *
     * <p>Files whose lower-cased path ends with {@code docx} are parsed as OOXML;
     * everything else is parsed as a binary {@code doc} file. For {@code docx}
     * each cell contributes one string to its row; for {@code doc} each paragraph
     * of each cell contributes one string (with its trailing mark character
     * removed). The table contents are also echoed to standard output.
     *
     * @param filePath path of the document to read
     * @param orderNum 1-based position of the table to read
     * @return the rows of the requested table; an empty list when the document
     *         has no table at that position; {@code null} when an argument is
     *         {@code null} or the document could not be read (the cause is
     *         printed to standard error)
     */
    public static List<List<String>> tableInWord(String filePath, Integer orderNum) {
        if (filePath == null || orderNum == null) {
            System.err.println("tableInWord: filePath and orderNum must not be null");
            return null;
        }
        try (FileInputStream in = new FileInputStream(filePath)) {
            if (filePath.toLowerCase().endsWith("docx")) {
                return readDocxTable(in, orderNum);
            }
            return readDocTable(in, orderNum);
        } catch (Exception e) {
            // POI reports malformed files with unchecked exceptions as well as
            // IOException, so the broad catch keeps the best-effort contract.
            e.printStackTrace();
            return null;
        }
    }

    /** Reads the {@code orderNum}-th (1-based) table of a docx stream, one string per cell. */
    private static List<List<String>> readDocxTable(InputStream in, int orderNum) throws IOException {
        List<List<String>> tableRows = new ArrayList<>();
        try (XWPFDocument document = new XWPFDocument(in)) {
            List<XWPFTable> tables = document.getTables();
            if (orderNum < 1 || orderNum > tables.size()) {
                return tableRows;
            }
            XWPFTable table = tables.get(orderNum - 1);
            System.out.println("這是第" + orderNum + "個表的數據");
            List<XWPFTableRow> rows = table.getRows();
            for (int i = 0; i < rows.size(); i++) {
                List<XWPFTableCell> cells = rows.get(i).getTableCells();
                List<String> rowValues = new ArrayList<>(cells.size());
                for (int j = 0; j < cells.size(); j++) {
                    String text = cells.get(j).getText();
                    rowValues.add(text);
                    // Echo the cell together with its [row,column] position.
                    System.out.print(text + "[" + i + "," + j + "]" + "\t");
                }
                tableRows.add(rowValues);
                System.out.println();
            }
        }
        return tableRows;
    }

    /** Reads the {@code orderNum}-th (1-based) table of a doc stream, one string per cell paragraph. */
    private static List<List<String>> readDocTable(InputStream in, int orderNum) throws IOException {
        List<List<String>> tableRows = new ArrayList<>();
        HWPFDocument document = new HWPFDocument(new POIFSFileSystem(in));
        if (orderNum < 1) {
            return tableRows;
        }
        TableIterator tables = new TableIterator(document.getRange());
        int position = 0;
        // Always call hasNext() before next(): the original code did so on every
        // step, and the iterator is expected to position itself in hasNext().
        while (tables.hasNext()) {
            Table table = tables.next();
            position++;
            if (position < orderNum) {
                continue;
            }
            System.out.println("這是第" + orderNum + "個表的數據");
            for (int i = 0; i < table.numRows(); i++) {
                TableRow row = table.getRow(i);
                List<String> rowValues = new ArrayList<>();
                for (int j = 0; j < row.numCells(); j++) {
                    TableCell cell = row.getCell(j);
                    for (int k = 0; k < cell.numParagraphs(); k++) {
                        String text = cell.getParagraph(k).text();
                        // Drop the trailing special character that ends the paragraph.
                        if (text != null && !text.isEmpty()) {
                            text = text.substring(0, text.length() - 1);
                        }
                        rowValues.add(text);
                        System.out.print(text + "[" + i + "," + j + "]" + "\t");
                    }
                }
                tableRows.add(rowValues);
                System.out.println();
            }
            break;
        }
        return tableRows;
    }

    /**
     * Collects the text of every paragraph whose style id is {@code "1"},
     * {@code "2"} or {@code "3"} and echoes each match to standard output.
     *
     * <p>NOTE(review): these ids presumably denote heading levels 1-3 in the
     * author's document template; style ids are template specific, so confirm
     * against the documents actually processed.
     *
     * @param path path of a {@code docx} file
     * @return the matching paragraph texts in document order
     * @throws IOException if the file cannot be opened or read
     */
    public static List<String> getWordTitles(String path) throws IOException {
        List<String> titles = new ArrayList<String>();
        try (InputStream is = new FileInputStream(path);
             XWPFDocument doc = new XWPFDocument(is)) {
            for (XWPFParagraph paragraph : doc.getParagraphs()) {
                String style = paragraph.getStyle();
                if (!isTitleStyle(style)) {
                    continue;
                }
                String text = paragraph.getParagraphText();
                System.out.println(text + "--[" + style + "]");
                titles.add(text);
            }
        }
        return titles;
    }

    /** Tells whether a paragraph style id is one of the ids treated as a title. */
    private static boolean isTitleStyle(String style) {
        return "1".equals(style) || "2".equals(style) || "3".equals(style);
    }

    /**
     * Prints the paragraphs and tables of the built-in sample {@code docx} file.
     *
     * <p>Original note: still to be optimized. Date: 2019/11/16 18:01, author: dxl.
     *
     * @throws IOException if the file cannot be opened or read
     */
    public static void getWordText() throws IOException {
        getWordText(DEFAULT_DOCX_PATH);
    }

    /**
     * Prints the paragraphs and tables of a {@code docx} file in body order.
     *
     * @param path path of the {@code docx} file to print
     * @throws IOException if the file cannot be opened or read
     */
    public static void getWordText(String path) throws IOException {
        try (InputStream in = new FileInputStream(path);
             XWPFDocument document = new XWPFDocument(in)) {
            // Body elements are the paragraphs and tables of the document.
            for (IBodyElement element : document.getBodyElements()) {
                if (element instanceof XWPFParagraph) {
                    getParagraphText((XWPFParagraph) element);
                } else if (element instanceof XWPFTable) {
                    getTableText((XWPFTable) element);
                }
            }
        }
    }

    /**
     * Prints one paragraph: a fixed marker line when it has no runs, otherwise
     * the concatenated run text followed by the paragraph alignment. A paragraph
     * whose runs contain no text prints nothing.
     *
     * @param paragraph the paragraph to print
     */
    private static void getParagraphText(XWPFParagraph paragraph) {
        List<XWPFRun> runs = paragraph.getRuns();
        if (runs.isEmpty()) {
            System.out.println("按了回車(新段落)");
            return;
        }
        StringBuilder runText = new StringBuilder();
        for (XWPFRun run : runs) {
            runText.append(run.text());
        }
        if (runText.length() > 0) {
            runText.append(",對齊方式:").append(paragraph.getAlignment().name());
            System.out.println(runText);
        }
    }

    /**
     * Prints every paragraph of every cell of a table, row by row.
     *
     * <p>Cells are walked paragraph by paragraph rather than through
     * {@code cell.getText()} so that the alignment of each paragraph can be
     * reported. Tables nested inside a cell are not visited.
     *
     * @param table the table to print
     */
    private static void getTableText(XWPFTable table) {
        for (XWPFTableRow row : table.getRows()) {
            for (XWPFTableCell cell : row.getTableCells()) {
                for (XWPFParagraph paragraph : cell.getParagraphs()) {
                    getParagraphText(paragraph);
                }
            }
        }
    }
}

 

發佈了64 篇原創文章 · 獲贊 6 · 訪問量 8萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章