1、添加依賴
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.15</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.15</version>
</dependency>
2、工具類
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.*;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xwpf.usermodel.*;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
 * Utility for reading content from Word documents with Apache POI.
 *
 * <p>Table extraction supports both the binary {@code .doc} format (HWPF) and the
 * OOXML {@code .docx} format (XWPF). The heading and body-text helpers only
 * support {@code .docx}.
 *
 * @author Fise19
 */
public class ReadWordUtil {

    /** Document read by {@link #main(String[])} when no path is given on the command line. */
    private static final String DEFAULT_DOC_PATH = "C:/Users/00/Desktop/檢測報告/123.doc";

    /** Document read by the no-argument {@link #getWordText()}. */
    private static final String DEFAULT_DOCX_PATH = "C:/Users/00/Desktop/檢測報告/123.docx";

    /**
     * Demo entry point: prints every cell of the first table of a document.
     *
     * @param args optional; {@code args[0]} overrides the default document path
     */
    public static void main(String[] args) {
        String filePath = args.length > 0 ? args[0] : DEFAULT_DOC_PATH;
        List<List<String>> lists = tableInWord(filePath, 1);
        if (lists == null) {
            // tableInWord has already reported the cause on stderr.
            System.err.println("Could not read table 1 from " + filePath);
            return;
        }
        for (List<String> ss : lists) {
            for (String s : ss) {
                System.out.println(">>>>>>>>" + s);
            }
        }
    }

    /**
     * Reads one table from a Word document.
     *
     * <p>A path ending in {@code docx} (case-insensitive) is parsed as OOXML; any
     * other path is parsed as a binary {@code .doc} file. The selected table is
     * also echoed to standard output, cell by cell.
     *
     * <p>For {@code .docx} each inner list holds one string per cell. For
     * {@code .doc} each inner list holds one string per <em>paragraph</em> of each
     * cell, so a multi-paragraph cell contributes several entries.
     *
     * @param filePath path of the document to read
     * @param orderNum 1-based position of the table within the document
     * @return the rows of the selected table; an empty list when the document has
     *         no table at that position; {@code null} when an argument is invalid
     *         or the file cannot be opened or parsed
     */
    public static List<List<String>> tableInWord(String filePath, Integer orderNum) {
        if (filePath == null || orderNum == null || orderNum < 1) {
            System.err.println("tableInWord: invalid arguments, filePath=" + filePath
                    + ", orderNum=" + orderNum);
            return null;
        }
        int position = orderNum;
        // try-with-resources guarantees the file handle is released on every path.
        try (FileInputStream in = new FileInputStream(filePath)) {
            if (filePath.toLowerCase().endsWith("docx")) {
                return readDocxTable(in, position);
            }
            return readDocTable(in, position);
        } catch (Exception e) {
            // Deliberately broad: callers of this best-effort helper expect null,
            // not an exception, for unreadable or malformed documents.
            e.printStackTrace();
            return null;
        }
    }

    /** Extracts the {@code position}-th (1-based) table of a .docx stream; empty list if absent. */
    private static List<List<String>> readDocxTable(InputStream in, int position) throws IOException {
        try (XWPFDocument xwpf = new XWPFDocument(in)) {
            Iterator<XWPFTable> it = xwpf.getTablesIterator();
            int current = 0;
            while (it.hasNext()) {
                XWPFTable table = it.next();
                current++;
                if (current == position) {
                    System.out.println("這是第" + position + "個表的數據");
                    return extractDocxRows(table);
                }
            }
        }
        return new ArrayList<>();
    }

    /** Collects and echoes the text of every cell of a .docx table, one inner list per row. */
    private static List<List<String>> extractDocxRows(XWPFTable table) {
        List<List<String>> tableList = new ArrayList<>();
        List<XWPFTableRow> rows = table.getRows();
        for (int i = 0; i < rows.size(); i++) {
            List<XWPFTableCell> cells = rows.get(i).getTableCells();
            List<String> rowList = new ArrayList<>(cells.size());
            for (int j = 0; j < cells.size(); j++) {
                String text = cells.get(j).getText();
                rowList.add(text);
                System.out.print(text + "[" + i + "," + j + "]" + "\t");
            }
            tableList.add(rowList);
            System.out.println();
        }
        return tableList;
    }

    /** Extracts the {@code position}-th (1-based) table of a binary .doc stream; empty list if absent. */
    private static List<List<String>> readDocTable(InputStream in, int position) throws IOException {
        HWPFDocument hwpf = new HWPFDocument(new POIFSFileSystem(in));
        Range range = hwpf.getRange();
        TableIterator it = new TableIterator(range);
        int current = 0;
        while (it.hasNext()) {
            Table table = it.next();
            current++;
            if (current == position) {
                System.out.println("這是第" + position + "個表的數據");
                return extractDocRows(table);
            }
        }
        return new ArrayList<>();
    }

    /** Collects and echoes the text of every cell paragraph of a .doc table, one inner list per row. */
    private static List<List<String>> extractDocRows(Table table) {
        List<List<String>> tableList = new ArrayList<>();
        for (int i = 0; i < table.numRows(); i++) {
            List<String> rowList = new ArrayList<>();
            TableRow row = table.getRow(i);
            for (int j = 0; j < row.numCells(); j++) {
                TableCell cell = row.getCell(j);
                for (int k = 0; k < cell.numParagraphs(); k++) {
                    String s = cell.getParagraph(k).text();
                    // Drop the trailing special character that ends each paragraph's text.
                    if (null != s && !"".equals(s)) {
                        s = s.substring(0, s.length() - 1);
                    }
                    rowList.add(s);
                    System.out.print(s + "[" + i + "," + j + "]" + "\t");
                }
            }
            tableList.add(rowList);
            System.out.println();
        }
        return tableList;
    }

    /**
     * Collects the text of every paragraph of a {@code .docx} document whose style
     * id is {@code "1"}, {@code "2"} or {@code "3"}, echoing each match to
     * standard output.
     *
     * <p>NOTE(review): these ids are presumably the heading styles of the
     * documents this was written for; style ids vary between templates, so
     * confirm against the actual input.
     *
     * @param path path of the {@code .docx} file
     * @return the matching paragraph texts in document order (never {@code null})
     * @throws IOException if the file cannot be opened or read
     */
    public static List<String> getWordTitles(String path) throws IOException {
        List<String> list = new ArrayList<String>();
        try (InputStream is = new FileInputStream(path);
             XWPFDocument doc = new XWPFDocument(is)) {
            for (XWPFParagraph graph : doc.getParagraphs()) {
                String style = graph.getStyle();
                if ("1".equals(style) || "2".equals(style) || "3".equals(style)) {
                    String text = graph.getParagraphText();
                    System.out.println(text + "--[" + style + "]");
                    list.add(text);
                }
            }
        }
        return list;
    }

    /**
     * Prints the paragraphs and tables of the default {@code .docx} document to
     * standard output, in document order.
     *
     * <p>Date: 2019/11/16 18:01, Author: dxl
     *
     * @throws IOException if the file cannot be opened or read
     */
    public static void getWordText() throws IOException {
        getWordText(DEFAULT_DOCX_PATH);
    }

    /**
     * Prints the paragraphs and tables of a {@code .docx} document to standard
     * output, in document order.
     *
     * @param filePath path of the {@code .docx} file
     * @throws IOException if the file cannot be opened or read
     */
    public static void getWordText(String filePath) throws IOException {
        try (InputStream in = new FileInputStream(filePath);
             XWPFDocument document = new XWPFDocument(in)) {
            // Body elements are the document's paragraphs and tables, interleaved.
            for (IBodyElement element : document.getBodyElements()) {
                if (element instanceof XWPFParagraph) {
                    getParagraphText((XWPFParagraph) element);
                } else if (element instanceof XWPFTable) {
                    getTableText((XWPFTable) element);
                }
            }
        }
    }

    /**
     * Prints the concatenated run text of a paragraph followed by its alignment.
     * A paragraph without runs is reported as an empty line break; a paragraph
     * whose runs contain no text prints nothing.
     *
     * @param paragraph the paragraph to print
     */
    private static void getParagraphText(XWPFParagraph paragraph) {
        List<XWPFRun> runs = paragraph.getRuns();
        if (runs.isEmpty()) {
            System.out.println("按了回車(新段落)");
            return;
        }
        StringBuilder runText = new StringBuilder();
        for (XWPFRun run : runs) {
            runText.append(run.text());
        }
        if (runText.length() > 0) {
            runText.append(",對齊方式:").append(paragraph.getAlignment().name());
            System.out.println(runText);
        }
    }

    /**
     * Prints every paragraph of every cell of a table. Cells are walked paragraph
     * by paragraph (rather than via {@code cell.getText()}) so that the alignment
     * of each paragraph can be reported as well.
     *
     * @param table the table to print
     */
    private static void getTableText(XWPFTable table) {
        for (XWPFTableRow row : table.getRows()) {
            for (XWPFTableCell cell : row.getTableCells()) {
                for (XWPFParagraph paragraph : cell.getParagraphs()) {
                    getParagraphText(paragraph);
                }
            }
        }
    }
}