POI讀取word2007+

    很多人使用POI讀取word的時候都會這麼寫：
InputStream inputStream = new FileInputStream(new File("e://company/test.docx"));
XWPFDocument document = new XWPFDocument(inputStream);
System.out.println(new XWPFWordExtractor(document).getText());
    但是這個方法其實有不少問題：文本框裏面的內容讀取不到，換行也有問題。因此我做了一些改進：
/**
 * 處理2007+的WORD
 * @param filePath 文件地址
 * @return word內容
 */
private static String read2007(String filePath) {
    InputStream inputStream = null;
    StringBuffer content = new StringBuffer();
    try {
        inputStream = new FileInputStream(new File(filePath));
        XWPFDocument document = new XWPFDocument(inputStream);
        // 讀取非表格文本框
        for (XWPFParagraph xwpfParagraph : document.getParagraphs()) {
            for(XWPFRun xwpfRun : xwpfParagraph.getRuns()) {
                content.append(getXMLContent(xwpfRun.getCTR().newCursor().xmlText())).append(NEW_LINE);
            }
        }
        // 讀取表格內文本框
        for(XWPFTable xwpfTable : document.getTables()) {
            for (XWPFTableRow xwpfTableRow : xwpfTable.getRows()) {
                for (XWPFTableCell xwpfTableCell : xwpfTableRow.getTableCells()) {
                    for (XWPFParagraph xwpfParagraph : xwpfTableCell.getParagraphs()) {
                        for(XWPFRun xwpfRun : xwpfParagraph.getRuns()) {
                            content.append(getXMLContent(xwpfRun.getCTR().newCursor().xmlText())).append(NEW_LINE);
                        }
                    }
                }
            }
        }
        // 讀取表格內容
        for(XWPFTable xwpfTable : document.getTables()) {
            for (XWPFTableRow xwpfTableRow : xwpfTable.getRows()) {
                for (XWPFTableCell xwpfTableCell : xwpfTableRow.getTableCells()) {
                    for (XWPFParagraph xwpfParagraph : xwpfTableCell.getParagraphs()) {
                        content.append(xwpfParagraph.getText()).append(NEW_LINE);
                    }
                }
            }
        }
        return content.toString();
    } catch (IOException e) {
        logger.error("解析word錯誤，文件地址：" + filePath, e);
    } finally {
        IOUtils.closeQuietly(inputStream);
    }
    return null;
}
 
/**
 * 獲取XML內容，可以使用遞歸cursor.getDomNode()
 * @param xml xml
 * @return xml內容
 */
private static String getXMLContent(String xml) {
    StringBuffer content = new StringBuffer();
    Document document;
    try {
        document = DocumentHelper.parseText(xml);
        List<?> namespaces = document.getRootElement().declaredNamespaces(); // 判斷是否有表格包含文本框
        boolean hasboxintab = false;
        for (Object object : namespaces) {
            Namespace namespace = (Namespace) object;
            if(NAMESPANCE_OF_TEXTBOX_IN_TABLE.equals(namespace.getPrefix())) {
                hasboxintab = true;
                break;
            }
        }
        if(!hasboxintab)
            return content.toString();
        for(Object node : document.selectNodes("//mc:Fallback//w:p")) {
            for(Object nodeb : ((Node) node).selectNodes(".//w:t")) {
                if(StringUtils.isNotEmpty(((Node) nodeb).getText()))
                    content.append(((Node) nodeb).getText());
            }
            content.append(NEW_LINE);
        }
    } catch (DocumentException e) {
        logger.error("XML轉化錯誤，內容：" + xml, e);
    }
    return content.toString();
}
    2003版本簡單一些：
/**
 * 處理2003的WORD
 * @param filePath 文件地址
 * @return word內容
 */
private static String read2003(String filePath) {
    InputStream inputStream = null;
    StringBuffer content = new StringBuffer();
    try {
        inputStream = new FileInputStream(new File(filePath));
        HWPFDocument document = new HWPFDocument(inputStream);
        String text = null;
        for (int i = 0; i < document.getMainTextboxRange().numParagraphs(); i++) { // 文本框
            text = document.getMainTextboxRange().getParagraph(i).text();
            if(StringUtils.isNotEmpty(text))
                content.append(text).append(NEW_LINE);
        }
        for (int i = 0; i < document.getRange().numParagraphs(); i++) { // 非文本框
            text = document.getRange().getParagraph(i).text();
            if(StringUtils.isNotEmpty(text) && StringUtils.isNotEmpty(text.trim())) // 注意這裏的trim()方法否者會出現亂碼
                content.append(text.trim()).append(NEW_LINE);
        }
        return content.toString();
    } catch (FileNotFoundException e) {
        logger.error("解析word錯誤，文件地址：" + filePath, e);
    } catch (IOException e) {
        logger.error("解析word錯誤，文件地址：" + filePath, e);
    } finally {
        IOUtils.closeQuietly(inputStream);
    }
    return null;
}
    注意：讀取出的內容包括表格裏面的內容、文本框內容和直接寫在編輯區裏面的文本；其他諸如批註、引用等信息可能讀取不到，有需要請自行處理。
    比較完整的代碼：
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.dom4j.Namespace;
import org.dom4j.Node;
 
/**
 * WordReaderUtils - WORD 讀取
 * 
 * @author 500d Team
 * @version 1.0
 */
public class WordReaderUtils {
 
    private static final String WORD_2003 = "doc";
    private static final String WORD_2007 = "docx";
    private static final Logger logger = Logger.getLogger(WordReaderUtils.class);
    public static final String NEW_LINE = "\r\n";
     
    public static String read(String filePath) {
        File wordFile = StringUtils.isNotEmpty(filePath) ? new File(filePath) : null;
        if (wordFile == null || !wordFile.exists() || !wordFile.isFile())
            return null;
        String extension = FilenameUtils.getExtension(filePath);
        if(StringUtils.isEmpty(extension))
            return null;
        String content = null;
        if(WORD_2003.equals(extension.toLowerCase()))
            content = read2003(filePath);
        else if(WORD_2007.equals(extension.toLowerCase()))
            content = read2007(filePath);
        return Crossover.handle(content);
    }
     
    /**
     * 處理2003的WORD
     * @param filePath 文件地址
     * @return word內容
     */
    private static String read2003(String filePath) {
        InputStream inputStream = null;
        StringBuffer content = new StringBuffer();
        try {
            inputStream = new FileInputStream(new File(filePath));
            HWPFDocument document = new HWPFDocument(inputStream);
            String text = null;
            for (int i = 0; i < document.getMainTextboxRange().numParagraphs(); i++) {
                text = document.getMainTextboxRange().getParagraph(i).text();
                if(StringUtils.isNotEmpty(text))
                    content.append(text).append(NEW_LINE);
            }
            for (int i = 0; i < document.getRange().numParagraphs(); i++) {
                text = document.getRange().getParagraph(i).text();
                if(StringUtils.isNotEmpty(text) && StringUtils.isNotEmpty(text.trim())) // 注意這裏的trim()方法否者會出現亂碼
                    content.append(text.trim()).append(NEW_LINE);
            }
            return content.toString();
        } catch (FileNotFoundException e) {
            logger.error("解析word錯誤，文件地址：" + filePath, e);
        } catch (IOException e) {
            logger.error("解析word錯誤，文件地址：" + filePath, e);
        } finally {
            IOUtils.closeQuietly(inputStream);
        }
        return null;
    }
     
    /**
     * 處理2007+的WORD
     * @param filePath 文件地址
     * @return word內容
     */
    private static String read2007(String filePath) {
        InputStream inputStream = null;
        StringBuffer content = new StringBuffer();
        try {
            inputStream = new FileInputStream(new File(filePath));
            XWPFDocument document = new XWPFDocument(inputStream);
            // 讀取非表格文本框
            for (XWPFParagraph xwpfParagraph : document.getParagraphs()) {
                for(XWPFRun xwpfRun : xwpfParagraph.getRuns()) {
                    content.append(getXMLContent(xwpfRun.getCTR().newCursor().xmlText())).append(NEW_LINE);
                }
            }
            // 讀取表格內文本框
            for(XWPFTable xwpfTable : document.getTables()) {
                for (XWPFTableRow xwpfTableRow : xwpfTable.getRows()) {
                    for (XWPFTableCell xwpfTableCell : xwpfTableRow.getTableCells()) {
                        for (XWPFParagraph xwpfParagraph : xwpfTableCell.getParagraphs()) {
                            for(XWPFRun xwpfRun : xwpfParagraph.getRuns()) {
                                content.append(getXMLContent(xwpfRun.getCTR().newCursor().xmlText())).append(NEW_LINE);
                            }
                        }
                    }
                }
            }
            // 讀取表格內容
            for(XWPFTable xwpfTable : document.getTables()) {
                for (XWPFTableRow xwpfTableRow : xwpfTable.getRows()) {
                    for (XWPFTableCell xwpfTableCell : xwpfTableRow.getTableCells()) {
                        for (XWPFParagraph xwpfParagraph : xwpfTableCell.getParagraphs()) {
                            content.append(xwpfParagraph.getText()).append(NEW_LINE);
                        }
                    }
                }
            }
            return content.toString();
        } catch (IOException e) {
            logger.error("解析word錯誤，文件地址：" + filePath, e);
        } finally {
            IOUtils.closeQuietly(inputStream);
        }
        return null;
    }
     
    /**
     * 獲取XML內容，可以使用遞歸cursor.getDomNode()
     * @param xml xml
     * @return xml內容
     */
    private static String getXMLContent(String xml) {
        StringBuffer content = new StringBuffer();
        Document document;
        try {
            document = DocumentHelper.parseText(xml);
            List<?> namespaces = document.getRootElement().declaredNamespaces(); // 判斷是否有表格包含文本框
            boolean hasboxintab = false;
            for (Object object : namespaces) {
                Namespace namespace = (Namespace) object;
                if(NAMESPANCE_OF_TEXTBOX_IN_TABLE.equals(namespace.getPrefix())) {
                    hasboxintab = true;
                    break;
                }
            }
            if(!hasboxintab)
                return content.toString();
            for(Object node : document.selectNodes("//mc:Fallback//w:p")) {
                for(Object nodeb : ((Node) node).selectNodes(".//w:t")) {
                    if(StringUtils.isNotEmpty(((Node) nodeb).getText()))
                        content.append(((Node) nodeb).getText());
                }
                content.append(NEW_LINE);
            }
        } catch (DocumentException e) {
            logger.error("XML轉化錯誤，內容：" + xml, e);
        }
        return content.toString();
    }
     
    public static void main(String[] args) throws Exception {
//      System.out.println(read("e://company/test.doc"));
//      System.out.println(read("e://company/test.docx"));
    }
     
}
參考文檔：http://www.acgist.com/article/206.html
POI讀取word2007+

10分鐘搞定Mysql主從部署配置

如何使用 JS 判斷用戶是否處於活躍狀態

「Pygors跨平臺GUI」2：安裝MinGW-w64、MSYS2還是WSL2

[轉帖]

python列出centos7內存使用前50的進程信息

「Pygors跨平臺GUI」1：Pygors跨平臺GUI應用研究

一鍵自動化博客發佈工具,用過的人都說好(掘金篇)

lightdb數據庫超時相關控制參數

lightdb秒級增加列和刪除列（not null帶默認值）

Java ThreadPoolShutdown

POI讀取word2007+

java實現文件變化監控

Layer-LayUI插件功能介紹

FreeMarker 中文官方參考手冊

Idea2017破解下載地址

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結