很多人使用POI讀取word的時候都會這麼寫:
1 2 3 | XWPFDocument document = new XWPFDocument(inputStream); System.out.println( new XWPFWordExtractor(document).getText()); |
但是這個方法其實有非常多的問題:文本框裏面的內容讀取不到,換行也有問題。因此我改進了一下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 | /** * 處理2007+的WORD * @param filePath 文件地址 * @return word內容 */ private static String read2007(String filePath) { InputStream inputStream = null ; StringBuffer content = new StringBuffer(); try { inputStream = new FileInputStream( new File(filePath)); XWPFDocument document = new XWPFDocument(inputStream); // 讀取非表格文本框 for (XWPFParagraph xwpfParagraph : document.getParagraphs()) { for (XWPFRun xwpfRun : xwpfParagraph.getRuns()) { content.append(getXMLContent(xwpfRun.getCTR().newCursor().xmlText())).append(NEW_LINE); } } // 讀取表格內文本框 for (XWPFTable xwpfTable : document.getTables()) { for (XWPFTableRow xwpfTableRow : xwpfTable.getRows()) { for (XWPFTableCell xwpfTableCell : xwpfTableRow.getTableCells()) { for (XWPFParagraph xwpfParagraph : xwpfTableCell.getParagraphs()) { for (XWPFRun xwpfRun : xwpfParagraph.getRuns()) { content.append(getXMLContent(xwpfRun.getCTR().newCursor().xmlText())).append(NEW_LINE); } } } } } // 讀取表格內容 for (XWPFTable xwpfTable : document.getTables()) { for (XWPFTableRow xwpfTableRow : xwpfTable.getRows()) { for (XWPFTableCell xwpfTableCell : xwpfTableRow.getTableCells()) { for (XWPFParagraph xwpfParagraph : xwpfTableCell.getParagraphs()) { content.append(xwpfParagraph.getText()).append(NEW_LINE); } } } } return content.toString(); } catch (IOException e) { logger.error( "解析word錯誤,文件地址:" + filePath, e); } finally { IOUtils.closeQuietly(inputStream); } return null ; } /** * 獲取XML內容,可以使用遞歸cursor.getDomNode() * @param xml xml * @return xml內容 */ private static String getXMLContent(String xml) { StringBuffer content = new StringBuffer(); Document document; try { document = DocumentHelper.parseText(xml); List<?> namespaces = document.getRootElement().declaredNamespaces(); // 判斷是否有表格包含文本框 boolean hasboxintab = false ; for 
(Object object : namespaces) { Namespace namespace = (Namespace) object; if (NAMESPANCE_OF_TEXTBOX_IN_TABLE.equals(namespace.getPrefix())) { hasboxintab = true ; break ; } } if (!hasboxintab) return content.toString(); for (Object node : document.selectNodes( "//mc:Fallback//w:p" )) { for (Object nodeb : ((Node) node).selectNodes( ".//w:t" )) { if (StringUtils.isNotEmpty(((Node) nodeb).getText())) content.append(((Node) nodeb).getText()); } content.append(NEW_LINE); } } catch (DocumentException e) { logger.error( "XML轉化錯誤,內容:" + xml, e); } return content.toString(); } |
2003版本簡單一些:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 | /** * 處理2003的WORD * @param filePath 文件地址 * @return word內容 */ private static String read2003(String filePath) { InputStream inputStream = null ; StringBuffer content = new StringBuffer(); try { inputStream = new FileInputStream( new File(filePath)); HWPFDocument document = new HWPFDocument(inputStream); String text = null ; for ( int i = 0 ; i < document.getMainTextboxRange().numParagraphs(); i++) { // 文本框 text = document.getMainTextboxRange().getParagraph(i).text(); if (StringUtils.isNotEmpty(text)) content.append(text).append(NEW_LINE); } for ( int i = 0 ; i < document.getRange().numParagraphs(); i++) { // 非文本框 text = document.getRange().getParagraph(i).text(); if (StringUtils.isNotEmpty(text) && StringUtils.isNotEmpty(text.trim())) // 注意這裏的trim()方法否者會出現亂碼 content.append(text.trim()).append(NEW_LINE); } return content.toString(); } catch (FileNotFoundException e) { logger.error( "解析word錯誤,文件地址:" + filePath, e); } catch (IOException e) { logger.error( "解析word錯誤,文件地址:" + filePath, e); } finally { IOUtils.closeQuietly(inputStream); } return null ; } |
注意:讀取出的內容包括表格裏面的內容、文本框內容和直接寫在編輯區裏面的文本;其他諸如批註、引用等信息可能讀取不到,需要的請自行解決。
比較完整的代碼:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 | import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import org.apache.commons.io.FilenameUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; import org.apache.log4j.Logger; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFParagraph; import org.apache.poi.xwpf.usermodel.XWPFRun; import org.apache.poi.xwpf.usermodel.XWPFTable; import org.apache.poi.xwpf.usermodel.XWPFTableCell; import org.apache.poi.xwpf.usermodel.XWPFTableRow; import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.DocumentHelper; import org.dom4j.Node; /** * WordReaderUtils - WORD 讀取 * * @author 500d Team * @version 1.0 */ public class WordReaderUtils { private static final String WORD_2003 = "doc" ; private static final String WORD_2007 = "docx" ; private static final Logger logger = Logger.getLogger(WordReaderUtils. class ); public static final String NEW_LINE = "\r\n" ; public static String read(String filePath) { File wordFile = StringUtils.isNotEmpty(filePath) ? 
new File(filePath) : null ; if (wordFile == null || !wordFile.exists() || !wordFile.isFile()) return null ; String extension = FilenameUtils.getExtension(filePath); if (StringUtils.isEmpty(extension)) return null ; String content = null ; if (WORD_2003.equals(extension.toLowerCase())) content = read2003(filePath); else if (WORD_2007.equals(extension.toLowerCase())) content = read2007(filePath); return Crossover.handle(content); } /** * 處理2003的WORD * @param filePath 文件地址 * @return word內容 */ private static String read2003(String filePath) { InputStream inputStream = null ; StringBuffer content = new StringBuffer(); try { inputStream = new FileInputStream( new File(filePath)); HWPFDocument document = new HWPFDocument(inputStream); String text = null ; for ( int i = 0 ; i < document.getMainTextboxRange().numParagraphs(); i++) { text = document.getMainTextboxRange().getParagraph(i).text(); if (StringUtils.isNotEmpty(text)) content.append(text).append(NEW_LINE); } for ( int i = 0 ; i < document.getRange().numParagraphs(); i++) { text = document.getRange().getParagraph(i).text(); if (StringUtils.isNotEmpty(text) && StringUtils.isNotEmpty(text.trim())) // 注意這裏的trim()方法否者會出現亂碼 content.append(text.trim()).append(NEW_LINE); } return content.toString(); } catch (FileNotFoundException e) { logger.error( "解析word錯誤,文件地址:" + filePath, e); } catch (IOException e) { logger.error( "解析word錯誤,文件地址:" + filePath, e); } finally { IOUtils.closeQuietly(inputStream); } return null ; } /** * 處理2007+的WORD * @param filePath 文件地址 * @return word內容 */ private static String read2007(String filePath) { InputStream inputStream = null ; StringBuffer content = new StringBuffer(); try { inputStream = new FileInputStream( new File(filePath)); XWPFDocument document = new XWPFDocument(inputStream); // 讀取非表格文本框 for (XWPFParagraph xwpfParagraph : document.getParagraphs()) { for (XWPFRun xwpfRun : xwpfParagraph.getRuns()) { 
content.append(getXMLContent(xwpfRun.getCTR().newCursor().xmlText())).append(NEW_LINE); } } // 讀取表格內文本框 for (XWPFTable xwpfTable : document.getTables()) { for (XWPFTableRow xwpfTableRow : xwpfTable.getRows()) { for (XWPFTableCell xwpfTableCell : xwpfTableRow.getTableCells()) { for (XWPFParagraph xwpfParagraph : xwpfTableCell.getParagraphs()) { for (XWPFRun xwpfRun : xwpfParagraph.getRuns()) { content.append(getXMLContent(xwpfRun.getCTR().newCursor().xmlText())).append(NEW_LINE); } } } } } // 讀取表格內容 for (XWPFTable xwpfTable : document.getTables()) { for (XWPFTableRow xwpfTableRow : xwpfTable.getRows()) { for (XWPFTableCell xwpfTableCell : xwpfTableRow.getTableCells()) { for (XWPFParagraph xwpfParagraph : xwpfTableCell.getParagraphs()) { content.append(xwpfParagraph.getText()).append(NEW_LINE); } } } } return content.toString(); } catch (IOException e) { logger.error( "解析word錯誤,文件地址:" + filePath, e); } finally { IOUtils.closeQuietly(inputStream); } return null ; } /** * 獲取XML內容,可以使用遞歸cursor.getDomNode() * @param xml xml * @return xml內容 */ private static String getXMLContent(String xml) { StringBuffer content = new StringBuffer(); Document document; try { document = DocumentHelper.parseText(xml); List<?> namespaces = document.getRootElement().declaredNamespaces(); // 判斷是否有表格包含文本框 boolean hasboxintab = false ; for (Object object : namespaces) { Namespace namespace = (Namespace) object; if (NAMESPANCE_OF_TEXTBOX_IN_TABLE.equals(namespace.getPrefix())) { hasboxintab = true ; break ; } } if (!hasboxintab) return content.toString(); for (Object node : document.selectNodes( "//mc:Fallback//w:p" )) { for (Object nodeb : ((Node) node).selectNodes( ".//w:t" )) { if (StringUtils.isNotEmpty(((Node) nodeb).getText())) content.append(((Node) nodeb).getText()); } content.append(NEW_LINE); } } catch (DocumentException e) { logger.error( "XML轉化錯誤,內容:" + xml, e); } return content.toString(); } public static void main(String[] args) throws Exception { // 
System.out.println(read("e://company/test.doc")); // System.out.println(read("e://company/test.docx")); } } |
參考文檔:http://www.acgist.com/article/206.html