import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableIterator;
import org.apache.poi.hwpf.usermodel.TableRow;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
 * Apache POI demo that reads Word documents from fixed paths on drive D:.
 *
 * <p>{@link #testWord()} walks the tables of a Word 2003 document;
 * {@link #testWord1()} extracts and prints the text of a Word 2003 and a
 * Word 2007 document.
 */
public class ExportDocImpl
{
    /**
     * Opens {@code D:\2003.doc} and visits every paragraph of every cell of
     * every row of every table in it.
     *
     * <p>Any exception is printed to standard error; the input stream is
     * closed whether or not reading succeeds.
     */
    public void testWord() {
        FileInputStream in = null;
        try {
            in = new FileInputStream("D:\\2003.doc"); // load the document
            POIFSFileSystem pfs = new POIFSFileSystem(in);
            HWPFDocument hwpf = new HWPFDocument(pfs);
            Range range = hwpf.getRange(); // the readable range of the document
            TableIterator it = new TableIterator(range);
            // Iterate over the tables in the document.
            while (it.hasNext()) {
                Table tb = (Table) it.next();
                // Iterate over the rows, starting at index 0.
                for (int i = 0; i < tb.numRows(); i++) {
                    TableRow tr = tb.getRow(i);
                    // Iterate over the cells of the row, starting at index 0.
                    for (int j = 0; j < tr.numCells(); j++) {
                        TableCell td = tr.getCell(j);
                        // Read the content of the cell, paragraph by paragraph.
                        for (int k = 0; k < td.numParagraphs(); k++) {
                            Paragraph para = td.getParagraph(k);
                            // NOTE(review): the paragraph text is read but never used;
                            // presumably a placeholder for real processing - confirm.
                            para.text();
                        }
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
    }

    /**
     * Entry point: runs {@link #testWord1()} on a fresh instance.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        ExportDocImpl exportDocImpl = new ExportDocImpl();
        exportDocImpl.testWord1();
    }

    /**
     * Prints the text of {@code D:\2003.doc} and {@code D:\2007.docx} to
     * standard output after applying a {@code replaceAll} substitution to each.
     *
     * <p>Any exception is printed to standard error. The input stream is
     * closed and the OOXML package is released without saving, whether or not
     * extraction succeeds.
     */
    public void testWord1() {
        InputStream is = null;
        OPCPackage opcPackage = null;
        try {
            // Word 2003: images are not read.
            is = new FileInputStream(new File("D:\\2003.doc"));
            WordExtractor ex = new WordExtractor(is);
            String str1 = ex.getText();
            // NOTE(review): as written the pattern is the empty regex, which matches
            // at every position, so ';' is inserted around every character.
            // Presumably a non-printable separator character was lost from this
            // literal - confirm against the original intent.
            str1 = str1.replaceAll("", ";");
            String text2003 = str1;
            System.out.println(text2003);

            // Word 2007: images are not read; table data is placed at the end of the string.
            opcPackage = POIXMLDocument.openPackage("D:\\2007.docx");
            POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
            String str = extractor.getText();
            str = str.replaceAll(" ", ";");
            String text2007 = str;
            System.out.println(text2007);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(is);
            if (opcPackage != null) {
                // revert() rather than close(): the package is only read here,
                // so nothing should be written back to the file.
                opcPackage.revert();
            }
        }
    }

    /**
     * Closes the given resource if it is non-null, printing (not propagating)
     * any {@link java.io.IOException} raised by {@code close()}.
     *
     * @param resource the resource to close; may be {@code null}
     */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (java.io.IOException e) {
            e.printStackTrace();
        }
    }
}
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
/**
 * Test class that uses POI to read the text content of Word 2003 and
 * Word 2007 documents.<br />
 * @createDate 2009-07-25
 * @author Carl He
 */
public class ParseTable {
    /**
     * Prints the extracted text of {@code d:\2003.doc} followed by that of
     * {@code d:\2007.docx} to standard output.
     *
     * <p>Any exception is printed to standard error. The input stream is
     * closed and the OOXML package is released without saving, whether or not
     * extraction succeeds.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        InputStream is = null;
        OPCPackage opcPackage = null;
        try {
            // Word 2003: images are not read.
            is = new FileInputStream(new File("d:\\2003.doc"));
            WordExtractor ex = new WordExtractor(is);
            String text2003 = ex.getText();
            System.out.println(text2003);

            // Word 2007: images are not read; table data is placed at the end of the string.
            opcPackage = POIXMLDocument.openPackage("d:\\2007.docx");
            POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
            String text2007 = extractor.getText();
            System.out.println(text2007);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (is != null) {
                try {
                    is.close();
                } catch (java.io.IOException e) {
                    e.printStackTrace();
                }
            }
            if (opcPackage != null) {
                // revert() rather than close(): the package is only read here,
                // so nothing should be written back to the file.
                opcPackage.revert();
            }
        }
    }
}
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import org.apache.poi.hwpf.extractor.WordExtractor;
/**
 * Minimal demo that prints the text of a Word 2003 document using POI's
 * {@code WordExtractor}.
 */
public class ParseWord {
    /**
     * Prints the extracted text of {@code d:\hello.doc} to standard output.
     *
     * <p>A missing file or read failure is printed to standard error. The
     * input stream is closed whether or not extraction succeeds.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        File file = new File("d:\\hello.doc");
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(file);
            WordExtractor wordExtractor = new WordExtractor(fis);
            System.out.println(wordExtractor.getText());
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the file handle even when extraction fails.
            if (fis != null) {
                try {
                    fis.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}