Java 解析 Word 中的表格

import java.io.File;  
import java.io.FileInputStream;  
import java.io.FileNotFoundException;  
 
import org.apache.poi.hwpf.HWPFDocument;  
import org.apache.poi.hwpf.usermodel.Paragraph;  
import org.apache.poi.hwpf.usermodel.Range;  
import org.apache.poi.hwpf.usermodel.Table;  
import org.apache.poi.hwpf.usermodel.TableCell;  
import org.apache.poi.hwpf.usermodel.TableIterator;  
import org.apache.poi.hwpf.usermodel.TableRow;  
 
import java.io.File;     
import java.io.FileInputStream;     
import java.io.InputStream;     
    
import org.apache.poi.POIXMLDocument;     
import org.apache.poi.POIXMLTextExtractor;     
import org.apache.poi.hwpf.extractor.WordExtractor;     
import org.apache.poi.openxml4j.opc.OPCPackage;     
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;    
 
 
import org.apache.poi.poifs.filesystem.POIFSFileSystem;  
 
public class ExportDocImpl  
{  
    public void testWord(){  
        try{  
            FileInputStream in = new FileInputStream("D:\\2003.doc");//載入文檔  
           POIFSFileSystem pfs = new POIFSFileSystem(in);     
            HWPFDocument hwpf = new HWPFDocument(pfs);     
            Range range = hwpf.getRange();//得到文檔的讀取範圍  
            TableIterator it = new TableIterator(range);  
           //迭代文檔中的表格  
            while (it.hasNext()) {     
                Table tb = (Table) it.next();     
                //迭代行,默認從0開始  
                for (int i = 0; i < tb.numRows(); i++) {     
                    TableRow tr = tb.getRow(i);     
                    //迭代列,默認從0開始  
                    for (int j = 0; j < tr.numCells(); j++) {     
                        TableCell td = tr.getCell(j);//取得單元格  
                        //取得單元格的內容  
                        for(int k=0;k<td.numParagraphs();k++){     
                            Paragraph para =td.getParagraph(k);     
                            String s = para.text();     
                           
                        } //end for      
                    }   //end for  
                }   //end for  
            } //end while  
        }catch(Exception e){  
            e.printStackTrace();  
        }  
    }//end method  
      
    public static void main(String[] args){
     ExportDocImpl ExportDocImpl = new ExportDocImpl();
     ExportDocImpl.testWord1();
    }  
   
   
   
   
           public void testWord1(){  
           try {     
            //word 2003: 圖片不會被讀取     
            InputStream is = new FileInputStream(new File("D:\\2003.doc"));     
                  WordExtractor ex = new WordExtractor(is);   
                  String str1 = ex.getText();
                  str1 = str1.replaceAll("", ";");
                  String text2003 = str1;     
                  System.out.println(text2003);     
                 
            //word 2007 圖片不會被讀取, 表格中的數據會被放在字符串的最後     
            OPCPackage opcPackage = POIXMLDocument.openPackage("D:\\2007.docx");     
                  POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);  
                  String str = extractor.getText();
                  str = str.replaceAll(" ", ";");
                  String text2007 = str;     
                  System.out.println(text2007);     
                 
        } catch (Exception e) {     
                  e.printStackTrace();     
        }   
    }  

 

 

 

 
import java.io.File;  
import java.io.FileInputStream;  
import java.io.InputStream;  
 
import org.apache.poi.POIXMLDocument;  
import org.apache.poi.POIXMLTextExtractor;  
import org.apache.poi.hwpf.extractor.WordExtractor;  
import org.apache.poi.openxml4j.opc.OPCPackage;  
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;  
 
/** 
* POI 讀取 word 2003 和 word 2007 中文字內容的測試類<br /> 
* @createDate 2009-07-25 
* @author Carl He 
*/ 
public class ParseTable {  
    public static void main(String[] args) {  
        try {  
            //word 2003: 圖片不會被讀取  
              InputStream is = new FileInputStream(new File("d:\\2003.doc"));  
            WordExtractor ex = new WordExtractor(is);  
            String text2003 = ex.getText();  
            System.out.println(text2003);  
 
            //word 2007 圖片不會被讀取, 表格中的數據會被放在字符串的最後  
            OPCPackage opcPackage = POIXMLDocument.openPackage("d:\\2007.docx");  
            POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);  
            String text2007 = extractor.getText();  
            System.out.println(text2007);  
     
        } catch (Exception e) {  
            e.printStackTrace();  
        }  
    }  

 

 

 

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;

import org.apache.poi.hwpf.extractor.WordExtractor;

public class ParseWord {

 public static void main(String[] args) {
  File file = new File("d:\\hello.doc");
  try {
   FileInputStream fis = new FileInputStream(file);
   WordExtractor wordExtractor = new WordExtractor(fis);
   System.out.println(wordExtractor.getText());
  } catch (FileNotFoundException e) {
   e.printStackTrace();
  } catch (IOException e) {
   e.printStackTrace();
  }
 }
}

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章