java解析word2003 doc文件中的表格

1;apache poi插件鏈接http://poi.apache.org/ 這個插件主要用於office文件文本內容以及富文本(表格,圖片)等的提取,還支持對已知密碼的office文件的提取,
 黑框裏面的是Word (HWPF+XWPF)
其他格式的文件加密以及解密都有 見鏈接http://poi.apache.org/encryption.html
http://www.openoffice.org/sc/compdocfileformat.pdf
因爲之前做全文檢索的時候需要提取word03裏面的表格進行分析,使用這個插件對加密的文件不支持,所以研究了下doc文件格式以及它的加密方式,另外office07系列的比較簡單,基本都是對xml解析,這裏就不分析了。再說一句,這裏只是對linux平臺,其他調用微軟com的那套就不用說了。

2;首先先說下word97-03文件是怎麼加密的,word加密時並不是所有字節都加密,只是帶有文本,表格,富文本,圖片的stream纔會加密,涉及到的加密算法有RC4,MD5算法。
具體細節參考:
http://wenku.baidu.com/link?url=TrCAFtr1mHXZbh3qnnOlhYmXTS7-ynw-CES0W6KOUzRzzRg7l04Y5LXl0V3W8pQRGO4SxyzXDlXk5zlLOzPphlXOxfC6UglqRzJlnb6439_

這裏只是列出一些代碼實現:
字符串轉Unicode轉義序列(utf8ToUnicode)

 /**
  * Rewrites a string so that only Basic Latin characters are kept literally.
  *
  * <ul>
  *   <li>Characters in the Basic Latin block are copied through unchanged.</li>
  *   <li>Characters in the Halfwidth and Fullwidth Forms block are shifted down by
  *       65248 (0xFEE0), which maps the fullwidth ASCII variants onto plain ASCII.</li>
  *   <li>Every other UTF-16 code unit is written as a backslash, the letter u and
  *       exactly four lowercase hexadecimal digits.</li>
  * </ul>
  *
  * @param inStr text to convert; must not be null
  * @return the converted text
  * @throws NullPointerException if {@code inStr} is null
  */
 public static String utf8ToUnicode(String inStr) {
      StringBuilder sb = new StringBuilder(inStr.length());
      for (char c : inStr.toCharArray()) {
          UnicodeBlock ub = UnicodeBlock.of(c);
          if (ub == UnicodeBlock.BASIC_LATIN) {
              sb.append(c);
          } else if (ub == UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
              // NOTE(review): the shift is applied to the whole block, including
              // halfwidth forms that have no ASCII counterpart - confirm intended.
              sb.append((char) (c - 65248));
          } else {
              // Widen char to int directly: going through short sign-extends every
              // code unit >= 0x8000 and produced eight hex digits (ffffXXXX).
              String hex = Integer.toHexString(c);
              sb.append("\\u");
              // Left-pad so the escape always carries four hex digits.
              for (int pad = hex.length(); pad < 4; pad++) {
                  sb.append('0');
              }
              sb.append(hex);
          }
      }
      return sb.toString();
  }
/**
 * Verifies a password against the verifier data taken from the document.
 *
 * <p>The method hashes {@code pwarray}, mixes the first five digest bytes with
 * {@code docid} into {@code valContext}, derives the block-0 RC4 key from that
 * context via {@code makekey}, decrypts {@code salt} and {@code hashedsalt} with
 * it, and finally compares the digest of the decrypted salt with the decrypted
 * {@code hashedsalt}.
 *
 * <p>Side effects: {@code pwarray}, {@code salt} and {@code hashedsalt} are all
 * overwritten in place, and {@code valContext} is re-initialised and left
 * holding the digest that was passed to {@code makekey}.
 *
 * @param pwarray    password block; indices 0..63 are read and then overwritten
 * @param docid      document id; indices 0..15 are read
 * @param salt       salt buffer; first 16 bytes are decrypted, indices 16..63 are overwritten
 * @param hashedsalt hashed salt; first 16 bytes are decrypted and compared
 * @param valContext digest context that receives the password/docid hash
 * @return {@code true} if the first 16 bytes of the decrypted {@code hashedsalt}
 *         equal the digest computed over the decrypted, padded salt
 */
public boolean  verifypwd(byte pwarray[], byte docid[], byte salt[], byte hashedsalt[],HWMPFEMD5 valContext)
  {
      HWPFRC4 tool  =new HWPFRC4();
      HWPFRC4 key  =new HWPFRC4();
      int offset=0, keyoffset=0;
      int tocopy=5;
      HWMPFEMD5 md5=new HWMPFEMD5();
      HWMPFEMD5 md51=new HWMPFEMD5();
      // First pass: digest the 64-byte password block as supplied by the caller.
      md5.md5Init();
      md5.md5Update(pwarray, 64);
      md5.getMD5StoreDigest(md5); 
      valContext.md5Init();
      // Second pass: pwarray is reused as a 64-byte staging buffer. Each round
      // appends 5 digest bytes followed by the 16 docid bytes (21 bytes), and the
      // buffer is fed to valContext every time it fills up. The loop stops when
      // offset lands on 16, i.e. after 16 rounds (16 * 21 = 336 = 5 * 64 + 16).
      while (offset != 16)
      {
          // Clamp the digest copy so it never runs past the end of the buffer.
          if ((64 - offset) < 5)
              tocopy = 64 - offset;
          for(int y=0;y<tocopy;y++)
          {
             pwarray[offset+y]=md5.digest[keyoffset+y];
          }
          offset += tocopy;

          if (offset == 64)
            {
            // Buffer full: flush it, then resume with the digest bytes that did
            // not fit (possibly zero of them) at the start of the next buffer.
            valContext.md5Update(pwarray, 64);
            keyoffset = tocopy;
            tocopy = 5 - tocopy;
            offset = 0;
            continue;
            }
          keyoffset = 0;
          tocopy = 5;
          for(int y=0;y<16;y++)
          {
              pwarray[offset+y]=docid[y];
          }
          offset += 16;
       }
       // Keep the 16 bytes left in the buffer and hand-build the rest of the block:
       // 0x80 marker, zeros, then 0x80/0x0A at 56/57 (0x0A80 = 2688 bits = 336 bytes).
       // NOTE(review): this looks like manual MD5 padding, which would mean
       // md5Update consumes raw 64-byte blocks - confirm against HWMPFEMD5.
       pwarray[16] = (byte) 0x80;
       for(int i=0;i<47;i++)
       {
           pwarray[17+i]=0;
       }
       pwarray[56] = (byte)0x80;
       pwarray[57] = (byte)0x0A;
       valContext.md5Update(pwarray,64);
       valContext.getMD5StoreDigest(valContext); 
       // Derive the RC4 key for block 0, then decrypt salt and hashedsalt with the
       // same key object, so the second call continues the first call's keystream.
       tool.makekey (0, key,valContext);
       tool.rc4 (salt, 16, key);
       tool.rc4 (hashedsalt, 16, key);
       // Pad the 16 decrypted salt bytes into a 64-byte block (0x80 at 56 = 128 bits).
       salt[16] = (byte) 0x80;
       for(int i=0;i<47;i++) salt[17+i]=0;
       salt[56] = (byte) 0x80;
       md51.md5Init();
       md51.md5Update(salt, 64); 
       md51.getMD5StoreDigest(md51);
       // The password is accepted only if all 16 bytes match.
       for(int i=0;i<16;i++)
       {   
           if(hashedsalt[i]!=md51.digest[i]) return false;
       }
       return true;
  }
 /**
  * Runs the RC4 key schedule: resets {@code key.state} to the identity
  * permutation, zeroes {@code key.x} and {@code key.y}, then shuffles the state
  * using the first {@code key_data_len} bytes of {@code key_data_ptr}, cycling
  * through them as often as needed.
  *
  * @param key_data_ptr key material
  * @param key_data_len number of key bytes to cycle through
  * @param key          RC4 state object that is initialised in place
  */
 public void preparekey(byte[] key_data_ptr, int key_data_len, HWPFRC4 key)
     {
            // Work directly on the key's own table; every write below is visible to the caller.
            byte[] sbox = key.state;
            for (int i = 0; i < 256; i++) {
                sbox[i] = (byte) i;
            }
            key.x = 0;
            key.y = 0;

            int keyPos = 0;
            int j = 0;
            for (int i = 0; i < 256; i++) {
                // Bytes are masked to 0..255 because Java bytes are signed.
                int keyByte = key_data_ptr[keyPos] & 0xff;
                int stateByte = sbox[i] & 0xff;
                j = (keyByte + stateByte + j) & 0xff;

                byte held = sbox[i];
                sbox[i] = sbox[j];
                sbox[j] = held;

                keyPos = (keyPos + 1) % key_data_len;
            }
     }
     /**
      * Builds the RC4 key for one block number and loads it into {@code rc4key}.
      *
      * <p>A 64-byte buffer is filled with the first five bytes of
      * {@code md5.digest} followed by {@code block} as four little-endian bytes,
      * hashed with a fresh {@code HWMPFEMD5}, and the resulting 16 digest bytes
      * are passed to {@code preparekey}.
      *
      * @param block  block number mixed into the key
      * @param rc4key RC4 state object that receives the scheduled key
      * @param md5    context whose {@code digest} supplies the first five key bytes
      */
     public void makekey(int block,HWPFRC4 rc4key,HWMPFEMD5 md5) 
     {   
             // A new byte[] is already zero-filled, so no explicit clearing is needed.
             byte[] seed = new byte[64];
             HWMPFEMD5 blockHash = new HWMPFEMD5();

             System.arraycopy(md5.digest, 0, seed, 0, 5);

             // Block number, least significant byte first, at offsets 5..8.
             for (int pos = 5, shift = 0; shift < 32; pos++, shift += 8) {
                 seed[pos] = (byte) ((block >> shift) & 0xFF);
             }

             // 0x80 marker after the 9 data bytes, 0x48 (= 72 bits = 9 bytes) at offset 56.
             // NOTE(review): looks like hand-written MD5 padding - confirm against HWMPFEMD5.
             seed[9] = (byte) 0x80;
             seed[56] = (byte) 0x48;

             blockHash.md5Init();
             blockHash.md5Update(seed, 64);
             blockHash.getMD5StoreDigest(blockHash);

             preparekey(blockHash.digest, 16, rc4key);
     }
     /**
      * XORs the first {@code buffer_len} bytes of {@code buffer_ptr} in place with
      * the RC4 keystream generated from {@code key}.
      *
      * <p>{@code key.state} is permuted as the stream advances, and
      * {@code key.x} / {@code key.y} are written back once the whole buffer has
      * been processed, so a later call continues the same keystream.
      *
      * @param buffer_ptr data to transform in place
      * @param buffer_len number of leading bytes to transform
      * @param key        RC4 state; updated by this call
      */
     void rc4 ( byte[] buffer_ptr, int buffer_len, HWPFRC4  key)
     {
             byte[] sbox = key.state;
             int i = key.x;
             int j = key.y;

             int pos = 0;
             while (pos < buffer_len) {
                 i = (i + 1) & 0xff;
                 // Mask to 0..255 because Java bytes are signed.
                 j = ((sbox[i] & 0xff) + j) & 0xff;

                 byte held = sbox[i];
                 sbox[i] = sbox[j];
                 sbox[j] = held;

                 int pick = ((sbox[i] & 0xff) + (sbox[j] & 0xff)) & 0xff;
                 buffer_ptr[pos] ^= sbox[pick];
                 pos++;
             }

             // Indices are stored only after the loop has finished.
             key.x = i;
             key.y = j;
     }

這裏代碼是按照文檔裏面寫的doc文件格式,先對密碼進行unicode解碼,然後取出x,docid,salt,hashedsalt,調用verifypwd進行驗證,密碼正確後會返回HWMPFEMD5對象,然後對_tableStream的每512字節進行RC4操作,再對_mainStream進行RC4操作,然後
_fib = new FileInformationBlock(_mainStream);
_fib.fillVariableFields(_mainStream, _tableStream);
這裏存儲文件頭部信息,然後再對_dataStream進行解碼,這裏就是簡單的針對512字節進行RC4操作,這裏只是對文檔的簡單說明。
3;Java調用方式

 // Open the password-protected .doc and iterate over its tables.
            POIFSFileSystem pfs;
            HWPFDocument hwpf;
            try {
                FileInputStream in = new FileInputStream("./test/20030523jm.doc");
                try {
                    pfs = new POIFSFileSystem(in);
                    hwpf = new HWPFDocument(pfs,"111111");
                } finally {
                    // Release the file handle whether or not parsing succeeded.
                    // NOTE(review): assumes the document no longer reads from the
                    // stream once both constructors have returned - confirm.
                    in.close();
                }
            } catch ( Exception e) {
                // Fail here with the real cause instead of swallowing it and hitting
                // a NullPointerException on hwpf a few lines later.
                throw new IllegalStateException("Failed to open or decrypt ./test/20030523jm.doc", e);
            }
            Range range = hwpf.getOverallRange();
            TableIterator it = new TableIterator(range);

4:github鏈接
https://github.com/DusonWang/word95-03parse.git

參考項目代碼http://sourceforge.net/projects/wvware/

5:jar包下載
http://download.csdn.net/detail/dusonblog/9310899

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章