使用代碼將docx數據表轉換爲xml文件

背景

在參與軟件設計文檔的修改時，需要將docx文檔中的數據表錄入到xml文檔中。第一次錄入時，一個字段一個字段地錄入實在太過麻煩，遇到有幾十個字段的表，眼睛都看花了還是錄不完，於是想着使用代碼去讀取相應的表格，將其中的數據表生成xml文件。
當然，在寫腳本的時候，大家可能首先想到的是使用Python進行編寫。由於我的電腦中沒有安裝Python環境，就使用已有的環境編寫了Java代碼進行轉換。在讀取表的時候，還有很多問題可以改進，而且由於文檔的規範不同，處理還不是很全面，現在將自己實現的代碼進行記錄。

DOCX的數據表

表格式一

表格式二

針對遇到的這兩種格式的表編寫代碼，生成相應的xml文件。

代碼實現

Docx2XMLUtil.java

package docx2xml;

import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow;

import java.io.*;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.List;

/**
 * Converts the tables of a .docx document into XML {@code <table>} fragments.
 *
 * <p>Every table found in the document is rendered as one {@code <table>} element whose
 * {@code <column>} children are built from the table rows; the first row is treated as the
 * header and is not emitted. Each fragment is printed to standard output and appended to the
 * configured XML file.
 *
 * @ClassName Docx2XMLUtil
 * @Author StriveFarrell
 * @Date 2019/12/4 15:22
 */

public class Docx2XMLUtil {
    private String docxFilePath;
    private String xmlFileSavePath;
    private String author;

    public String getDocxFilePath() {
        return docxFilePath;
    }

    public void setDocxFilePath(String docxFilePath) {
        this.docxFilePath = docxFilePath;
    }

    public String getXmlFileSavePath() {
        return xmlFileSavePath;
    }

    public void setXmlFileSavePath(String xmlFileSavePath) {
        this.xmlFileSavePath = xmlFileSavePath;
    }

    public String getAuthor() {
        return author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    /**
     * Reads the configured input file and converts all of its tables.
     *
     * <p>The converter is chosen by file extension (case-insensitive). Any failure is reported
     * with a stack trace and is not rethrown.
     */
    public void getTableData() {
        try {
            String lowerCasePath = getDocxFilePath().toLowerCase();
            // Both checks include the dot so that only real extensions match.
            if (lowerCasePath.endsWith(".docx")) {
                docx2xml();
            } else if (lowerCasePath.endsWith(".doc")) {
                doc2xml();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Converts every table of the .docx file, printing each fragment and appending it to the
     * output file.
     *
     * @throws IOException if the input file cannot be opened or parsed
     */
    private void docx2xml() throws IOException {
        String tableHeaderInfo = getTableHeader();
        String remInfo = getRemInfo();
        // try-with-resources closes both the stream and the document, even on failure.
        try (FileInputStream in = new FileInputStream(getDocxFilePath());
             XWPFDocument document = new XWPFDocument(in)) {
            Iterator<XWPFTable> tabItr = document.getTablesIterator();
            int tableIndex = 1;
            while (tabItr.hasNext()) {
                XWPFTable table = tabItr.next();
                String xmlString = tableHeaderInfo + remInfo + getTableColumn(table)
                        + getEndTableTag() + "\n\n\n\n";
                testPrint(String.valueOf(tableIndex), xmlString);
                saveXml(xmlString);
                tableIndex++;
            }
        }
    }

    /**
     * Placeholder for the legacy .doc format; not implemented, so .doc input produces no output.
     */
    private void doc2xml() {

    }

    /**
     * Prints a labelled debug message to standard output.
     *
     * @param message label printed before the payload
     * @param out payload to print
     */
    private void testPrint(String message, String out) {
        System.out.println(message + ":\n" + out);
    }

    /**
     * Returns the current date formatted as yyyy.MM.dd.
     *
     * @return the formatted current date
     */
    private String getDate() {
        SimpleDateFormat df = new SimpleDateFormat("yyyy.MM.dd");
        return df.format(new Date());
    }

    /**
     * Appends the given text to the configured XML output file.
     *
     * @param data text to append
     */
    private void saveXml(String data) {
        String saveXmlPath = getXmlFileSavePath();
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(saveXmlPath, true))) {
            bw.write(data);
        } catch (Exception e) {
            // Best effort: a failed write is reported but must not stop the remaining tables.
            e.printStackTrace();
        }
    }

    /**
     * Tells whether a data type carries a size.
     *
     * @param cell the data type name
     * @return {@code false} for DATETIME, TEXT, TIMESTAMP and LONGTEXT (case-insensitive),
     *         {@code true} otherwise
     */
    private boolean hasSize(String cell) {
        boolean sizeless = "DATETIME".equalsIgnoreCase(cell)
                || "TEXT".equalsIgnoreCase(cell)
                || "TIMESTAMP".equalsIgnoreCase(cell)
                || "LONGTEXT".equalsIgnoreCase(cell);
        return !sizeless;
    }

    /**
     * Detects the table layout from its header row.
     *
     * @param header the cells of the header row
     * @return {@code true} when some header cell contains "長度" (length), in which case the
     *         third cell of each row is read as the size; {@code false} when the size is
     *         expected inline in the type cell, e.g. {@code VARCHAR(20)}
     */
    private boolean getTableStyle(List<XWPFTableCell> header) {
        for (XWPFTableCell headerCell : header) {
            if (headerCell.getText().contains("長度")) {
                return true;
            }
        }
        return false;
    }

    /**
     * Builds the opening {@code <table>} tag with empty id, javaId and name attributes.
     *
     * @return the opening tag followed by a newline
     */
    private String getTableHeader() {
        return "<table id=\"\" javaId=\"\" name=\"\" >\n";
    }

    /**
     * Returns the closing tag of the table element.
     *
     * @return {@code </table>}
     */
    private String getEndTableTag() {
        // XML end tags use a forward slash; the previous "<\table>" was not well-formed.
        return "</table>";
    }

    /**
     * Builds the {@code <rem>} lines carrying the author and the entry date.
     *
     * @return the remark lines, each terminated by a newline
     */
    private String getRemInfo() {
        String separator =
                "\t<rem>====================================================================</rem>\n";
        return separator
                + "\t<rem> 輸入人:" + escapeXml(String.valueOf(author)) + "\t輸入時間：" + getDate() + "</rem>\n"
                + "\t<rem>table description</rem>\n"
                + separator;
    }

    /**
     * Renders every data row of the table as one {@code <column ... />} line.
     *
     * <p>Cells are read by position: 0 = id, 1 = type (with an inline size when the header has
     * no length column), 2 = size (only when the header has a length column and the type takes
     * a size), 3 = constraints, 4 = name. Cell text is trimmed and upper-cased. Additional
     * cells are ignored; rows with fewer cells still yield a closed tag.
     *
     * @param table the table to render
     * @return the column lines, or an empty string when the table has no rows
     */
    private String getTableColumn(XWPFTable table) {
        StringBuilder columns = new StringBuilder();
        List<XWPFTableRow> rowList = table.getRows();
        if (rowList.isEmpty()) {
            return columns.toString();
        }
        boolean tableStyle = getTableStyle(rowList.get(0).getTableCells());

        // Row 0 is the header, so data rows start at index 1.
        for (int i = 1; i < rowList.size(); i++) {
            List<XWPFTableCell> cellList = rowList.get(i).getTableCells();
            if (cellList.isEmpty()) {
                continue;
            }
            StringBuilder rowBuffer = new StringBuilder("\t<column ");
            boolean isHasSize = false;
            for (int j = 0; j < cellList.size(); j++) {
                String cell = cellList.get(j).getText().trim().toUpperCase();
                switch (j) {
                    case 0:
                        rowBuffer.append(attribute("id", cell));
                        break;
                    case 1:
                        if (tableStyle) {
                            // Size lives in the next cell; remember whether this type takes one.
                            isHasSize = hasSize(cell);
                            rowBuffer.append(attribute("type", cell));
                        } else {
                            appendTypeWithInlineSize(rowBuffer, cell);
                        }
                        break;
                    case 2:
                        if (isHasSize) {
                            rowBuffer.append(attribute("size", cell));
                        }
                        break;
                    case 3:
                        rowBuffer.append(attribute("primaryKey", String.valueOf(cell.contains("主鍵"))));
                        rowBuffer.append(attribute("required", String.valueOf(cell.contains("非空"))));
                        break;
                    case 4:
                        rowBuffer.append(attribute("name", cell));
                        break;
                    default:
                        break;
                }
            }
            // Close the tag after the loop so short rows are terminated as well.
            rowBuffer.append("/>\n");
            columns.append(rowBuffer);
        }
        return columns.toString();
    }

    /**
     * Appends the type attribute, plus a size attribute when the cell looks like
     * {@code TYPE(SIZE)}. A missing closing parenthesis is tolerated.
     *
     * @param rowBuffer buffer of the column tag being built
     * @param cell text of the type cell
     */
    private void appendTypeWithInlineSize(StringBuilder rowBuffer, String cell) {
        int open = cell.indexOf('(');
        if (open < 0) {
            rowBuffer.append(attribute("type", cell));
            return;
        }
        int close = cell.indexOf(')', open + 1);
        String sizeText = close < 0 ? cell.substring(open + 1) : cell.substring(open + 1, close);
        rowBuffer.append(attribute("type", cell.substring(0, open).trim()));
        rowBuffer.append(attribute("size", sizeText.trim()));
    }

    /**
     * Formats one XML attribute followed by a single space.
     *
     * @param name attribute name
     * @param value raw attribute value; it is XML-escaped here
     * @return {@code name="value" }
     */
    private static String attribute(String name, String value) {
        return name + "=\"" + escapeXml(value) + "\" ";
    }

    /**
     * Escapes the characters that would break XML text or a double-quoted attribute value.
     *
     * @param value raw text, not null
     * @return the text with {@code & < > "} replaced by their predefined entities
     */
    private static String escapeXml(String value) {
        StringBuilder escaped = new StringBuilder(value.length());
        for (int i = 0; i < value.length(); i++) {
            char c = value.charAt(i);
            switch (c) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                case '"':
                    escaped.append("&quot;");
                    break;
                default:
                    escaped.append(c);
                    break;
            }
        }
        return escaped.toString();
    }
}

Docx2XMLUtilTest.java

package docx2xml;


/**
 * Command-line driver that runs {@link Docx2XMLUtil} on a docx file.
 *
 * <p>Optional arguments, in order: input docx path, output xml path, author name. Any argument
 * that is not supplied falls back to the built-in default.
 *
 * @ClassName Docx2XMLUtilTest
 * @Author StriveFarrell
 * @Date 2019/12/4 16:12
 */

public class Docx2XMLUtilTest {
    // Default path of the docx file to read.
    private static final String DEFAULT_DOCX_FILE_PATH = "D:\\MyFile\\workLearning\\spark\\src\\files\\docx2xml3.docx";
    // Default path of the xml file the result is appended to.
    private static final String DEFAULT_XML_FILE_SAVE_PATH = "D:\\MyFile\\workLearning\\spark\\src\\files\\docx2xml3.xml";
    // Default name recorded as the person who entered the table.
    private static final String DEFAULT_AUTHOR = "Hello Table";

    /**
     * Runs the conversion.
     *
     * @param args optional overrides: {@code [docxPath [xmlPath [author]]]}
     */
    public static void main(String[] args) {
        Docx2XMLUtil util = new Docx2XMLUtil();
        util.setDocxFilePath(argOrDefault(args, 0, DEFAULT_DOCX_FILE_PATH));
        util.setAuthor(argOrDefault(args, 2, DEFAULT_AUTHOR));
        util.setXmlFileSavePath(argOrDefault(args, 1, DEFAULT_XML_FILE_SAVE_PATH));
        util.getTableData();
    }

    /**
     * Returns the argument at the given position, or the fallback when it was not supplied.
     *
     * @param args command-line arguments, may be null
     * @param index position of the wanted argument
     * @param fallback value used when the argument is absent
     * @return the argument or the fallback
     */
    private static String argOrDefault(String[] args, int index, String fallback) {
        return args != null && args.length > index ? args[index] : fallback;
    }
}

生成XML格式

表格式一XML

<table id="" javaId="" name="" >
   <rem>====================================================================</rem>
   <rem> 輸入人:Hello Table    輸入時間：2019.12.05</rem>
   <rem>table description</rem>
   <rem>====================================================================</rem>
   <column id="COLUMN_ID" type="VARCHAR" size="20" primaryKey="true" required="true" name="信息項定義主鍵" />
   <column id="IS_PRIMARY_KEY" type="INT" size="2" primaryKey="false" required="true" name="表明是否爲主鍵(0:否;1:是)" />
   <column id="NOT_NULL" type="INT" size="2" primaryKey="false" required="true" name="0:可以爲空；1：不可爲空。" />
   <column id="LENGTH" type="VARCHAR" size="11" primaryKey="false" required="true" name="長度" />
   <column id="TYPE" type="VARCHAR" size="22" primaryKey="false" required="true" name="類型" />
   <column id="NAME_EN" type="VARCHAR" size="2048" primaryKey="false" required="false" name="英文名稱" />
   <column id="NAME_CN" type="VARCHAR" size="2048" primaryKey="false" required="false" name="中文名稱" />
   <column id="CATA_ID" type="VARCHAR" size="20" primaryKey="false" required="true" name="目錄ID" />
<\table>

表格式二XML

<table id="" javaId="" name="" >
   <rem>====================================================================</rem>
   <rem> 輸入人:zhangqx02    輸入時間：2019.12.05</rem>
   <rem>table description</rem>
   <rem>====================================================================</rem>
   <column id="COLUMN_ID" type="VARCHAR" size="20" primaryKey="true" required="true" name="信息項定義主鍵" />
   <column id="IS_PRIMARY_KEY" type="INT" size="2" primaryKey="false" required="true" name="表明是否爲主鍵(0:否;1:是)" />
   <column id="NOT_NULL" type="INT" size="2" primaryKey="false" required="true" name="0:可以爲空；1：不可爲空。" />
   <column id="LENGTH" type="VARCHAR" size="11" primaryKey="false" required="true" name="長度" />
   <column id="TYPE" type="VARCHAR" size="22" primaryKey="false" required="true" name="類型" />
   <column id="NAME_EN" type="VARCHAR" size="2048" primaryKey="false" required="false" name="英文名稱" />
   <column id="NAME_CN" type="VARCHAR" size="2048" primaryKey="false" required="false" name="中文名稱" />
   <column id="CATA_ID" type="VARCHAR" size="20" primaryKey="false" required="true" name="目錄ID" />
<\table>

這個實現還有很多不完善的地方，比如沒有生成table標籤的id、javaId和name等字段的值，以後有時間再去處理。

附件

表格式一

下載地址：表格式一

表格式二

下載地址：表格式二

總結

將docx數據表錄入到xml中，如果純手動錄入是一件枯燥頭大的事情，一不小心就搞得自己眼花繚亂了。
實現的方式還不是很完整，還有很多可以改進的地方。

StriveFarrell

發佈了336 篇原創文章 · 獲贊 79 · 訪問量 22萬+

他的留言板關注

使用代碼將docx數據表轉換爲xml文件

背景

DOCX的數據表

代碼實現

生成XML格式

附件

總結

在線反混淆網站

SQLServer語句彙總

SQLServer官網示例表

架構師和技術員的區別

學習【聊聊框架】筆記

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結