目錄
背景
- 在參與到軟件設計的文檔修改時,需要將docx文檔中的數據表錄入到xml文檔中,在第一次錄入時,一個字段一個字段的錄入實在是太過麻煩,遇到有幾十個字段的表,眼睛都看花了還是錄不完,於是想着使用代碼去讀取相應的表格,將其中的數據表生成xml文件。
- 當然,在寫腳本的時候,大家可能首先想到的是使用Python進行編寫。由於我的電腦中沒有安裝Python環境,就使用已有的環境編寫了Java代碼進行轉換。在讀取表的時候,還有很多問題可以改進,但是由於文檔的規範不同,處理還不是很全面,現在將自己的實現記錄下來。
DOCX的數據表
表格式一
表格式二
針對遇到的兩種格式的表,進行編寫代碼,生成相應的xml文件
代碼實現
Docx2XMLUtil.java
package docx2xml; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFTable; import org.apache.poi.xwpf.usermodel.XWPFTableCell; import org.apache.poi.xwpf.usermodel.XWPFTableRow; import java.io.*; import java.text.SimpleDateFormat; import java.util.Date; import java.util.Iterator; import java.util.List; /** * @ClassName Docx2XMLUtil * @Author StriveFarrell * @Date 2019/12/4 15:22 * @Description * 將docx文檔章的表格轉化爲xml文檔 */ public class Docx2XMLUtil { private String docxFilePath ; private String xmlFileSavePath ; private String author; public String getDocxFilePath() { return docxFilePath; } public void setDocxFilePath(String docxFilePath) { this.docxFilePath = docxFilePath; } public String getXmlFileSavePath() { return xmlFileSavePath; } public void setXmlFileSavePath(String xmlFileSavePath) { this.xmlFileSavePath = xmlFileSavePath; } public String getAuthor() { return author; } public void setAuthor(String author) { this.author = author; } public void getTableData(){ try { String filePath = getDocxFilePath(); if (filePath.toLowerCase().endsWith("docx")){ docx2xml(); }else if (filePath.toLowerCase().endsWith(".doc")){ doc2xml(); } }catch (Exception e){ e.printStackTrace(); } } private void docx2xml(){ XWPFDocument document = getXWPFDocument(); Iterator<XWPFTable> tabItr = document.getTablesIterator(); String tableHeaderInfo = getTableHeader(); String remInfo = getRemInfo(); int tableIndex = 1; while (tabItr.hasNext()){ StringBuffer tablexml = new StringBuffer(tableHeaderInfo); tablexml.append(remInfo); XWPFTable table = tabItr.next(); String tableColumnInfo = getTableColumn(table); tablexml.append(tableColumnInfo); String xmlString = tablexml.toString()+getEndTableTag()+"\n\n\n\n"; testPrint(String.valueOf(tableIndex), xmlString); saveXml(xmlString); tableIndex++; } } private void doc2xml(){ } /** * 打印測試 * @param message * @param out */ private void testPrint(String message,String out){ System.out.println(message+":\n"+out); } 
/** * 獲取當前的日期,格式爲yyyy.MM.dd * @return */ private String getDate(){ SimpleDateFormat df = new SimpleDateFormat("yyyy.MM.dd"); return df.format(new Date()); } private void saveXml(String data){ String saveXmlPath = getXmlFileSavePath(); try { FileWriter fw = new FileWriter(saveXmlPath, true); BufferedWriter bw = new BufferedWriter(fw); bw.write(data); bw.close(); fw.close(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } /** * 判斷數據類型是否有長度 * @param cell * @return */ private boolean hasSize(String cell){ if ("DATETIME".equalsIgnoreCase(cell)||"TEXT".equalsIgnoreCase(cell) || "TIMESTAMP".equalsIgnoreCase(cell) || "LONGTEXT".equalsIgnoreCase(cell)){ return false; } return true; } /** * 判斷表格格式,tableStyle標記表的格式,字段和長度在一起爲true,分開爲false * @param header * @return */ private boolean getTableStyle(List<XWPFTableCell> header) { boolean isContains = false ; Iterator<XWPFTableCell> cellIterator = header.iterator(); while (cellIterator.hasNext()){ String cel = cellIterator.next().getText(); if (cel.contains("長度")){ isContains = true; break; } } return isContains; } /** * 獲取文件輸入流 * @return */ private FileInputStream getFileInputStream(){ FileInputStream in = null; try { in = new FileInputStream(getDocxFilePath()); }catch (FileNotFoundException e) { e.printStackTrace(); } return in; } /** * 獲取docx文件流 * @return */ private XWPFDocument getXWPFDocument(){ FileInputStream in = getFileInputStream(); XWPFDocument document = null; try { document = new XWPFDocument(in); } catch (IOException e) { e.printStackTrace(); } return document; } /** * 獲取表同的格式 * @return */ private String getTableHeader(){ String tableTagStart = "<table "; String tableId = "id="; String javaId = "javaId="; String tableName = "name="; String tableTagEnd = ">"; StringBuffer headBuffer = new StringBuffer(tableTagStart); headBuffer.append(tableId+"\"\" "); headBuffer.append(javaId+"\"\" "); headBuffer.append(tableName+"\"\" "); headBuffer.append(tableTagEnd+"\n"); return 
headBuffer.toString(); } /** * 返回table的閉合標籤 * @return */ private String getEndTableTag(){ return "<\\table>"; } /** * 獲取創建人的信息 * @return */ private String getRemInfo(){ String remInfo = "\t<rem>====================================================================</rem>\n" + "\t<rem> 輸入人:"+ author +"\t輸入時間:"+ getDate()+"</rem>\n" + "\t<rem>table description</rem>\n"+ "\t<rem>====================================================================</rem>\n"; return remInfo; } /** * 遍歷獲取每一列的數據 * @param table * @return */ private String getTableColumn(XWPFTable table) { String tag = "\t<column "; String id = "id="; String type = "type="; String size = "size="; String primaryKey = "primaryKey="; String required = "required="; String name = "name="; String end = " />\n"; StringBuffer tableColumsBuffer = new StringBuffer(); List<XWPFTableRow> rowList = table.getRows(); //tableStyle標記表的格式,字段和長度在一起爲true,分開爲false boolean tableStyle = false; tableStyle = getTableStyle(rowList.get(0).getTableCells()); for (int i = 1; i < rowList.size(); i++) { StringBuffer rowBUffer = new StringBuffer(tag); XWPFTableRow row = rowList.get(i); List<XWPFTableCell> cellList = row.getTableCells(); boolean isHasSize = false; for (int j = 0; j < cellList.size(); j++) { String cell = cellList.get(j).getText().trim().toUpperCase(); switch (j) { case 0: String newId = id + "\"" + cell + "\" "; rowBUffer.append(newId); break; case 1: if (!tableStyle){ if (cell.contains("(")) { int startIndex = cell.indexOf("("); int endIndex = cell.indexOf(")"); String cellType = cell.substring(0, startIndex); String cellSize = cell.substring(startIndex + 1, endIndex); String newType = type + "\"" + cellType + "\" "; rowBUffer.append(newType); String newSize = size + "\"" + cellSize + "\" "; rowBUffer.append(newSize); } else { String newType = type + "\"" + cell + "\" "; rowBUffer.append(newType); } }else { isHasSize = hasSize(cell); String newType = type + "\"" + cell + "\" "; rowBUffer.append(newType); } break; case 2: if 
(isHasSize) { String newSize = size + "\"" +cell + "\" "; rowBUffer.append(newSize); isHasSize = false; } break; case 3: String newPrimaryKey = ""; String newRequired = ""; if (cell.contains("主鍵")) { newPrimaryKey = primaryKey + "\"true\" "; } else { newPrimaryKey = primaryKey + "\"false\" "; } if (cell.contains("非空")) { newRequired = required + "\"true\" "; } else { newRequired = required + "\"false\" "; } rowBUffer.append(newPrimaryKey); rowBUffer.append(newRequired); break; case 4: String newName = name + "\"" + cell + "\""; rowBUffer.append(newName); rowBUffer.append(end); break; default: } } tableColumsBuffer.append(rowBUffer.toString()); } return tableColumsBuffer.toString(); } }
Docx2XMLUtilTest.java
package docx2xml; /** * @ClassName Docx2XMLUtilTest * @Author StriveFarrell * @Date 2019/12/4 16:12 * @Description * docx文檔轉換爲xml文檔的測試類 */ public class Docx2XMLUtilTest { //docx文件所在文件路徑 private static final String docxFilePath = "D:\\MyFile\\workLearning\\spark\\src\\files\\docx2xml3.docx"; //生成的xml文件保存路徑 private static final String xmlFileSavePath = "D:\\MyFile\\workLearning\\spark\\src\\files\\docx2xml3.xml"; //表格錄入人 private static final String author = "Hello Table"; public static void main(String[] args){ Docx2XMLUtil util = new Docx2XMLUtil(); util.setDocxFilePath(docxFilePath); util.setAuthor(author); util.setXmlFileSavePath(xmlFileSavePath); util.getTableData(); } }
生成XML格式
表格式一XML
<table id="" javaId="" name="" > <rem>====================================================================</rem> <rem> 輸入人:Hello Table 輸入時間:2019.12.05</rem> <rem>table description</rem> <rem>====================================================================</rem> <column id="COLUMN_ID" type="VARCHAR" size="20" primaryKey="true" required="true" name="信息項定義主鍵" /> <column id="IS_PRIMARY_KEY" type="INT" size="2" primaryKey="false" required="true" name="表明是否爲主鍵(0:否;1:是)" /> <column id="NOT_NULL" type="INT" size="2" primaryKey="false" required="true" name="0:可以爲空;1:不可爲空。" /> <column id="LENGTH" type="VARCHAR" size="11" primaryKey="false" required="true" name="長度" /> <column id="TYPE" type="VARCHAR" size="22" primaryKey="false" required="true" name="類型" /> <column id="NAME_EN" type="VARCHAR" size="2048" primaryKey="false" required="false" name="英文名稱" /> <column id="NAME_CN" type="VARCHAR" size="2048" primaryKey="false" required="false" name="中文名稱" /> <column id="CATA_ID" type="VARCHAR" size="20" primaryKey="false" required="true" name="目錄ID" /> <\table>
表格式二XML
<table id="" javaId="" name="" > <rem>====================================================================</rem> <rem> 輸入人:zhangqx02 輸入時間:2019.12.05</rem> <rem>table description</rem> <rem>====================================================================</rem> <column id="COLUMN_ID" type="VARCHAR" size="20" primaryKey="true" required="true" name="信息項定義主鍵" /> <column id="IS_PRIMARY_KEY" type="INT" size="2" primaryKey="false" required="true" name="表明是否爲主鍵(0:否;1:是)" /> <column id="NOT_NULL" type="INT" size="2" primaryKey="false" required="true" name="0:可以爲空;1:不可爲空。" /> <column id="LENGTH" type="VARCHAR" size="11" primaryKey="false" required="true" name="長度" /> <column id="TYPE" type="VARCHAR" size="22" primaryKey="false" required="true" name="類型" /> <column id="NAME_EN" type="VARCHAR" size="2048" primaryKey="false" required="false" name="英文名稱" /> <column id="NAME_CN" type="VARCHAR" size="2048" primaryKey="false" required="false" name="中文名稱" /> <column id="CATA_ID" type="VARCHAR" size="20" primaryKey="false" required="true" name="目錄ID" /> <\table>
這個表還有很多不完善的地方,比如沒有生成table標籤的id,javaId和name的一些字段,以後有時間再去處理。
附件
表格式一
- 下載地址:表格式一
表格式二
- 下載地址:表格式二
總結
- 將docx數據表錄入到xml中,如果純手動錄入,是一件枯燥頭大的事情,一不小心就搞得自己眼花繚亂了。
- 實現的方式還不是很完整,還有很多可以改進的地方。