java---Word轉html(1會丟失格式,2支持圖片)

1.依賴

<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi-ooxml</artifactId>
			<version>3.15</version>
		</dependency>
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi-scratchpad</artifactId>
			<version>3.15</version>
		</dependency>
		<dependency>
			<groupId>fr.opensagres.xdocreport</groupId>
			<artifactId>fr.opensagres.xdocreport.document</artifactId>
			<version>1.0.5</version>
		</dependency>
		<dependency>
			<groupId>fr.opensagres.xdocreport</groupId>
			<artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
			<version>1.0.5</version>
		</dependency>

2.java代碼

package com.dxl.core;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;

/**
 * <b> Word轉html(會丟失Word的格式)
 * </b><br><br><i>Description</i> :
 * <br><br>Date: 2019/11/12 ${time}    <br>Author : dxl
 */
public class WordToHtmlUtil {
    /**
     * 2007版本word轉換成html 2018-4-9
     * 
     * @param wordPath
     *            word文件路徑
     * @param wordName
     *            word文件名稱無後綴
     * @param suffix
     *            word文件後綴
     * @return
     * @throws IOException
     */
    public static String Word2007ToHtml(String wordPath, String wordName,
                                        String suffix) throws IOException {
        wordPath = "E:\\myGiteeProject\\mygenerator\\src\\main\\resources\\testdocx\\";
        wordName = "demo3";
        suffix =".docx";
        String htmlPath = wordPath + File.separator + wordName + "_show"
                + File.separator;
        String htmlName = wordName + ".html";
        String imagePath = htmlPath + "image" + File.separator;


// 判斷html文件是否存在
        File htmlFile = new File(htmlPath + htmlName);
        if (htmlFile.exists()) {
            return htmlFile.getAbsolutePath();
        }


// word文件
        File wordFile = new File(wordPath + File.separator + wordName + suffix);


// 1) 加載word文檔生成 XWPFDocument對象
        InputStream in = new FileInputStream(wordFile);
        XWPFDocument document = new XWPFDocument(in);


// 2) 解析 XHTML配置 (這裏設置IURIResolver來設置圖片存放的目錄)
        File imgFolder = new File(imagePath);
        XHTMLOptions options = XHTMLOptions.create();
        options.setExtractor(new FileImageExtractor(imgFolder));
// html中圖片的路徑 相對路徑
        options.URIResolver(new BasicURIResolver("image"));
        options.setIgnoreStylesIfUnused(false);
        options.setFragment(true);


// 3) 將 XWPFDocument轉換成XHTML
// 生成html文件上級文件夾
        File folder = new File(htmlPath);
        if (!folder.exists()) {
            folder.mkdirs();
        }
        OutputStream out = new FileOutputStream(htmlFile);
        XHTMLConverter.getInstance().convert(document, out, options);
// 也可以使用字符數組流獲取解析的內容
// ByteArrayOutputStream baos = new ByteArrayOutputStream();
// XHTMLConverter.getInstance().convert(document, baos, options);
// String content = baos.toString();
// System.out.println("2007-docx"+content);
// baos.close();
        return htmlFile.getAbsolutePath();
    }


    /**
     * 將word2003轉換爲html文件 2018-4-9
     * 
     * @param wordPath
     *            word文件路徑
     * @param wordName
     *            word文件名稱無後綴
     * @param suffix
     *            word文件後綴
     * @throws IOException
     * @throws TransformerException
     * @throws ParserConfigurationException
     */
    public static String Word2003ToHtml(String wordPath, String wordName,
                                        String suffix) throws IOException, TransformerException,
            ParserConfigurationException {
        String htmlPath = wordPath + File.separator + wordName + "_show"
                + File.separator;
        String htmlName = wordName + ".html";
        final String imagePath = htmlPath + "image" + File.separator;


// 判斷html文件是否存在
        File htmlFile = new File(htmlPath + htmlName);
        if (htmlFile.exists()) {
            return htmlFile.getAbsolutePath();
        }


// 原word文檔
        final String file = wordPath + File.separator + wordName + suffix;
        InputStream input = new FileInputStream(new File(file));


        HWPFDocument wordDocument = new HWPFDocument(input);
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder()
                        .newDocument());
// 設置圖片存放的位置
        wordToHtmlConverter.setPicturesManager(new PicturesManager() {
            public String savePicture(byte[] content, PictureType pictureType,
                                      String suggestedName, float widthInches, float heightInches) {
                File imgPath = new File(imagePath);
                if (!imgPath.exists()) {// 圖片目錄不存在則創建
                    imgPath.mkdirs();
                }
                File file = new File(imagePath + suggestedName);
                try {
                    OutputStream os = new FileOutputStream(file);
                    os.write(content);
                    os.close();
                } catch (FileNotFoundException e) {
                    e.printStackTrace();
                } catch (IOException e) {
                    e.printStackTrace();
                }
// 圖片在html文件上的路徑 相對路徑
                return "image/" + suggestedName;
            }
        });


// 解析word文檔
        wordToHtmlConverter.processDocument(wordDocument);
        Document htmlDocument = wordToHtmlConverter.getDocument();


// 生成html文件上級文件夾
        File folder = new File(htmlPath);
        if (!folder.exists()) {
            folder.mkdirs();
        }


// 生成html文件地址
// 也可以使用字符數組流獲取解析的內容
// ByteArrayOutputStream baos = new ByteArrayOutputStream();
// OutputStream outStream = new BufferedOutputStream(baos);


        OutputStream outStream = new FileOutputStream(htmlFile);


        DOMSource domSource = new DOMSource(htmlDocument);
        StreamResult streamResult = new StreamResult(outStream);


        TransformerFactory factory = TransformerFactory.newInstance();
        Transformer serializer = factory.newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");


        serializer.transform(domSource, streamResult);


// 也可以使用字符數組流獲取解析的內容
// String content = baos.toString();
// System.out.println(content);
// baos.close();
// System.out.println("2003-doc"+content);
        return htmlFile.getAbsolutePath();
    }

}

注:只測試了2007版(即docx格式)的轉換

3.效果

生成目錄(我是生成在項目裏了,測試方便)

html:

發佈了64 篇原創文章 · 獲贊 6 · 訪問量 8萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章