word文檔轉pdf時,需要兼容.doc和.docx兩種文檔格式。其中.docx通過poi直接就可以將word轉成pdf,.doc則無法這樣實現。上網查詢很多資料,大概思路是正確的,即先將.doc文檔轉成html,再將html轉成pdf;但具體實現的時候,卻發現很多方法都不完善,要麼轉換的html標籤不閉合,無法轉pdf,要麼是轉pdf時中文不顯示。在下將方法彙總之後,整理出一套親測可用的代碼,現附上,如下:
maven依賴(下方代碼還用到了commons-lang、commons-lang3、commons-collections、slf4j以及com.itextpdf的iText 5,若項目中尚未引入,需另行添加):
<!-- Word to HTML -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.14</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.14</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.14</version>
</dependency>
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>xdocreport</artifactId>
<version>1.0.6</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>3.14</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>ooxml-schemas</artifactId>
<version>1.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.lowagie/itext -->
<!-- <dependency>
<groupId>com.lowagie</groupId>
<artifactId>itext</artifactId>
<version>2.0.8</version>
</dependency> -->
<!-- https://mvnrepository.com/artifact/org.xhtmlrenderer/core-renderer -->
<!-- HTML to PDF -->
<dependency>
<groupId>org.xhtmlrenderer</groupId>
<artifactId>core-renderer</artifactId>
<version>R8</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<!-- HTML to XHTML -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
代碼:
/**
*
*/
package cn.test.util.utils;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.commons.collections.MapUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.pdf.PdfConverter;
import org.apache.poi.xwpf.converter.pdf.PdfOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Entities;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.xhtmlrenderer.pdf.ITextFontResolver;
import org.xhtmlrenderer.pdf.ITextRenderer;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PdfCopy;
import com.itextpdf.text.pdf.PdfImportedPage;
import com.itextpdf.text.pdf.PdfReader;
import com.lowagie.text.pdf.BaseFont;
/**
* Utility class that converts Word documents (.doc and .docx) to PDF.
*
* @author gsxs
* @since 2019年2月26日15:52:21
*/
public class Word2PDFUtils {
private static final Logger logger = LoggerFactory
.getLogger(Word2PDFUtils.class);
/**
 * Manual smoke test: converts one Word file to PDF.
 *
 * @param args
 *            optional; when at least two arguments are given, args[0] is the
 *            source Word file and args[1] the target PDF file. Otherwise the
 *            built-in sample paths are used.
 */
public static void main(String[] args) {
    String wordPath = "D://Test/test.doc";
    String pdfPath = "D:/Test/test.pdf";
    if (args != null && args.length >= 2) {
        wordPath = args[0];
        pdfPath = args[1];
    }
    try {
        word2PDF(wordPath, pdfPath);
    } catch (Exception e) {
        // Report through the class logger so the failure lands in the same
        // place as every other message of this utility.
        logger.error("word2PDF failed, wordFilePath={}, pdfFilePath={}",
                wordPath, pdfPath, e);
    }
}
/**
 * Converts a Word document to PDF. Both .doc and .docx inputs are accepted;
 * the actual work is done by {@link #word2PDF(File, File)}.
 *
 * @param wordFilePath
 *            path of the source Word document
 * @param pdfFilePath
 *            path of the PDF file to produce
 * @return the result of the File-based overload, or {@code null} when either
 *         path is blank
 * @throws Exception
 *             if the delegated conversion fails
 */
public static File word2PDF(String wordFilePath, String pdfFilePath)
        throws Exception {
    boolean pathMissing = StringUtils.isBlank(pdfFilePath)
            || StringUtils.isBlank(wordFilePath);
    if (!pathMissing) {
        File source = new File(wordFilePath);
        File destination = new File(pdfFilePath);
        return word2PDF(source, destination);
    }
    // Nothing to convert: record the offending arguments and give up.
    logger.info("word2PDF 文件路徑爲空,wordFilePath={},pdfFilePath={}",
            wordFilePath, pdfFilePath);
    return null;
}
/**
 * Converts a Word document to a PDF that contains only the first page of the
 * rendered document.
 * <p>
 * A .doc file is first converted to HTML and then to PDF; a .docx file is
 * converted to PDF directly. The full PDF is written to a temporary file
 * under {@code <pdf dir>/pdf/}, its first page is copied to {@code pdfFile},
 * and the temporary files are removed afterwards (also when a step fails).
 *
 * @param wordFile
 *            source Word document; its name must end in .doc or .docx
 *            (case-insensitive)
 * @param pdfFile
 *            PDF file to produce
 * @return {@code pdfFile}, or {@code null} when an argument is {@code null}
 *         or the source is not a .doc/.docx file
 * @throws Exception
 *             if one of the conversion steps fails
 */
public static File word2PDF(File wordFile, File pdfFile) throws Exception {
    if (null == wordFile || null == pdfFile) {
        logger.info("word2PDF 文件對象爲空,wordFile={},pdfFile={}", wordFile,
                pdfFile);
        return null;
    }
    String wordName = wordFile.getName();
    String lowerName = wordName.toLowerCase(java.util.Locale.ROOT);
    boolean isDocx = lowerName.endsWith(".docx");
    boolean isDoc = lowerName.endsWith(".doc");
    if (!isDoc && !isDocx) {
        // Not a Word document.
        logger.info("不是word文檔格式,文件路徑={}", wordFile.getAbsolutePath());
        return null;
    }
    // Resolve against the absolute path so that a bare file name such as
    // "out.pdf" (whose getParentFile() is null) still has a parent directory.
    File pdfParentFile = pdfFile.getAbsoluteFile().getParentFile();
    if (!pdfParentFile.exists()) {
        pdfParentFile.mkdirs();
    }
    String absolutePath = pdfParentFile.getAbsolutePath();
    // Strip only the extension; indexOf(".") would cut "a.b.doc" down to "a".
    String baseName = wordName.substring(0, wordName.lastIndexOf('.'));
    File tempPdfFile = new File(absolutePath + "/pdf/" + baseName + ".pdf");
    // FileOutputStream does not create missing directories, so the temp
    // directory has to exist before the .docx branch writes into it.
    File tempPdfDir = tempPdfFile.getParentFile();
    if (!tempPdfDir.exists()) {
        tempPdfDir.mkdirs();
    }
    try {
        if (isDoc) {
            // .doc: convert to HTML, tidy it into XHTML, then render the PDF.
            File htmlFile = new File(absolutePath + "/html/" + baseName
                    + ".html");
            try {
                wordDocToHtml(wordFile, htmlFile);
                convertHtmlToPdf(htmlFile, tempPdfFile);
            } finally {
                // Remove the intermediate HTML file.
                boolean delete = htmlFile.delete();
                logger.info("刪除htmlFile路徑path={},結果={}",
                        htmlFile.getAbsolutePath(), delete);
            }
        } else {
            // .docx: direct conversion. Both streams are closed before the
            // temp PDF is read back and deleted.
            try (InputStream in = new FileInputStream(wordFile);
                    OutputStream out = new FileOutputStream(tempPdfFile)) {
                wordConverterToPdf(in, out, null);
            }
        }
        // Keep the first page only.
        splitPDFFile(tempPdfFile.getAbsolutePath(),
                pdfFile.getAbsolutePath(), 1, 2);
    } finally {
        // Remove the temporary full-length PDF.
        boolean delete = tempPdfFile.delete();
        logger.info("刪除tempPdfFile路徑path={},結果={}",
                tempPdfFile.getAbsolutePath(), delete);
    }
    return pdfFile;
}
/**
 * Converts a .docx document to PDF, replacing placeholder values on the way.
 * Convenience overload that supplies no explicit {@link PdfOptions}.
 *
 * @param source
 *            stream of the source document; must be a .docx document
 * @param target
 *            stream the PDF is written to
 * @param params
 *            placeholder values to substitute
 * @throws Exception
 *             if the conversion fails
 */
private static void wordConverterToPdf(InputStream source,
        OutputStream target, Map<String, String> params) throws Exception {
    // No caller-supplied options: hand null through to the full overload.
    final PdfOptions noExplicitOptions = null;
    wordConverterToPdf(source, target, noExplicitOptions, params);
}
/**
 * Converts a .docx document to PDF, replacing placeholder values on the way.
 *
 * @param source
 *            stream of the source document; must be a .docx document
 * @param target
 *            stream the PDF is written to
 * @param options
 *            converter options, e.g.
 *            {@code PdfOptions.create().fontEncoding("windows-1250")}; the
 *            three-argument overload passes {@code null} here
 * @param params
 *            placeholder values to substitute
 * @throws Exception
 *             if the document cannot be read or converted
 */
private static void wordConverterToPdf(InputStream source,
        OutputStream target, PdfOptions options, Map<String, String> params)
        throws Exception {
    XWPFDocument document = new XWPFDocument(source);
    // Body paragraphs first.
    paragraphReplace(document.getParagraphs(), params);
    // Table cells are walked only when there is something to replace.
    boolean hasReplacements = MapUtils.isNotEmpty(params);
    if (hasReplacements) {
        List<XWPFTable> tables = document.getTables();
        for (int t = 0; t < tables.size(); t++) {
            List<XWPFTableRow> rows = tables.get(t).getRows();
            for (int r = 0; r < rows.size(); r++) {
                List<XWPFTableCell> cells = rows.get(r).getTableCells();
                for (int c = 0; c < cells.size(); c++) {
                    paragraphReplace(cells.get(c).getParagraphs(), params);
                }
            }
        }
    }
    PdfConverter converter = PdfConverter.getInstance();
    converter.convert(document, target, options);
}
/**
 * Replaces placeholder runs: every run whose text is exactly a key of
 * {@code params} gets its text replaced by the mapped value.
 *
 * @param paragraphs
 *            paragraphs to scan; {@code null} is treated as empty
 * @param params
 *            placeholder-to-value map; nothing happens when it is
 *            {@code null} or empty
 */
private static void paragraphReplace(List<XWPFParagraph> paragraphs,
        Map<String, String> params) {
    if (paragraphs == null || !MapUtils.isNotEmpty(params)) {
        return;
    }
    for (XWPFParagraph p : paragraphs) {
        for (XWPFRun r : p.getRuns()) {
            // Read the same text element (index 0) that setText(..., 0)
            // overwrites below. getTextPosition() is the run's vertical
            // raise/lower offset, not a text-element index, so it must not be
            // used as the argument of getText(int).
            String content = r.getText(0);
            if (StringUtils.isNotEmpty(content)
                    && params.containsKey(content)) {
                r.setText(params.get(content), 0);
            }
        }
    }
}
/**
 * Converts a .doc document to an HTML file.
 * <p>
 * Pictures of the document are written to
 * {@code <html dir>/temp/wordimage/}. Failures are logged and the method
 * returns normally, so the HTML file may be missing afterwards.
 *
 * @param wordFile
 *            source .doc file; nothing happens when {@code null}
 * @param htmlFile
 *            HTML file to write; nothing happens when {@code null}
 */
private static void wordDocToHtml(File wordFile, File htmlFile) {
    if (null == wordFile || null == htmlFile) {
        return;
    }
    // Resolve against the absolute path so that a bare file name still has a
    // parent directory.
    File parentFile = htmlFile.getAbsoluteFile().getParentFile();
    if (!parentFile.exists()) {
        parentFile.mkdirs();
    }
    final String imagepath = parentFile.getAbsolutePath()
            + "/temp/wordimage/";
    try (InputStream in = new FileInputStream(wordFile)) {
        HWPFDocument wordDocument = new HWPFDocument(in);
        // Empty DOM document the converter fills with the HTML tree.
        DocumentBuilder domBuilder = DocumentBuilderFactory.newInstance()
                .newDocumentBuilder();
        Document dom = domBuilder.newDocument();
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                dom);
        // Store each picture on disk and reference it by that path.
        wordToHtmlConverter.setPicturesManager(new PicturesManager() {
            @Override
            public String savePicture(byte[] content,
                    PictureType pictureType, String suggestedName,
                    float widthInches, float heightInches) {
                File imgDir = new File(imagepath);
                if (!imgDir.exists()) {
                    imgDir.mkdirs();
                }
                File file = new File(imagepath + suggestedName);
                try (OutputStream os = new FileOutputStream(file)) {
                    os.write(content);
                } catch (IOException e) {
                    logger.error("failed to save picture, path={}",
                            file.getAbsolutePath(), e);
                }
                return imagepath + suggestedName;
            }
        });
        wordToHtmlConverter.processDocument(wordDocument);
        // Dump every picture of the document's picture table as well.
        List<?> pics = wordDocument.getPicturesTable().getAllPictures();
        if (pics != null && !pics.isEmpty()) {
            File imgDir = new File(imagepath);
            if (!imgDir.exists()) {
                imgDir.mkdirs();
            }
            for (Object item : pics) {
                Picture pic = (Picture) item;
                File picFile = new File(imagepath
                        + pic.suggestFullFileName());
                try (OutputStream picOut = new FileOutputStream(picFile)) {
                    pic.writeImageContent(picOut);
                } catch (IOException e) {
                    logger.error("failed to write picture, path={}",
                            picFile.getAbsolutePath(), e);
                }
            }
        }
        // Serialize the converter's DOM tree to the HTML file.
        Document htmlDocument = wordToHtmlConverter.getDocument();
        try (OutputStream out = new FileOutputStream(htmlFile)) {
            Transformer serializer = TransformerFactory.newInstance()
                    .newTransformer();
            // UTF-8 matches the charset convertHtmlToPdf passes to
            // Jsoup.parse when it reads this file back.
            serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            serializer.transform(new DOMSource(htmlDocument),
                    new StreamResult(out));
        }
    } catch (IOException | TransformerException
            | ParserConfigurationException e) {
        logger.error("wordDocToHtml failed, wordFile={}, htmlFile={}",
                wordFile.getAbsolutePath(), htmlFile.getAbsolutePath(), e);
    }
}
/**
 * Converts a .doc document to HTML; path-based convenience wrapper around the
 * File-based overload.
 *
 * @param wordFilePath
 *            path of the source .doc file
 * @param htmlFilePath
 *            path of the HTML file to write
 */
private static void wordDocToHtml(String wordFilePath, String htmlFilePath) {
    boolean bothPresent = !org.apache.commons.lang3.StringUtils.isAnyBlank(
            wordFilePath, htmlFilePath);
    // A blank path means there is nothing to do.
    if (bothPresent) {
        wordDocToHtml(new File(wordFilePath), new File(htmlFilePath));
    }
}
/**
 * Renders an HTML file to PDF.
 * <p>
 * The HTML is first re-serialized as XHTML with jsoup and then rendered. The
 * font file {@code simsun.ttc} is loaded from the classpath for Chinese text.
 *
 * @param htmlFile
 *            HTML file to render
 * @param pdfFile
 *            PDF file to write; missing parent directories are created
 * @return {@code true} when the PDF was written, {@code false} when an
 *         argument is {@code null}
 * @throws Exception
 *             if the font resource is missing or parsing/rendering fails
 */
private static boolean convertHtmlToPdf(File htmlFile, File pdfFile)
        throws Exception {
    if (null == htmlFile || null == pdfFile) {
        logger.info("html轉pdf時,有file爲空,htmlFile={},pdfFile={}", htmlFile,
                pdfFile);
        return false;
    }
    // Resolve against absolute paths so that bare file names still have a
    // parent directory.
    String absoluteFilePath = htmlFile.getAbsoluteFile().getParentFile()
            .getAbsolutePath();
    File pdfDir = pdfFile.getAbsoluteFile().getParentFile();
    if (!pdfDir.exists()) {
        pdfDir.mkdirs();
    }
    // HTML produced from a .doc file can contain unclosed tags (e.g. <meta>),
    // which makes the PDF renderer fail; jsoup re-serializes it as XHTML.
    org.jsoup.nodes.Document parse = Jsoup.parse(htmlFile, "utf-8");
    parse.outputSettings()
            .syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml)
            .escapeMode(Entities.EscapeMode.xhtml);
    String html = parse.html();
    // Map the document's font name to SimSun, the font registered below;
    // without this Chinese text does not show up in the PDF. If the generated
    // HTML uses a different font name, adjust the searched string.
    html = html.replace("font-family:宋體", "font-family: SimSun");
    // Locate the font on the classpath. URL.getPath() is percent-encoded, so
    // file URLs are converted through a URI to get a usable file-system path.
    java.net.URL fontUrl = Word2PDFUtils.class.getClassLoader()
            .getResource("simsun.ttc");
    if (fontUrl == null) {
        throw new FileNotFoundException(
                "font resource simsun.ttc not found on the classpath");
    }
    String path = "file".equals(fontUrl.getProtocol())
            ? new File(fontUrl.toURI()).getAbsolutePath()
            : fontUrl.getPath();
    logger.info(path);
    ITextRenderer renderer = new ITextRenderer();
    renderer.setDocumentFromString(html);
    // Chinese font support.
    ITextFontResolver fontResolver = renderer.getFontResolver();
    fontResolver.addFont(path, BaseFont.IDENTITY_H, BaseFont.NOT_EMBEDDED);
    // Base URL for resolving relative image paths.
    // NOTE(review): wordDocToHtml stores pictures under "/temp/wordimage/",
    // while this base URL points at "/temp/htmlimage" - confirm which one is
    // intended.
    renderer.getSharedContext().setBaseURL(
            "file:" + absoluteFilePath + "/temp/htmlimage");
    renderer.layout();
    // Open the output only once everything is ready, and always close it.
    try (OutputStream os = new FileOutputStream(pdfFile)) {
        renderer.createPDF(os);
        os.flush();
    }
    return true;
}
/**
 * Renders an HTML file to PDF; path-based convenience wrapper around the
 * File-based overload.
 *
 * @param inputFile
 *            path of the HTML file to render
 * @param outputFile
 *            path of the PDF file to write
 * @return the result of the File-based overload, or {@code false} when either
 *         path is blank
 * @throws Exception
 *             if the delegated conversion fails
 */
private static boolean convertHtmlToPdf(String inputFile, String outputFile)
        throws Exception {
    boolean bothPresent = !org.apache.commons.lang3.StringUtils.isAnyBlank(
            inputFile, outputFile);
    if (bothPresent) {
        return convertHtmlToPdf(new File(inputFile), new File(outputFile));
    }
    logger.info("html轉pdf是,路徑爲空,inputFile={},outputFile={}", inputFile,
            outputFile);
    return false;
}
/**
 * Copies a range of pages of a PDF into a new PDF file.
 * <p>
 * The range starts at page {@code from} (1-based, inclusive) and stops before
 * page {@code end} (exclusive), so {@code (1, 2)} copies the first page only.
 * The range is clamped to the pages the document actually has. Failures are
 * logged and the method returns normally.
 *
 * @param respdfFile
 *            path of the PDF to split
 * @param savepath
 *            path of the new PDF
 * @param from
 *            first page to copy (1-based, inclusive)
 * @param end
 *            page to stop before (exclusive); 0 means "through the last
 *            page"
 */
private static void splitPDFFile(String respdfFile, String savepath,
        int from, int end) {
    PdfReader reader = null;
    try {
        reader = new PdfReader(respdfFile);
        int pageCount = reader.getNumberOfPages();
        int firstPage = Math.max(from, 1);
        // The bound is exclusive, so "through the last page" is pageCount + 1;
        // using pageCount here would silently drop the last page.
        int endExclusive = (end == 0 || end > pageCount + 1)
                ? pageCount + 1
                : end;
        if (firstPage >= endExclusive) {
            logger.warn(
                    "splitPDFFile: empty page range, file={}, from={}, end={}, pages={}",
                    respdfFile, from, end, pageCount);
            return;
        }
        try (OutputStream out = new FileOutputStream(savepath)) {
            com.itextpdf.text.Document document = new com.itextpdf.text.Document(
                    reader.getPageSize(firstPage));
            PdfCopy copy = new PdfCopy(document, out);
            document.open();
            for (int page = firstPage; page < endExclusive; page++) {
                document.newPage();
                PdfImportedPage imported = copy
                        .getImportedPage(reader, page);
                copy.addPage(imported);
            }
            document.close();
        }
    } catch (IOException | DocumentException e) {
        logger.error("splitPDFFile failed, source={}, target={}",
                respdfFile, savepath, e);
    } finally {
        if (reader != null) {
            reader.close();
        }
    }
}
}
其中.doc文檔轉html的方法與其他做法一樣,只是轉完html後需要用jsoup再轉一遍xhtml,使標籤嚴謹化,然後再轉pdf,轉pdf時加入中文字體支持。
如果報找不到方法的異常,可能是jar包版本的問題,可以將<!-- <dependency>
<groupId>com.lowagie</groupId>
<artifactId>itext</artifactId>
<version>2.0.8</version>
</dependency> -->這個依賴放開試試。我開始的時候遇見過這個異常,後來隨着導入的依賴增多,這個依賴註釋掉也不會有這個異常了,可能是其他的依賴裏已經帶有2.0.8版本的itext的jar包;但不確定你的其他依賴裏是否存在,故此說明。
另外附上文件simsun.ttc百度雲下載地址:
鏈接:https://pan.baidu.com/s/1iH4iqJB2X_0gB7T4_CClzA
提取碼:7rmn