將HTML轉換爲PDF
參考:http://swordshadow.iteye.com/blog/1983935
package org.zbq.html2pdf;
import java.io.FileOutputStream;
import java.io.OutputStream;
import org.xhtmlrenderer.pdf.ITextRenderer;
import com.lowagie.text.pdf.BaseFont;
/**
 * Renders an (X)HTML document to a PDF file with xhtmlrenderer's
 * {@link ITextRenderer}.
 *
 * <p>A SimSun font is registered before layout so that Chinese text can be
 * rendered; the font path is Windows-specific.
 */
public class Html2Pdf {
    /** Font added to the renderer for Chinese glyph support. */
    private static final String CJK_FONT_PATH = "c:/Windows/Fonts/simsun.ttc";

    private final String inputFile;
    private final String outputFile;

    /**
     * @param inputFile  location of the source document, passed unchanged to
     *                   {@code ITextRenderer.setDocument(String)}
     * @param outputFile path of the PDF file to write
     */
    public Html2Pdf(String inputFile, String outputFile) {
        this.inputFile = inputFile;
        this.outputFile = outputFile;
    }

    /**
     * Loads the input document, lays it out and writes the PDF.
     *
     * @throws Exception if the document cannot be loaded, the font cannot be
     *                   registered, or the PDF cannot be written
     */
    public void createPdf() throws Exception {
        // step 1: load the document
        ITextRenderer renderer = new ITextRenderer();
        renderer.setDocument(inputFile);

        // step 2: register a font so Chinese characters are supported
        org.xhtmlrenderer.pdf.ITextFontResolver fontResolver = renderer
                .getFontResolver();
        fontResolver.addFont(CJK_FONT_PATH,
                BaseFont.IDENTITY_H, BaseFont.NOT_EMBEDDED);
        renderer.layout();

        // Open the output only after loading and layout succeeded, so a bad
        // input does not leave an empty PDF behind, and always close the
        // stream even when PDF generation throws.
        OutputStream os = new FileOutputStream(outputFile);
        try {
            renderer.createPDF(os);
        } finally {
            os.close();
        }
        System.out.println("create pdf done!!");
    }

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]} is the input document, {@code args[1]} the
     *             output PDF path
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: Html2Pdf <input.html> <output.pdf>");
            System.exit(1);
        }
        Html2Pdf app = new Html2Pdf(args[0], args[1]);
        app.createPdf();
    }
}
maven pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build descriptor for the html2pdf command-line tool. -->
<modelVersion>4.0.0</modelVersion>
<!-- Project coordinates; the artifact is packaged as a plain jar. -->
<groupId>org.zbq</groupId>
<artifactId>html2pdf</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>html2pdf</name>
<url>http://maven.apache.org</url>
<properties>
<!-- Source files are read as UTF-8. -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<build>
<!-- NOTE(review): all plugins are configured under pluginManagement only;
     confirm this is intended rather than a plain <plugins> section. -->
<pluginManagement>
<plugins>
<!-- Surefire is configured with skip=true, i.e. unit tests are skipped. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<skip>true</skip>
</configuration>
</plugin>
<!-- Jar plugin: writes a manifest with a Class-Path (entries prefixed with
     lib/) and org.zbq.html2pdf.Html2Pdf as the main class. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>2.4</version>
<configuration>
<archive>
<manifest>
<addClasspath>true</addClasspath>
<classpathPrefix>lib/</classpathPrefix>
<mainClass>org.zbq.html2pdf.Html2Pdf</mainClass>
</manifest>
</archive>
</configuration>
</plugin>
<!-- Assembly plugin: uses the jar-with-dependencies descriptor and sets the
     same main class in the manifest. No plugin version is pinned here. -->
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<archive>
<manifest>
<mainClass>org.zbq.html2pdf.Html2Pdf</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
</plugin>
</plugins>
</pluginManagement>
</build>
<dependencies>
<!-- JUnit, test scope only. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<scope>test</scope>
</dependency>
<!-- iText: provides com.lowagie.text.pdf.BaseFont used by Html2Pdf. -->
<dependency>
<groupId>com.lowagie</groupId>
<artifactId>itext</artifactId>
<version>2.0.8</version>
</dependency>
<!-- xhtmlrenderer core: provides org.xhtmlrenderer.pdf.ITextRenderer. -->
<dependency>
<groupId>org.xhtmlrenderer</groupId>
<artifactId>core-renderer</artifactId>
<version>R8</version>
</dependency>
</dependencies>
</project>
打包成jar包
執行以下命令:
mvn assembly:assembly
執行後可以生成 html2pdf-0.0.1-SNAPSHOT-jar-with-dependencies.jar
從CSDN上解析markdown文檔生成html
使用了BeautifulSoup,下載地址:http://www.crummy.com/software/BeautifulSoup/bs4/download/4.0/
BeautifulSoup文檔參考:http://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
# -*- encoding:utf-8 -*-
import sys
import os
import urllib
import httplib
from sgmllib import SGMLParser
from bs4 import BeautifulSoup
# Download from http://www.crummy.com/software/BeautifulSoup/bs4/download/4.0/
# Document http://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
# Template for the <head> section of each generated HTML file.
# The single %s placeholder is filled with the article title; the meta tag
# declares the document as UTF-8.
head = '''
<head>
<meta content="no-siteapp" http-equiv="Cache-Control"/>
<title>%s</title>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
</head>
'''
def getHtmlfromFilepath(filepath, name):
    """Download one article from blog.csdn.net and save it as ``<name>.html``.

    The element with id ``article_content`` is extracted from the page and
    wrapped in a minimal XHTML document whose <head> comes from the
    module-level ``head`` template.

    filepath -- request path on blog.csdn.net,
                e.g. "/zhubinqiang/article/details/47144759"
    name     -- article title; used both as the <title> text and as the
                output file name (without the ".html" suffix)

    Returns None. If the page has no ``article_content`` element a message is
    printed and no file is written.
    """
    # Local import so this function does not depend on changes to the
    # file-level import block.
    from xml.sax.saxutils import escape

    host = 'blog.csdn.net'
    conn = httplib.HTTPConnection(host)
    try:
        conn.request("GET", filepath)
        r1 = conn.getresponse()
        print("%s %s" % (r1.status, r1.reason))
        html = r1.read()
    finally:
        # Release the socket even when the request or read fails.
        conn.close()

    soup = BeautifulSoup(html)
    content = soup.find(id='article_content')
    if content is None:
        # Previously the literal text "None" was written into the HTML file.
        print("no article_content found in %s, skipping" % filepath)
        return

    with open('%s.html' % (name), 'w') as f:
        # Public identifier and DTD URL now both refer to XHTML 1.0 Transitional.
        f.write('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">')
        f.write('<html xmlns="http://www.w3.org/1999/xhtml">')
        # Escape &, < and > so a title containing them keeps the file
        # well-formed for the XHTML-to-PDF step.
        h = head % (escape(name))
        f.write(h.encode('utf8'))
        f.write('<body>')
        f.write(str(content))
        f.write('</body>')
        f.write('</html>')
def getArticleUrls(page=1):
    """Return the articles listed on one page of the blog's article index.

    Fetches http://blog.csdn.net/zhubinqiang/article/list/<page> and collects
    the first link found inside every <h1> element.

    page -- index page number to fetch (defaults to 1, the original behavior)

    Returns a list of (href, title) tuples, where title is the stripped link
    text. <h1> elements without a link (or without an href) are skipped.
    """
    l = []
    url = 'blog.csdn.net'
    conn = httplib.HTTPConnection(url)
    try:
        conn.request("GET", "/zhubinqiang/article/list/%d" % page)
        r1 = conn.getresponse()
        print("%s %s" % (r1.status, r1.reason))
        html = r1.read()
    finally:
        # Release the socket even when the request or read fails.
        conn.close()

    soup = BeautifulSoup(html)
    for h1 in soup.find_all('h1'):
        a = h1.find('a')
        # A heading without a link would otherwise raise TypeError/KeyError.
        if a is None or not a.get('href'):
            continue
        href = a['href']
        title = a.get_text().strip()
        l.append((href, title))
        print("[%s], [%s]" % (href, title))
    return l
def html2pdf(inputfile, outputfile):
    """Convert an HTML file to PDF by running the html2pdf jar with java.

    inputfile  -- path of the HTML file to convert
    outputfile -- path of the PDF file to create

    Returns the exit code of the java process, or None when java could not
    be started. The jar is looked up relative to the current directory.
    """
    # Local import so this function does not depend on changes to the
    # file-level import block.
    import subprocess

    print("%s %s" % (inputfile, outputfile))
    print("java -jar html2pdf-0.0.1-SNAPSHOT-jar-with-dependencies.jar %s %s" % (inputfile, outputfile))
    # Pass the arguments as a list instead of building a shell command line:
    # file names come from article titles and may contain quotes or other
    # shell metacharacters.
    cmd = ['java', '-jar', 'html2pdf-0.0.1-SNAPSHOT-jar-with-dependencies.jar',
           inputfile, outputfile]
    try:
        return subprocess.call(cmd)
    except OSError as e:
        # os.system never raised when java was missing; keep that best-effort
        # behavior so one failure does not abort the caller's loop.
        print("failed to run java: %s" % e)
        return None
if __name__ == '__main__':
    # For every article on the index page: save it as <title>.html, then
    # convert that file to <title>.pdf.
    #
    # The file name handed to the java process must be encoded the same way
    # as the file created by open(); use the filesystem encoding instead of a
    # hard-coded 'gbk', which is only right on GBK-locale Windows. 'gbk' is
    # kept as the fallback when the encoding is unknown.
    fs_encoding = sys.getfilesystemencoding() or 'gbk'
    for filepath, name in getArticleUrls():
        print("%s %s" % (name, filepath))
        getHtmlfromFilepath(filepath, name)
        encoded_name = name.encode(fs_encoding)
        html2pdf(inputfile='%s.html' % (encoded_name), outputfile='%s.pdf' % (encoded_name))
Note: 注意python的字符編碼:寫入HTML文件時標題使用utf8編碼,而傳給java命令行的文件名使用gbk編碼(中文Windows環境)。