HTML轉化爲PDF

將HTML轉化爲PDF

參考於http://swordshadow.iteye.com/blog/1983935

package org.zbq.html2pdf;

import java.io.FileOutputStream;
import java.io.OutputStream;

import org.xhtmlrenderer.pdf.ITextRenderer;

import com.lowagie.text.pdf.BaseFont;

/**
 * Converts an (X)HTML document into a PDF file with the Flying Saucer
 * {@link ITextRenderer}.
 *
 * <p>Usage: {@code Html2Pdf <input> <output.pdf>}. The input string is handed
 * unchanged to {@code ITextRenderer.setDocument(String)}.
 */
public class Html2Pdf {
    /**
     * Font registered with the renderer so that Chinese text can be drawn.
     * The path is Windows-specific (SimSun).
     */
    private static final String CHINESE_FONT_PATH = "c:/Windows/Fonts/simsun.ttc";

    private final String inputFile;
    private final String outputFile;

    /**
     * @param inputFile  location of the document to render
     * @param outputFile path of the PDF file to create
     */
    public Html2Pdf(String inputFile, String outputFile) {
        this.inputFile = inputFile;
        this.outputFile = outputFile;
    }

    /**
     * Loads the input document, lays it out and writes the PDF.
     *
     * <p>The output file is opened only after the document has been loaded
     * and laid out, so a broken input does not leave an empty PDF behind,
     * and the stream is always closed, even when rendering fails.
     *
     * @throws Exception if loading, font registration, layout or writing fails
     */
    public void createPdf() throws Exception {
        // step 1: load the document
        ITextRenderer renderer = new ITextRenderer();
        renderer.setDocument(inputFile);

        // step 2: register a font that covers Chinese glyphs
        org.xhtmlrenderer.pdf.ITextFontResolver fontResolver = renderer
                .getFontResolver();
        fontResolver.addFont(CHINESE_FONT_PATH,
                BaseFont.IDENTITY_H, BaseFont.NOT_EMBEDDED);

        renderer.layout();

        // step 3: write the PDF; try/finally (rather than try-with-resources)
        // keeps the class compilable at old source levels.
        OutputStream os = new FileOutputStream(outputFile);
        try {
            renderer.createPDF(os);
        } finally {
            os.close();
        }

        System.out.println("create pdf done!!");
    }

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]} is the input document, {@code args[1]} the
     *             PDF file to create
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: Html2Pdf <input.html> <output.pdf>");
            System.exit(1);
        }
        Html2Pdf app = new Html2Pdf(args[0], args[1]);
        app.createPdf();
    }

}

maven pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Maven coordinates of this project; the artifact is packaged as a jar -->
    <groupId>org.zbq</groupId>
    <artifactId>html2pdf</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>html2pdf</name>
    <url>http://maven.apache.org</url>

    <properties>
        <!-- source files are read as UTF-8 -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <build>
        <!-- Plugin settings are declared under pluginManagement: they serve as
             the configuration used whenever one of these plugins runs. -->
        <pluginManagement>
            <plugins>
                <!-- skip unit testing -->
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-surefire-plugin</artifactId>
                    <configuration>
                        <skip>true</skip>
                    </configuration>
                </plugin>

                <!-- generate jar package -->
                <!-- The manifest names Html2Pdf as the main class and adds a
                     classpath whose entries are prefixed with lib/ -->
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-jar-plugin</artifactId>
                    <version>2.4</version>
                    <configuration>
                        <archive>
                            <manifest>
                                <addClasspath>true</addClasspath>
                                <classpathPrefix>lib/</classpathPrefix>
                                <mainClass>org.zbq.html2pdf.Html2Pdf</mainClass>
                            </manifest>
                        </archive>
                    </configuration>
                </plugin>

                <!-- Build one runnable jar that also contains the dependencies
                     (predefined jar-with-dependencies descriptor), with
                     Html2Pdf as the main class -->
                <plugin>
                    <artifactId>maven-assembly-plugin</artifactId>
                    <configuration>
                        <archive>
                            <manifest>
                                <mainClass>org.zbq.html2pdf.Html2Pdf</mainClass>
                            </manifest>
                        </archive>
                        <descriptorRefs>
                            <descriptorRef>jar-with-dependencies</descriptorRef>
                        </descriptorRefs>
                    </configuration>
                </plugin>
            </plugins>
        </pluginManagement>
    </build>

    <dependencies>
        <!-- test-only dependency -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>

        <!-- iText: supplies the com.lowagie.text.pdf classes (BaseFont) -->
        <dependency>
            <groupId>com.lowagie</groupId>
            <artifactId>itext</artifactId>
            <version>2.0.8</version>
        </dependency>

        <!-- Flying Saucer renderer (org.xhtmlrenderer.*) -->
        <dependency>
            <groupId>org.xhtmlrenderer</groupId>
            <artifactId>core-renderer</artifactId>
            <version>R8</version>
        </dependency>

    </dependencies>
</project>

打包成jar包

mvn assembly:assembly

(注意:maven-assembly-plugin 3.x 已移除 assembly:assembly 目標;若使用較新的插件版本,請改用 mvn package assembly:single)

可以生成html2pdf-0.0.1-SNAPSHOT-jar-with-dependencies.jar

從CSDN上解析markdown文檔生成html

使用了BeautifulSoup,下載地址:http://www.crummy.com/software/BeautifulSoup/bs4/download/4.0/
BeautifulSoup文檔參考:http://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/

# -*- encoding:utf-8 -*-

import sys
import os
import urllib
import httplib
from sgmllib import SGMLParser 
from bs4 import BeautifulSoup


# Download from http://www.crummy.com/software/BeautifulSoup/bs4/download/4.0/
# Document http://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/


# XHTML <head> template written at the top of every generated page.
# The single %s placeholder is filled in with the article title, and the
# meta tag declares the page as UTF-8.
head = '''
<head>

<meta content="no-siteapp" http-equiv="Cache-Control"/>
<title>%s</title>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>

</head>
'''

def getHtmlfromFilepath(filepath, name):
    """Download one article from blog.csdn.net and save it as ``<name>.html``.

    filepath -- request path of the article on blog.csdn.net, e.g.
                "/zhubinqiang/article/details/47144759"
    name     -- article title; used as the page <title> and as the base name
                of the file that is written

    The element with id ``article_content`` is extracted and wrapped in a
    minimal XHTML page. When the page has no such element nothing is written
    and a message is printed instead. Returns None.
    """
    host = 'blog.csdn.net'
    conn = httplib.HTTPConnection(host)
    try:
        conn.request("GET", filepath)
        r1 = conn.getresponse()
        print("%s %s" % (r1.status, r1.reason))
        html = r1.read()
    finally:
        # release the socket even when the request or the read fails
        conn.close()

    soup = BeautifulSoup(html)
    content = soup.find(id='article_content')
    if content is None:
        # e.g. an error page: str(None) would end up in the file otherwise
        print("no article_content found in %s, nothing written" % filepath)
        return

    # The title is placed inside XHTML markup, so the XML special characters
    # have to be escaped or the page becomes unparsable.
    title = name.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

    with open('%s.html' %(name), 'w') as f:
        # public identifier and DTD URL must both refer to the same DTD
        f.write('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">')
        f.write('<html xmlns="http://www.w3.org/1999/xhtml">')
        h = head %(title)
        f.write(h.encode('utf8'))
        f.write(str(content))
        f.write('</html>')


def getArticleUrls(list_path="/zhubinqiang/article/list/1"):
    """Return ``(href, title)`` pairs for the articles on one list page.

    list_path -- request path of the article list on blog.csdn.net; the
                 default is the first list page of the blog this script was
                 written for.

    Every <h1> of the page is inspected and the href and stripped text of its
    first <a> are collected. Headings without a link are skipped.
    """
    l = []
    url = 'blog.csdn.net'
    conn = httplib.HTTPConnection(url)
    try:
        conn.request("GET", list_path)
        r1 = conn.getresponse()
        print("%s %s" % (r1.status, r1.reason))
        html = r1.read()
    finally:
        # release the socket even when the request or the read fails
        conn.close()

    soup = BeautifulSoup(html)
    for h1 in soup.find_all('h1'):
        a = h1.find('a')
        if a is None or a.get('href') is None:
            # a heading that is not an article link
            continue
        href = a['href']
        title = a.get_text().strip()
        l.append((href, title))
        print("[%s], [%s]" %(href, title))
    return l


def html2pdf(inputfile, outputfile):
    """Convert ``inputfile`` (HTML) to ``outputfile`` (PDF) with the Java tool.

    Runs html2pdf-0.0.1-SNAPSHOT-jar-with-dependencies.jar from the current
    directory. The file names are passed as separate arguments instead of
    being pasted into a shell command line, so quotes or shell metacharacters
    in an article title cannot break or alter the command.

    Returns the exit status of the java process, or -1 when java could not be
    started.
    """
    import subprocess

    jar = 'html2pdf-0.0.1-SNAPSHOT-jar-with-dependencies.jar'
    print("%s %s" % (inputfile, outputfile))
    print("java -jar %s %s %s" % (jar, inputfile, outputfile))
    try:
        return subprocess.call(['java', '-jar', jar, inputfile, outputfile])
    except OSError as e:
        # keep going with the remaining articles, as os.system would have
        print("could not start java: %s" % e)
        return -1


if __name__ == '__main__':
    # Download every article of the list page, save it as <title>.html and
    # convert that file to <title>.pdf.
    #
    # The titles are unicode; the names handed to the java command are
    # encoded with the file-system encoding so that they match the file
    # open() created ('gbk' is kept as the fallback when it is unknown).
    fs_encoding = sys.getfilesystemencoding() or 'gbk'
    for filepath, name in getArticleUrls():
        print("%s %s" % (name, filepath))
        getHtmlfromFilepath(filepath, name)
        encoded_name = name.encode(fs_encoding)
        html2pdf(inputfile='%s.html' %(encoded_name), outputfile='%s.pdf' %(encoded_name))

Note: 注意 Python 2 的字符編碼——文章標題是 unicode 字符串,作為文件名傳給 java 命令之前必須先編碼成字節串。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章