CSDN 博客正文中的圖片一般不能直接複製,需要先複製到微信等工具再粘貼出來,比較麻煩。
因此寫了個工具類,方便下載,並可以方便地生成 HTML 和 Word 文件。
需要 jsoup、POI 和 commons-io。
package test.test2019;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

/**
 * Downloads the article body of a CSDN blog post and stores it locally, either as a
 * standalone HTML file or as a {@code .doc} file (the HTML wrapped in an OLE2 container).
 *
 * <p>Created by admin on 2019/1/15.
 */
public class JsoupTest {

    /** Timeout, in milliseconds, for fetching the article page. */
    private static final int PAGE_TIMEOUT_MILLIS = 10000;

    /**
     * Fetches the page at {@code uriStr} and builds a self-contained HTML document from it:
     * the page's inline {@code <style>} elements, the content of every linked stylesheet
     * (inlined as {@code <style>}), a heading showing the source address, and the
     * {@code .blog-content-box} element of the page.
     *
     * <p>This is best-effort: if the page itself cannot be fetched or parsed, the stack trace
     * is printed and whatever was assembled so far is returned (possibly an incomplete
     * document). A stylesheet that cannot be downloaded is skipped without aborting the rest.
     *
     * @param uriStr absolute address of the article page
     * @return the assembled HTML text
     */
    private static String readHtml(String uriStr) {
        StringBuilder sb = new StringBuilder();
        sb.append("<html><head>");
        try {
            URI uri = new URI(uriStr);
            Document doc = Jsoup.parse(uri.toURL(), PAGE_TIMEOUT_MILLIS);
            sb.append(doc.select("style").outerHtml());
            Elements stylesheetLinks = doc.select("link[rel=\"stylesheet\"]");
            for (Element link : stylesheetLinks) {
                appendInlinedStylesheet(sb, link);
            }
            sb.append("<style type=\"text/css\">");
            sb.append("#article_content,.fontclass{font-family:\"Microsoft YaHei\"}");
            sb.append("</style>");
            sb.append("</head><body>");
            sb.append("<h1 class='title-article'>下載地址:<b>").append(uriStr).append("</b></h1>");
            // addClass rather than toggleClass: toggling would strip the class from any
            // paragraph that already carries it.
            doc.select("#article_content p").addClass("fontclass");
            sb.append(doc.select(".blog-content-box").outerHtml());
            sb.append("</body></html>");
        } catch (Exception e) {
            // Best-effort boundary: report the failure and return what has been built so far.
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Downloads the stylesheet referenced by a {@code <link>} element and appends its content
     * to {@code sb} as an inline {@code <style>} element. Nothing is appended when the link
     * has no resolvable address or the download fails.
     *
     * @param sb   buffer receiving the generated markup
     * @param link the {@code <link rel="stylesheet">} element
     */
    private static void appendInlinedStylesheet(StringBuilder sb, Element link) {
        // Resolve against the page's base URI so relative and protocol-relative hrefs work.
        String url = link.absUrl("href");
        if (url.isEmpty()) {
            return;
        }
        String css;
        try {
            // Download first so a failure never leaves an unclosed <style> tag in the output.
            css = IOUtils.toString(new URI(url), "UTF-8");
        } catch (IOException | URISyntaxException | IllegalArgumentException e) {
            System.err.println("Skipping stylesheet that could not be downloaded: " + url);
            e.printStackTrace();
            return;
        }
        sb.append("<style type=\"text/css\" url='");
        sb.append(url);
        sb.append("'>");
        sb.append(css);
        sb.append("</style>");
        sb.append("\r\n");
    }

    /**
     * Writes {@code html} into {@code file} as a {@code .doc}: the encoded HTML bytes are
     * stored as the {@code WordDocument} stream of a new OLE2 (POIFS) container. Missing
     * parent directories are created first.
     *
     * @param file     destination file
     * @param html     HTML text to store
     * @param encoding name of the charset used to encode {@code html}
     * @return {@code true} if the file was written, {@code false} if an I/O error occurred
     *         (the stack trace is printed in that case)
     */
    public static boolean writeDocFile(File file, String html, String encoding) {
        File fileDir = file.getParentFile();
        // getParentFile() is null for a bare file name; nothing to create then.
        if (fileDir != null && !fileDir.exists()) {
            fileDir.mkdirs();
        }
        try {
            byte[] content = html.getBytes(encoding);
            try (POIFSFileSystem poifs = new POIFSFileSystem()) {
                DirectoryEntry directory = poifs.getRoot();
                directory.createDocument("WordDocument", new ByteArrayInputStream(content));
                // Open the target only once the container is ready, and always close it.
                try (FileOutputStream ostream = new FileOutputStream(file)) {
                    poifs.writeFilesystem(ostream);
                }
            }
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Extracts the article body from a CSDN page and saves it as a Word ({@code .doc}) file.
     * Failures are reported on standard error; no exception is propagated for I/O errors.
     *
     * @param uriStr   absolute address of the article page
     * @param wordFile destination {@code .doc} file
     */
    public static void writeCSDNWordFile(String uriStr, File wordFile) {
        writeDocFile(wordFile, readHtml(uriStr), "UTF-8");
    }

    /**
     * Extracts the article body from a CSDN page and saves it as a UTF-8 HTML file.
     *
     * @param uriStr    absolute address of the article page
     * @param localFile destination HTML file
     * @throws IOException if the file cannot be written
     */
    public static void writeCSDNHtmlFile(String uriStr, File localFile) throws IOException {
        FileUtils.writeStringToFile(localFile, readHtml(uriStr), "UTF-8");
    }

    /**
     * Sample run: downloads one article and stores it as both HTML and Word files under
     * {@code D:/test/word}.
     *
     * @param args unused
     * @throws IOException        if the HTML file cannot be written
     * @throws URISyntaxException declared for compatibility; not thrown by the current code
     */
    public static void main(String[] args) throws IOException, URISyntaxException {
        String html = "D:/test/word/jxl-excel.html";
        String doc = "D:/test/word/jxl-excel.doc";
        String uri = "https://blog.csdn.net/a1091662876/article/details/87722035";
        writeCSDNHtmlFile(uri, new File(html));
        writeCSDNWordFile(uri, new File(doc));
    }
}