java讀取pdf

原創

2018-11-30 23:57

package jsinfo.com.yxp;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Scans a web page for anchor tags whose {@code href} ends in {@code .pdf}
 * and downloads each linked file into a local directory.
 *
 * <p>The page is scanned line by line, so an anchor tag that is split across
 * several lines of the page is not detected.
 */
public class UrlSpeder {

    /**
     * Every PDF href found so far, exactly as written in the scanned pages
     * (relative links are stored unresolved). Shared by all calls to
     * {@code analyse}; it is only ever appended to by this class.
     */
    static List<String> urllist = new ArrayList<String>();

    /** Directory used by {@link #analyse(String)}. */
    private static final String DEFAULT_DIR = "d:/pdf/";

    /** Copy buffer size in bytes. */
    private static final int BUFFER_SIZE = 8192;

    /** Matches an anchor tag whose href ends in ".pdf"; group 1 is the href value. */
    private static final Pattern PDF_LINK = Pattern.compile(
            "\\<a[^\\<|^\\>]*href=[\\'|\\\"]([^\\<|^\\>]*\\.pdf)[\\'|\\\"][^\\<|^\\>]*[\\>|\\/\\>]");

    public static void main(String[] args) {
        // Intentionally empty: call analyse(...) with the page to scan.
    }

    /**
     * Downloads every PDF linked from {@code site} into {@code d:/pdf/}.
     *
     * @param site absolute URL of the page to scan
     * @return always {@code null}; results are the files written to disk and
     *         the hrefs appended to {@code urllist}
     */
    public static String analyse(String site) {
        return analyse(site, DEFAULT_DIR);
    }

    /**
     * Downloads every PDF linked from {@code site} into {@code dir}.
     *
     * <p>Links are resolved against {@code site}, so relative hrefs work.
     * Only the links found by this call are downloaded. A link that cannot be
     * resolved or fetched is reported on standard error and skipped; the
     * remaining links are still processed.
     *
     * @param site absolute URL of the page to scan
     * @param dir  directory to save into (created if missing); {@code null}
     *             selects the default {@code d:/pdf/}
     * @return always {@code null}; results are the files written to disk and
     *         the hrefs appended to {@code urllist}
     */
    public static String analyse(String site, String dir) {
        URL base;
        List<String> found;
        try {
            // MalformedURLException is an IOException, so one handler covers both.
            base = new URL(site);
            found = readPdfLinks(base);
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }

        File outDir = new File(dir == null ? DEFAULT_DIR : dir);
        if (!outDir.isDirectory() && !outDir.mkdirs()) {
            System.err.println("Cannot create download directory: " + outDir);
            return null;
        }

        for (String href : found) {
            try {
                download(new URL(base, href), new File(outDir, fileNameOf(href)));
            } catch (IOException e) {
                // Best effort: a single broken link must not stop the others.
                e.printStackTrace();
            }
        }
        return null;
    }

    /**
     * Reads the page at {@code page} and returns the PDF hrefs it contains,
     * also appending each one to {@code urllist} as it is found.
     * The page is decoded with the platform default charset.
     */
    private static List<String> readPdfLinks(URL page) throws IOException {
        List<String> found = new ArrayList<String>();
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(page.openStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                Matcher m = PDF_LINK.matcher(line);
                while (m.find()) {
                    // Record the download address parsed out of the page.
                    found.add(m.group(1));
                    urllist.add(m.group(1));
                }
            }
        }
        return found;
    }

    /** Returns the part of {@code href} after its last path separator. */
    private static String fileNameOf(String href) {
        // Also cut at '\' so the name can never point outside the target directory.
        int cut = Math.max(href.lastIndexOf('/'), href.lastIndexOf('\\'));
        return href.substring(cut + 1);
    }

    /** Copies the content at {@code source} into {@code target}, closing both streams. */
    private static void download(URL source, File target) throws IOException {
        // The source is opened first so a dead link does not leave an empty file behind.
        try (InputStream in = source.openStream();
             OutputStream out = new FileOutputStream(target)) {
            byte[] buffer = new byte[BUFFER_SIZE];
            int len;
            while ((len = in.read(buffer)) != -1) {
                out.write(buffer, 0, len);
            }
        }
    }

}

發表評論

所有評論

還沒有人評論，想成為第一個評論的人嗎？請在上方評論欄輸入並點擊發佈。

java讀取pdf

freeswitch-callcenter 源碼邏輯

easy_sanic更便捷實用sanic，支持orm、restful

mysql5.7 官方文檔閱讀-Chapter 14 The InnoDB Storage Engine 14.1

封裝Async Python server sanic

sqlalchemy自定義壓縮字段

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結