Java 抓取網頁中的 PDF 連結並下載

package jsinfo.com.yxp;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal crawler: scans a single web page for anchor tags that link to
 * {@code .pdf} files and downloads each linked file into a local directory.
 *
 * <p>Not thread-safe: {@link #urllist} is shared, unsynchronised static state.
 */
public class UrlSpeder {

    /** Directory the downloaded PDF files are written to. */
    private static final String DOWNLOAD_DIR = "d:/pdf/";

    /** Size of the copy buffer used while downloading. */
    private static final int BUFFER_SIZE = 8192;

    /**
     * Matches an {@code <a ...>} tag whose quoted {@code href} ends in
     * {@code .pdf}. Group 1 is the quote character, group 2 the href value.
     * The value may not contain quotes, so a later attribute that also ends
     * in ".pdf" can no longer be swallowed into the captured link.
     */
    private static final Pattern PDF_LINK = Pattern.compile(
            "<a\\b[^<>]*?\\shref\\s*=\\s*([\"'])([^\"'<>]*\\.pdf)\\1",
            Pattern.CASE_INSENSITIVE);

    /**
     * Absolute download URLs of every PDF link discovered so far. Entries
     * accumulate across calls to {@link #analyse(String)}.
     */
    static List<String> urllist = new ArrayList<String>();

    /**
     * Runs {@link #analyse(String)} on every command-line argument.
     * With no arguments nothing happens.
     *
     * @param args page URLs to scan
     */
    public static void main(String[] args) {
        for (String site : args) {
            analyse(site);
        }
    }

    /**
     * Fetches the page at {@code site}, records every PDF link found on it in
     * {@link #urllist}, and downloads those PDFs into {@code d:/pdf/}.
     *
     * <p>Relative links are resolved against {@code site}. Only the links
     * found by this call are downloaded, and a failure on one file does not
     * stop the remaining downloads. Errors are reported on standard error.
     *
     * @param site URL of the page to scan
     * @return always {@code null} (kept for compatibility with existing callers)
     */
    public static String analyse(String site) {
        try {
            URL pageUrl = new URL(site);
            List<URL> pdfLinks = collectPdfLinks(pageUrl);

            File targetDir = new File(DOWNLOAD_DIR);
            if (!targetDir.isDirectory() && !targetDir.mkdirs()) {
                throw new IOException("Cannot create download directory: " + targetDir);
            }

            for (URL pdf : pdfLinks) {
                try {
                    download(pdf, targetDir);
                } catch (IOException e) {
                    // Keep going: one broken link must not abort the others.
                    System.err.println("Failed to download " + pdf);
                    e.printStackTrace();
                }
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /** Reads the page line by line and returns the resolved URL of each PDF link on it. */
    private static List<URL> collectPdfLinks(URL pageUrl) throws IOException {
        List<URL> found = new ArrayList<URL>();
        // Decode as UTF-8 explicitly so the result does not depend on the platform default.
        try (InputStream in = pageUrl.openStream();
             BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                Matcher matcher = PDF_LINK.matcher(line);
                while (matcher.find()) {
                    String href = matcher.group(2);
                    try {
                        // Resolving against the page URL makes relative hrefs downloadable.
                        URL resolved = new URL(pageUrl, href);
                        found.add(resolved);
                        urllist.add(resolved.toString());
                    } catch (MalformedURLException e) {
                        System.err.println("Skipping unusable link: " + href);
                        e.printStackTrace();
                    }
                }
            }
        }
        return found;
    }

    /** Copies the content behind {@code pdf} into a file inside {@code targetDir}. */
    private static void download(URL pdf, File targetDir) throws IOException {
        File target = new File(targetDir, fileNameOf(pdf.toString()));
        try (InputStream in = pdf.openStream();
             OutputStream out = new FileOutputStream(target)) {
            byte[] buffer = new byte[BUFFER_SIZE];
            int len;
            while ((len = in.read(buffer)) != -1) {
                out.write(buffer, 0, len);
            }
        }
    }

    /**
     * Derives a local file name from a URL: the text after the last '/', with
     * backslashes and other characters that are illegal in Windows file names
     * replaced by '_' so the name cannot escape the download directory.
     */
    private static String fileNameOf(String url) {
        String name = url.substring(url.lastIndexOf('/') + 1);
        return name.replaceAll("[\\\\:*?\"<>|]", "_");
    }
}

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章