package jsinfo.com.yxp;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal crawler: reads one page, collects the anchors whose {@code href} ends in
 * {@code .pdf}, and downloads each linked file into a local directory.
 *
 * <p>The shared {@link #urllist} is a plain {@link ArrayList} and is not synchronized.
 */
public class UrlSpeder {

    /** Directory used by {@link #analyse(String)}, which takes no target directory. */
    private static final String DEFAULT_DOWNLOAD_DIR = "d:/pdf/";

    /** Size of the copy buffer used while downloading a file. */
    private static final int BUFFER_SIZE = 8192;

    /**
     * Matches an opening tag starting with {@code <a} that carries a quoted {@code href}
     * ending in {@code .pdf}; group 1 is the href value. Quotes are excluded from the
     * captured value so that a later attribute which also ends in {@code .pdf} on the
     * same tag is not swallowed into the link.
     */
    private static final Pattern PDF_LINK =
            Pattern.compile("<a[^<>]*href=['\"]([^<>'\"]*\\.pdf)['\"][^<>]*>");

    /**
     * Every PDF href found so far, exactly as written in the scanned pages (relative
     * links are stored unresolved). Grows with each call to {@code analyse}.
     */
    static List<String> urllist = new ArrayList<String>();

    /** Intentionally empty: nothing is crawled when the class is run directly. */
    public static void main(String[] args) {
    }

    /**
     * Scans {@code site} for PDF links and downloads them into {@code d:/pdf/}.
     *
     * @param site URL of the page to scan
     * @return always {@code null}; results are visible through {@link #urllist} and the
     *         downloaded files
     */
    public static String analyse(String site) {
        return analyse(site, DEFAULT_DOWNLOAD_DIR);
    }

    /**
     * Scans {@code site} for PDF links and downloads them into {@code targetDir}.
     *
     * <p>The page is read line by line, so a tag that spans several lines is not
     * detected. Only the links found by this call are downloaded; they are also
     * appended to {@link #urllist}. A failure to read the page or to create the
     * directory is printed to standard error and ends the call; a failure to download
     * one file is printed and the remaining files are still attempted.
     *
     * @param site      URL of the page to scan
     * @param targetDir directory that receives the files; created if missing, must not
     *                  be {@code null}
     * @return always {@code null}
     */
    public static String analyse(String site, String targetDir) {
        try {
            URL page = new URL(site);
            List<String> links = findPdfLinks(page);
            urllist.addAll(links);

            File dir = new File(targetDir);
            if (!dir.isDirectory() && !dir.mkdirs()) {
                throw new IOException("Cannot create download directory " + dir);
            }

            for (String href : links) {
                try {
                    download(page, href, dir);
                } catch (IOException e) {
                    // Keep going: one unreachable file should not cancel the others.
                    System.err.println("Failed to download " + href + " linked from " + site);
                    e.printStackTrace();
                }
            }
        } catch (IOException e) {
            // Also covers MalformedURLException, which is an IOException.
            e.printStackTrace();
        }
        return null;
    }

    /** Returns the PDF hrefs found on {@code page}, in document order. */
    private static List<String> findPdfLinks(URL page) throws IOException {
        List<String> links = new ArrayList<String>();
        // The page is decoded with the platform default charset.
        try (InputStream in = page.openStream();
             BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
            String line;
            while ((line = reader.readLine()) != null) {
                Matcher m = PDF_LINK.matcher(line);
                while (m.find()) {
                    links.add(m.group(1));
                }
            }
        }
        return links;
    }

    /** Copies the file behind {@code href} (resolved against {@code base}) into {@code dir}. */
    private static void download(URL base, String href, File dir) throws IOException {
        // Resolving against the page URL makes relative links work; absolute links
        // are unaffected by the base.
        URL source = new URL(base, href);
        File target = new File(dir, fileNameOf(href));
        try (InputStream in = source.openStream();
             OutputStream out = new FileOutputStream(target)) {
            byte[] buffer = new byte[BUFFER_SIZE];
            int read;
            while ((read = in.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
        }
    }

    /**
     * Returns the part of {@code href} after its last path separator. Backslashes count
     * as separators too, so the result never contains a directory component.
     */
    private static String fileNameOf(String href) {
        int cut = Math.max(href.lastIndexOf('/'), href.lastIndexOf('\\'));
        return href.substring(cut + 1);
    }
}