下面代码的具体功能是这样的:先用 ScheduledExecutorService 让 HttpURLConnection 按固定的时间间隔去获取网页内容,然后对得到的内容进行截取并写入文件,最后对文件的内容进行去重操作。哈哈……听起来好像很麻烦。我想对于大家来说,ScheduledExecutorService、HttpURLConnection 或许经常用到,至于后面的文件操作就没什么新意了。我之所以在这里都详细地写出来,是因为我对文件的操作一直都不是很熟悉,所以就想借此机会让自己再次学习一下 IO 流的操作,加深自己的印象!
ok!话有点多了,下面来看代码:
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
/**
 * Utility that periodically fetches a web page over HTTP, extracts a URL list
 * embedded in the page, appends the extracted URLs (with a timestamp) to a
 * file, and finally de-duplicates that file into a second file.
 */
public class HttpConnectionUtil {
    // DateTimeFormatter is immutable and thread-safe, unlike the shared
    // SimpleDateFormat it replaces; the output pattern is identical.
    private static final DateTimeFormatter TIMESTAMP_FORMAT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

    /**
     * Fetches the body of {@code url} decoded with the GB2312 charset.
     *
     * @param url the address to fetch
     * @return the page body with line terminators removed, or "" on any error
     */
    public static String getHttpContent(String url) {
        return getHttpContent(url, "GB2312");
    }

    /**
     * Fetches the body of {@code url} decoded with {@code charSet}.
     * Lines are concatenated without separators (matching the original
     * behavior, which downstream index-based extraction relies on).
     *
     * @param url     the address to fetch
     * @param charSet charset used to decode the response body
     * @return the concatenated page body, or "" on error / non-200 response
     */
    public static String getHttpContent(String url, String charSet) {
        HttpURLConnection connection = null;
        try {
            URL addressUrl = new URL(url);
            connection = (HttpURLConnection) addressUrl.openConnection();
            // Per-connection timeouts replace the deprecated
            // sun.net.client.* system properties the original set, which
            // silently affected every connection in the whole JVM.
            connection.setConnectTimeout(30000);
            connection.setReadTimeout(30000);
            if (connection.getResponseCode() == HttpURLConnection.HTTP_OK) {
                StringBuilder content = new StringBuilder();
                // try-with-resources closes the stream on every path
                // (the original leaked the reader).
                try (BufferedReader reader = new BufferedReader(
                        new InputStreamReader(connection.getInputStream(), charSet))) {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        content.append(line); // O(n) instead of O(n^2) "+="
                    }
                }
                return content.toString();
            }
        } catch (IOException e) { // MalformedURLException is an IOException
            e.printStackTrace();
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
        return "";
    }

    /**
     * De-duplicates the URLs recorded in {@code insertFile} and writes the
     * unique entries, one per line, to {@code outfile}. Only lines containing
     * "'https:" are considered; the tail of each such line is split on ','
     * and collected into a Set, which discards duplicates.
     *
     * @param insertFile path of the file holding all crawled results
     * @param outfile    path of the file receiving the de-duplicated results
     */
    public static void readFile(String insertFile, String outfile) {
        Set<String> uniqueUrls = new HashSet<String>();
        try {
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                    new FileInputStream(insertFile), "UTF8"))) {
                String temp;
                while ((temp = reader.readLine()) != null) {
                    int start = temp.indexOf("'https:");
                    if (start != -1) {
                        // Drop the timestamp prefix, then split the URL list.
                        for (String url : temp.substring(start).split(",")) {
                            uniqueUrls.add(url);
                        }
                    }
                }
            }
            createFile(outfile); // ensure the output file exists
            try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream(outfile), "UTF8"))) {
                for (String url : uniqueUrls) {
                    out.write(url);
                    out.newLine();
                }
            }
        } catch (Exception e) {
            System.out.println("readFile errors:" + e.getMessage());
        }
    }

    /**
     * Creates {@code insert} as an empty file if it does not already exist.
     *
     * @param insert path of the file to create
     * @throws IOException if the file cannot be created
     */
    public static void createFile(String insert) throws IOException {
        File file = new File(insert);
        if (!file.exists()) {
            file.createNewFile();
            System.out.println("文件创建成功!");
        }
    }

    /**
     * Polls the target page once per minute for 24 hours, extracting the
     * "var b_urls" array from the page and appending it, prefixed with a
     * timestamp, to {@code insert}.
     *
     * @param insert path of the file that accumulates all crawled results
     */
    public static void getFile(final String insert) {
        ScheduledExecutorService execService = null;
        try {
            String catalog = insert.substring(0, insert.lastIndexOf("\\"));
            File dir = new File(catalog);
            // mkdirs() also creates missing parent directories; mkdir()
            // fails outright when the parent does not exist.
            if (!dir.exists() && dir.mkdirs()) {
                System.out.println("文件夹创建成功!");
            }
            createFile(insert); // ensure the result file exists
            execService = Executors.newSingleThreadScheduledExecutor();
            execService.scheduleWithFixedDelay(new Runnable() {
                public void run() {
                    String content = HttpConnectionUtil
                            .getHttpContent("https://s3.amazonaws.com/cdtimes/index.html");
                    // Guard every index: an uncaught exception here would
                    // silently cancel the scheduled task for good.
                    int begin = content.indexOf("var b_urls");
                    int end = content.indexOf("b_urls = shuffle(b_urls)");
                    if (begin == -1 || end == -1 || begin > end) {
                        return; // page layout changed or fetch failed
                    }
                    String str = content.substring(begin, end);
                    int open = str.indexOf("['");
                    int close = str.indexOf("']");
                    if (open == -1 || close == -1) {
                        return;
                    }
                    // Keep the bracketed list including the closing quote.
                    str = str.substring(open + 1, close + 1);
                    System.out.println(str);
                    AppendFile(insert, LocalDateTime.now().format(TIMESTAMP_FORMAT) + " " + str);
                }
            }, 1, 1, TimeUnit.MINUTES); // crawl once per minute
            Thread.sleep(24 * 3600 * 1000L); // let the crawl run for 24 hours
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // restore interrupt status
        } catch (Exception e) {
            // Was mislabeled "readFile errors:" in the original (copy-paste).
            System.out.println("getFile errors:" + e.getMessage());
        } finally {
            if (execService != null) {
                // The original skipped shutdown() whenever an exception was
                // thrown, leaking a non-daemon scheduler thread.
                execService.shutdown();
            }
        }
    }

    /**
     * Appends {@code content} plus a newline to {@code fileName}, creating
     * the file if needed. Writes UTF-8 so that {@link #readFile}, which reads
     * the file back as UTF-8, always decodes it correctly (the original's
     * FileWriter used the platform-default charset).
     *
     * @param fileName path of the file to append to
     * @param content  the line to append
     */
    public static void AppendFile(String fileName, String content) {
        // The second FileOutputStream argument "true" selects append mode.
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(fileName, true), "UTF8"))) {
            writer.write(content);
            writer.newLine();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Entry point. Optional arguments (useful when running from a jar):
     * args[0] = file collecting all results, args[1] = file for the
     * de-duplicated results; defaults under d:\HttpConnection otherwise.
     */
    public static void main(String[] args) throws IOException {
        String insert;
        String outfile;
        // Require BOTH args: the original tested args.length != 0 but then
        // read args[1], throwing AIOOBE when exactly one arg was passed.
        if (args != null && args.length >= 2) {
            insert = args[0];  // all results
            outfile = args[1]; // de-duplicated results
        } else { // default files
            insert = "d:\\HttpConnection\\allResultFile.txt";
            outfile = "d:\\HttpConnection\\resultFile.txt";
        }
        System.out.println("开始获取全部信息...");
        getFile(insert); // crawl everything (blocks ~24h)
        System.out.println("全部信息获取成功!");
        System.out.println("全部结果内容存放文件:" + insert);
        System.out.println("开始对全部结果去重...");
        readFile(insert, outfile); // de-duplicate
        System.out.println("去重完成!");
        System.out.println("去重后结果内容存放文件:" + outfile);
    }
}