package com.test;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URL;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Scrapes the "accredited institutions" listing pages of the China National
 * Accreditation Service for Conformity Assessment and appends the contents
 * of the links found on each page to a single text file.
 *
 * Author: liangjilong
 * Date: 2014-8-27
 * Email: [email protected]
 */
public class TestReptile {

    /** File that receives the scraped link contents. */
    private static final String OUTPUT_PATH = "D:/test/test.txt";

    /** Number of listing pages to fetch (pages 1..PAGE_COUNT). */
    private static final int PAGE_COUNT = 150;

    /** Timeout passed to Jsoup for each page fetch, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 3000;

    /**
     * Fetches every listing page and writes the inner HTML of the {@code <a>}
     * elements found inside each {@code class="divtable"} element to
     * {@link #OUTPUT_PATH}. Does nothing except print a message if the output
     * file already exists.
     *
     * @param args unused
     * @throws Exception if the output file cannot be created or written
     */
    public static void main(String[] args) throws Exception {
        File file = new File(OUTPUT_PATH);
        if (file.exists()) {
            System.err.println("文件存在..");
            return;
        }

        // Make sure the target directory is there; FileWriter only creates the file itself.
        File parent = file.getParentFile();
        if (parent != null) {
            parent.mkdirs();
        }

        // try-with-resources closes the writer even when a fetch or write fails midway.
        try (FileWriter fileWriter = new FileWriter(file)) {
            for (int page = 1; page <= PAGE_COUNT; page++) {
                String pageUrl = getUrl(page);
                System.out.println(pageUrl);

                Document doc;
                try {
                    doc = Jsoup.parse(new URL(pageUrl), TIMEOUT_MILLIS);
                } catch (IOException e) {
                    // Jsoup reports fetch failures by throwing, not by returning null,
                    // so this is where a network problem on a single page shows up.
                    System.out.println("網絡異常..");
                    System.err.println(pageUrl + ": " + e);
                    continue;
                }

                // All elements whose class attribute is exactly "divtable".
                Elements divtables = doc.getElementsByAttributeValue("class", "divtable");
                for (Element d : divtables) {
                    // Inner HTML of the <a> elements inside this block.
                    String linkHtml = d.select("a").html();
                    fileWriter.write(linkHtml);
                    // Flush per block so partial progress survives an aborted run.
                    fileWriter.flush();
                }
            }
        }
    }

    /**
     * Builds the URL of one listing page.
     *
     * @param pageSize the 1-based page number to request
     * @return the absolute URL of that listing page
     */
    public static String getUrl(Integer pageSize) {
        return "http://219.238.178.49/" + "Acc_Search2.asp?Class=L&page=" + pageSize;
    }
}
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
*@Author:liangjilong
*@Date:2014-9-9
*/
public class Test2 {
private static final String regEx_html = "<[^>]+>"; // 定義HTML標籤的正則表達式
/**
* @param args
*/
public static void main(String[] args)throws Exception {
Integer pageSize=20;
getHtml(pageSize);
}
/**
* @param pageSize
* @throws IOException
*/
private static void getHtml(Integer pageSize) throws IOException {
Pattern p_html = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE);
for (int i = 1; i <= pageSize; i++) {
String url=getUrl(i);
Document doc=Jsoup.connect(url).get();
if(doc!=null){
String fileName=doc.getElementsByAttributeValue("class", "T1").html();//抓取class=T1的內容,作爲文件的名稱.
String path = "D:/test/"+fileName+i+".txt";//路徑名/i
File file = new File(path);
FileWriter fileWriter=null;
if (!file.exists()) {
file.createNewFile();// 不存在就創建一個.
String newFlie = file.getPath();
String htmlEl=doc.getElementsByAttributeValue("class", "clabel").html();
String htmlStr=p_html.matcher(htmlEl).replaceAll("").replaceAll(""", "");// 過濾html標籤
fileWriter = new FileWriter(newFlie);
fileWriter.write(htmlStr);
fileWriter.flush();
}
fileWriter.close();
}else{
System.err.println("網絡異常!");
}
}
System.out.println("抓取完成~..");
}
/**
* @param pageSize頁數.
*/
public static String getUrl(Integer pageSize){
String url="http://219.238.178.49/";
StringBuffer bufferUrl=new StringBuffer(url);
bufferUrl.append("BaseInfo.asp?Id=");
if(pageSize<=10){
if(pageSize==10){
bufferUrl.append("L000"+pageSize);
}else{
bufferUrl.append("L0000"+pageSize);
}
}else if(pageSize<=100){
if(pageSize==100){
bufferUrl.append("L00"+pageSize);
}else{
bufferUrl.append("L000"+pageSize);
}
}else if(pageSize<=1000){
if(pageSize==1000){
bufferUrl.append("L0"+pageSize);
}else{
bufferUrl.append("L00"+pageSize);
}
}
return bufferUrl.toString();
}
}