因为需要详细的地址数据信息,所以需要爬取国家统计局的地址数据:
1、抓取的URL地址:
http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/index.html
2、Java代码(Servlet):
package zzz;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import config.Common;
@SuppressWarnings("serial")
public class GetAddress extends HttpServlet {
/**
* 递归便利获取地区的信息
* @author yuyu
*/
public String basicUrl="http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/";
public String json=null;
public String status="";
public void doGet(HttpServletRequest request, HttpServletResponse response)
throws ServletException, IOException {
try{
response.setHeader("Content-Type", "application/xml; charset=UTF-8");//编码
response.setHeader("Access-Control-Allow-Origin", "*");//跨域问题
String url="http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/index.html";
String data=Common.sendGet(url);
json="";
Pattern pattern = Pattern.compile("\\d+\\.html'>(\\D+)</a>");
Matcher matcher =pattern.matcher(data);
//对应的省市地区的id
int Id=Integer.parseInt(request.getParameter("id"))-1;
if(Id>30||Id<0){
throw new Exception("id错误,应取1-31");
}
//便利省市地区
int i=0;
while (matcher.find()) {
if(i==Id){
String info=matcher.group();
String aUrl=basicUrl+info.replaceAll(".>.*", "");
String aData=info.replaceAll("\\w|\\.|<|>|/|'", "");
System.out.println(aUrl);
System.out.println(aData);
json+="{\""+aData+"\":["+getInfo(aUrl)+"]}";
}
i++;
}
//将跑完的数据保存到txt文件
Common.contentToTxt("C:/Users/yuyu/Desktop/"+Id+".txt",json.replaceAll(",]", "]"));
response.setContentType("text/html");
request.setCharacterEncoding("UTF-8");//乱码问题
PrintWriter out = response.getWriter();
out.println("已写入C:/Users/yuyu/Desktop/"+Id+".txt");
out.flush();
out.close();
}catch(Exception e){
//发生错误的时候输出错误
e.printStackTrace();
response.setContentType("text/html");
request.setCharacterEncoding("UTF-8");//乱码问题
PrintWriter out = response.getWriter();
out.println("错误:"+e.getMessage());
out.flush();
out.close();
}
}
/**
* 根据url获取对应的页面信息
* @param url
* @return
* @throws Exception
*/
public String getInfo(String url) throws Exception{
String json="";
String data=Common.sendGet(url);
//请求出错的我时候
int y=0;
while("".equals(data)||null==data){
if(y==10){
break;
}
data=Common.sendGet(url);
y++;
}
if("".equals(data)||null==data){
throw new Exception("未请求到数据");
}
//取得对应区域的数据
Pattern pattern = Pattern.compile("<tr class='[a-z]*'>.+?</tr>");
Matcher matcher =pattern.matcher(data);
int x=0;
while (matcher.find()) {
if(x==0){
x++;
continue;
}
String info=matcher.group();
//获得正确的url
String status=url.replaceAll("\\d+\\.html", "");
String shh=getDataByRegex(info,"\\d+/\\d+.html");
//匹配到url
String aUrl=status+shh;
//匹配Id
String aId=getDataByRegex(info,"\\d{12}");
//匹配中文
String aData=getDataByRegex(info,"[\u4e00-\u9fa5]+");
//打印匹配信息
// System.out.println(aUrl);
// System.out.println(aId);
// System.out.println(aData);
//添加匹配带的信息
if("".equals(shh)){
json+="{\"id\":\""+aId+"\",\"name\":\""+aData+"\"},";
}else{
json+="{\"id\":\""+aId+"\",\"name\":\""+aData+"\",\"children\":["+getInfo(aUrl)+"]},";
}
}
return json;
}
/**
* 执行正则
* @param data
* @param regex
* @return
*/
public String getDataByRegex(String data,String regex){
try{
Pattern pattern = Pattern.compile(regex);
Matcher matcher =pattern.matcher(data);
matcher.find();
return matcher.group();
}catch(Exception e){
return "";
}
}
}
3、需要使用的(Common)工具类
/**
 * Opens a connection to the given URL and returns the response body.
 *
 * <p>The lines of the response are concatenated without line separators, so
 * the result is a single line of text. The body is decoded with the charset
 * declared in the response's content type when there is one, otherwise with
 * the platform default charset. Failures are printed and not propagated: the
 * method then returns whatever was read so far (an empty string if nothing
 * was read).
 *
 * @param url the URL to request
 * @return the response body with line breaks removed; never {@code null}
 */
public static String sendGet(String url) {
    StringBuilder result = new StringBuilder();
    BufferedReader in = null;
    try {
        URLConnection connection = new URL(url).openConnection();
        connection.setRequestProperty("Content-Type", "application/json; charset=utf-8");
        // Establish the actual connection.
        connection.connect();
        in = new BufferedReader(new InputStreamReader(
                connection.getInputStream(),
                charsetFromContentType(connection.getContentType())));
        String line;
        while ((line = in.readLine()) != null) {
            result.append(line);
        }
    } catch (Exception e) {
        System.out.println("发送GET请求出现异常!" + e);
        e.printStackTrace();
    } finally {
        // Always release the stream, even when reading failed part-way.
        try {
            if (in != null) {
                in.close();
            }
        } catch (Exception e2) {
            e2.printStackTrace();
        }
    }
    return result.toString();
}

/**
 * Extracts the charset from a content-type value such as
 * {@code text/html; charset=gb2312}.
 *
 * @param contentType content-type value, possibly {@code null}
 * @return the declared charset, or the platform default when none is
 *         declared or the declared name is unknown or malformed
 */
private static java.nio.charset.Charset charsetFromContentType(String contentType) {
    if (contentType != null) {
        String prefix = "charset=";
        String[] parts = contentType.split(";");
        for (int i = 0; i < parts.length; i++) {
            String part = parts[i].trim();
            if (part.regionMatches(true, 0, prefix, 0, prefix.length())) {
                String name = part.substring(prefix.length()).trim().replace("\"", "");
                try {
                    return java.nio.charset.Charset.forName(name);
                } catch (IllegalArgumentException ignored) {
                    // Unknown or malformed charset name: fall back to the default below.
                }
            }
        }
    }
    return java.nio.charset.Charset.defaultCharset();
}
/**
 * Appends text to a file, creating the file first when it does not exist.
 *
 * <p>The existing content is read line by line, every existing line is
 * terminated with {@code "\n"}, {@code content} is added after it and the
 * whole text is written back. The existing content is also echoed to standard
 * output. Files are read and written with the platform default charset.
 * Failures are printed and not propagated to the caller.
 *
 * @param filePath path of the file to append to
 * @param content  text to add after the existing content
 */
public static void contentToTxt(String filePath, String content) {
    try {
        File f = new File(filePath);
        if (f.exists()) {
            System.out.print("文件存在");
        } else {
            System.out.print("文件不存在");
            f.createNewFile(); // create the file when it is missing
        }

        StringBuilder merged = new StringBuilder();
        BufferedReader input = new BufferedReader(new FileReader(f));
        try {
            String line;
            while ((line = input.readLine()) != null) {
                merged.append(line).append("\n");
            }
            System.out.println(merged.toString());
        } finally {
            // Release the reader even if reading fails, and before the file is truncated.
            input.close();
        }

        merged.append(content);
        BufferedWriter output = new BufferedWriter(new FileWriter(f));
        try {
            output.write(merged.toString());
        } finally {
            // Closing flushes the buffer; done in finally so the handle never leaks.
            output.close();
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
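4、抓取的数据结构
每次请求抓取一个省份,结果追加写入对应的txt文件,格式为嵌套的JSON(最末一级没有children字段):
{"省份名称":[{"id":"12位区划代码","name":"名称","children":[{"id":"...","name":"..."}]}]}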
5、文件下载
http://download.csdn.net/download/weixin_36751895/9820068