java爬蟲-2018國家統計局區劃和城鄉劃分代碼以及數據庫、json文件

國家統計局:http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/

2018分析:

 # 查看省份數據 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html

 # 查看 內蒙古 市級數據 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15.html
 
 # 查看 內蒙古 區級數據 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15/1509.html
 
 # 查看 內蒙古 街道級數據 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15/09/150902.html
 
 # 查看 內蒙古 社區居委會級數據 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15/09/02/150902003.html

發現這個是有規律的,15是內蒙古的區劃代碼,而1509是烏蘭察布市的區劃代碼,前面的http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/這一大串都是一樣的,我們就叫commonUrl。
    
規律就是:
		 
		 # 獲取省的數據 commonUrl + index.html
		 
		 # 獲取市級數據 commonUrl + 對應省級區劃代碼.html
		 
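		 # 獲取縣區級數據 commonUrl + 對應省級區劃代碼 + / + 對應市級區劃代碼.html
		 
		 # 獲取鄉鎮(街道)級數據 commonUrl + 省級區劃代碼 + / + 市級區劃代碼後兩位 + / + 對應縣區級區劃代碼.html
		 
		 # 獲取村(社區居委會)級數據 commonUrl + 省級區劃代碼 + / + 市級區劃代碼後兩位 + / + 縣區級區劃代碼後兩位 + / + 對應鄉鎮級區劃代碼.html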

詳細代碼如下:

實體類

package com.reptile.area.jsoup;

import lombok.*;

import java.util.List;

/**
 * One entry of the administrative-division tree (province, city, county,
 * town or village) together with its child entries.
 *
 * <p>Lombok generates the accessors, {@code equals}/{@code hashCode},
 * {@code toString}, a builder and both constructors. {@code @Data} already
 * implies {@code @ToString}; the explicit annotation is redundant but harmless.
 * NOTE(review): the generated {@code toString}/{@code hashCode} include
 * {@link #nodes}, so a node that ends up inside its own child list would
 * recurse without end — confirm callers never build such a cycle.
 */
@Data
@ToString
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class Node {

    /** Display name of the division as shown on the source page. */
    private String name;

    /** Division code (length depends on the level the node was parsed at). */
    private String code;
    
    /** URL of the page this entry was read from. */
    private String dataFromUrl;

    /** Child divisions one level below this one; may be {@code null} when never populated. */
    private List<Node> nodes;
}

具體實現:

package com.reptile.area.jsoup;

import java.nio.charset.Charset;
import java.util.LinkedList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import cn.hutool.core.util.CharsetUtil;
import cn.hutool.http.HttpUtil;
import cn.hutool.json.JSONUtil;
import cn.hutool.log.StaticLog;

/**
 * Crawler for the 2018 administrative-division code pages (province, city,
 * county, town, village) published under {@link #COMMON_URL}.
 *
 * <p>Each {@code parseXxx} method downloads one page, turns its table rows into
 * {@link Node}s, recurses one level down for every row and finally attaches the
 * collected children to the parent node passed in.
 *
 * @author zhang.xiaoming
 */
public class CityStats {

	private static final String COMMON_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/";

	/** Charset used to decode every downloaded page. */
	private static final Charset CHARSET = CharsetUtil.CHARSET_GBK;

	private CityStats() {
	}

	/**
	 * Crawls the whole tree starting at the province index page and logs the
	 * result as pretty-printed JSON.
	 *
	 * @param url URL of the province index page
	 */
	public static void parseProvince(String url) {
		// Province rows carry class='provincetr'; one row holds several province links.
		Elements rows = fetch(url).getElementsByClass("provincetr");
		List<Node> provinces = new LinkedList<Node>();
		for (Element row : rows) {
			// Every element with an href attribute inside the row is one province.
			for (Element link : row.getElementsByAttribute("href")) {
				String provinceName = link.text();
				String href = link.attr("href");
				// The link target starts with the two-digit province code (e.g. "15.html").
				String provinceCode = href.substring(0, 2);

				StaticLog.info("provinceName: {} , provinceCode: {} .", provinceName, provinceCode);

				Node provinceNode = Node.builder().code(provinceCode).name(provinceName).dataFromUrl(url).build();

				StaticLog.info("省級數據:  {}  ", provinceNode);

				parseCity(resolve(url, href), provinceNode);
				provinces.add(provinceNode);
			}
		}
		StaticLog.info(JSONUtil.toJsonPrettyStr(provinces));
	}

	/**
	 * Parses a city-level page and stores the cities as children of the province.
	 *
	 * @param url          URL of the province's city list page
	 * @param provinceNode parent node that receives the parsed cities
	 */
	public static void parseCity(String url, Node provinceNode) {
		Elements trs = fetch(url).getElementsByClass("citytr");
		List<Node> cities = new LinkedList<Node>();
		for (Element tr : trs) {
			Elements links = tr.getElementsByTag("a");
			// A usable row has exactly two links: code and name. Skip anything else
			// instead of failing on links.get(1), as the county/town parsers do.
			if (links.size() != 2) {
				continue;
			}
			String href = links.get(0).attr("href");
			String cityCode = links.get(0).text().substring(0, 4);
			String cityName = links.get(1).text();

			Node cityNode = Node.builder().name(cityName).code(cityCode).dataFromUrl(url).build();

			StaticLog.info("	市級數據:  {}  ", cityNode);

			parseCounty(resolve(url, href), cityNode);
			cities.add(cityNode);
		}
		provinceNode.setNodes(cities);
	}

	/**
	 * Parses a county-level page and stores the counties as children of the city.
	 *
	 * @param url      URL of the city's county list page
	 * @param cityNode parent node that receives the parsed counties
	 */
	public static void parseCounty(String url, Node cityNode) {
		Elements trs = fetch(url).getElementsByClass("countytr");
		List<Node> counties = new LinkedList<Node>();
		for (Element tr : trs) {
			Elements links = tr.getElementsByTag("a");
			// Rows without the code/name link pair have no sub-page to follow.
			if (links.size() != 2) {
				continue;
			}
			String href = links.get(0).attr("href");
			String countyCode = links.get(0).text().substring(0, 6);
			String countyName = links.get(1).text();

			Node countyNode = Node.builder().name(countyName).code(countyCode).dataFromUrl(url).build();

			StaticLog.info("		縣級數據:  {}  ", countyNode);

			parseTowntr(resolve(url, href), countyNode);
			// Collect the county itself; adding cityNode here would make the city
			// its own child (a cycle) and lose every county.
			counties.add(countyNode);
		}
		cityNode.setNodes(counties);
	}

	/**
	 * Parses a town-level page and stores the towns as children of the county.
	 *
	 * @param url        URL of the county's town list page
	 * @param countyNode parent node that receives the parsed towns
	 */
	public static void parseTowntr(String url, Node countyNode) {
		Elements trs = fetch(url).getElementsByClass("towntr");

		List<Node> towns = new LinkedList<Node>();
		for (Element tr : trs) {
			Elements links = tr.getElementsByTag("a");
			if (links.size() != 2) {
				continue;
			}
			String href = links.get(0).attr("href");
			String towntrCode = links.get(0).text().substring(0, 9);
			String towntrName = links.get(1).text();

			Node towntrNode = Node.builder().name(towntrName).code(towntrCode).dataFromUrl(url).build();

			StaticLog.info("		鄉鎮級數據:  {}  ", towntrNode);

			// Villages belong to the town just parsed, not to the enclosing county
			// (whose children are replaced by the town list below).
			parseVillagetr(resolve(url, href), towntrNode);

			towns.add(towntrNode);
		}
		countyNode.setNodes(towns);
	}

	/**
	 * Parses a village-level page and stores the villages as children of the
	 * given parent node.
	 *
	 * @param url        URL of the town's village list page
	 * @param countyNode parent node that receives the parsed villages (despite
	 *                   its name this is whatever node the caller wants the
	 *                   villages attached to; {@link #parseTowntr} passes the town)
	 */
	public static void parseVillagetr(String url, Node countyNode) {
		Elements trs = fetch(url).getElementsByClass("villagetr");

		List<Node> villages = new LinkedList<Node>();
		for (Element tr : trs) {
			Elements tds = tr.getElementsByTag("td");
			// Village rows are plain cells: first cell is the code, third the name.
			if (tds.size() != 3) {
				continue;
			}
			String villagetrCode = tds.get(0).text();
			String villagetrName = tds.get(2).text();

			Node villagetrNode = Node.builder().code(villagetrCode).name(villagetrName).dataFromUrl(url).build();
			StaticLog.info("		村級數據:  {}  ", villagetrNode);

			villages.add(villagetrNode);
		}
		countyNode.setNodes(villages);
	}

	/** Downloads {@code url} using {@link #CHARSET} and parses it into a document. */
	private static Document fetch(String url) {
		return Jsoup.parse(HttpUtil.get(url, CHARSET));
	}

	/**
	 * Resolves a relative link against the directory of the page it was found on,
	 * e.g. {@code .../2018/15/1509.html} + {@code 09/150902.html} gives
	 * {@code .../2018/15/09/150902.html}.
	 */
	private static String resolve(String pageUrl, String href) {
		return pageUrl.substring(0, pageUrl.lastIndexOf('/') + 1) + href;
	}

	public static void main(String[] args) {
		/*
		 * Sample pages (Inner Mongolia, province code 15):
		 *
		 * provinces:  http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html
		 * cities:     http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15.html
		 * counties:   http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15/1509.html
		 * towns:      http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15/09/150902.html
		 * villages:   http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15/09/02/150902003.html
		 *
		 * The URLs follow a pattern: 15 is the code of Inner Mongolia, 1509 the code
		 * of Ulanqab, and the common prefix is COMMON_URL.
		 *
		 * provinces: COMMON_URL + index.html
		 * cities:    COMMON_URL + <province code>.html
		 * counties:  COMMON_URL + <province code> + / + <city code>.html
		 */

		String provinceUrl = COMMON_URL + "index.html";
		CityStats.parseProvince(provinceUrl);

		String cityUrl = COMMON_URL + "15.html";
		CityStats.parseCity(cityUrl, new Node());

		String countyUrl = COMMON_URL + "15/1509.html";
		CityStats.parseCounty(countyUrl, new Node());

		String towntrUrl = COMMON_URL + "15/09/150981.html";
		CityStats.parseTowntr(towntrUrl, new Node());
	}
}

####################################################################################################

MySQL 數據(SQL 腳本)及 JSON 文件輸出的代碼實現(裝飾器模式):

package com.reptile.area.decorator;

import java.nio.charset.Charset;
import java.util.LinkedList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.reptile.area.jsoup.Node;

import cn.hutool.core.util.CharsetUtil;
import cn.hutool.http.HttpUtil;
import cn.hutool.log.StaticLog;

/**
 * *省市縣解析器
 */
public class CityParser implements ICityParser {

	private static final String COMMON_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/";

	private static final Charset CHARSET = CharsetUtil.CHARSET_GBK;

	public List<Node> parseProvinces(String url) {
		return parseProvince(COMMON_URL + "index.html");
	}

	private List<Node> parseProvince(String url) {

		String htmlStr = HttpUtil.get(url, CHARSET);
		Document document = Jsoup.parse(htmlStr);

		// 獲取 class='provincetr' 的元素
		Elements elements = document.getElementsByClass("provincetr");
		List<Node> provinces = new LinkedList<Node>();
		for (Element element : elements) {
			// 獲取 elements 下屬性是 href 的元素
			Elements links = element.getElementsByAttribute("href");
			for (Element link : links) {
				String provinceName = link.text();
				String href = link.attr("href");
				String provinceCode = href.substring(0, 2);

				Node provinceNode = Node.builder().code(provinceCode).name(provinceName).dataFromUrl(url)
						.nodes(parseCity(COMMON_URL + href)).build();

				StaticLog.info("省級數據:  {}  ", provinceNode);

				provinces.add(provinceNode);
			}
		}
		return provinces;
	}

	private List<Node> parseCity(String url) {
		String htmlStr = HttpUtil.get(url, CHARSET);
		Document document = Jsoup.parse(htmlStr);
		Elements trs = document.getElementsByClass("citytr");

		List<Node> cities = new LinkedList<Node>();
		for (Element tr : trs) {
			Elements links = tr.getElementsByTag("a");
			String href = links.get(0).attr("href");
			String cityCode = links.get(0).text().substring(0, 4);
			String cityName = links.get(1).text();

			Node cityNode = Node.builder().name(cityName).code(cityCode).dataFromUrl(url)
					.nodes(parseCounty(COMMON_URL + href)).build();

			StaticLog.info("	市級數據:  {}  ", cityNode);

			cities.add(cityNode);
		}
		return cities;
	}

	private List<Node> parseCounty(String url) {
		String htmlStr = HttpUtil.get(url, CHARSET);
		Document document = Jsoup.parse(htmlStr);
		Elements trs = document.getElementsByClass("countytr");

		List<Node> counties = new LinkedList<Node>();
		for (Element tr : trs) {
			Elements links = tr.getElementsByTag("a");
			if (links == null || links.size() != 2) {
				continue;
			}
			String href = links.get(0).attr("href");
			String countyCode = links.get(0).text().substring(0, 6);
			String countyName = links.get(1).text();

			Node countyNode = Node.builder().code(countyCode).name(countyName).dataFromUrl(url)
					.nodes(parseTowntr(COMMON_URL + href.subSequence(2, 5).toString() + "/" + href)).build();
			StaticLog.info("		縣級數據:  {}  ", countyNode);

			counties.add(countyNode);
		}
		return counties;
	}

	public List<Node> parseTowntr(String url) {
		String htmlStr = HttpUtil.get(url, CHARSET);
		Document document = Jsoup.parse(htmlStr);
		Elements trs = document.getElementsByClass("towntr");

		List<Node> counties = new LinkedList<Node>();
		for (Element tr : trs) {
			Elements links = tr.getElementsByTag("a");
			if (links == null || links.size() != 2) {
				continue;
			}
			String href = links.get(0).attr("href");
			String towntrCode = links.get(0).text().substring(0, 6);
			String towntrName = links.get(1).text();

			Node towntrNode = Node.builder().name(towntrName).code(towntrCode).dataFromUrl(url)
					.nodes(parseVillagetr(
							COMMON_URL + href.subSequence(2, 5).toString() + "/" + href.substring(5, 7) + "/" + href))
					.build();

			StaticLog.info("			鄉鎮級數據:  {}  ", towntrNode);

			counties.add(towntrNode);
		}
		return counties;
	}

	public List<Node> parseVillagetr(String url) {
		String htmlStr = HttpUtil.get(url, CHARSET);
		Document document = Jsoup.parse(htmlStr);
		Elements trs = document.getElementsByClass("villagetr");

		List<Node> counties = new LinkedList<Node>();
		for (Element tr : trs) {
			Elements tds = tr.getElementsByTag("td");
			if (tds == null || tds.size() != 3) {
				continue;
			}
			String villagetrCode = tds.get(0).text();
			String villagetrName = tds.get(2).text();
			
			Node villagetrNode = Node.builder().code(villagetrCode).name(villagetrName).dataFromUrl(url).build();
			StaticLog.info("				村級數據:  {}  ", villagetrNode);
			
			counties.add(villagetrNode);
		}
		return counties;
	}

}
package com.reptile.area.decorator;

import java.util.List;

import com.reptile.area.jsoup.Node;

public class CityParserDecorator implements ICityParser {

	private ICityParser cityParser;

	public CityParserDecorator(ICityParser cityParser) {
		this.cityParser = cityParser;
	}

	public List<Node> parseProvinces(String url) {
		return this.cityParser.parseProvinces(url);
	}
}
package com.reptile.area.decorator;


import java.util.List;

import com.reptile.area.jsoup.Node;

public interface ICityParser {

    /**
     * Parses the administrative-division data (provinces with their nested
     * cities, counties and lower levels).
     *
     * @param url request URL to start from
     * @return the parsed province nodes
     */
    List<Node> parseProvinces(String url);
}
package com.reptile.area.decorator;

import java.util.List;

import com.reptile.area.jsoup.Node;

import cn.hutool.core.io.FileUtil;
import cn.hutool.core.io.file.FileWriter;
import cn.hutool.json.JSONUtil;

/**
 * Decorator that serialises the parsed provinces to JSON and writes the result
 * to a file before handing the list back to the caller.
 */
public class JsonCityParserDecorator extends CityParserDecorator{

    /** Output file used when no explicit path is supplied. */
    private static final String DEFAULT_OUTPUT_PATH = "F://area.json";

    /** File the JSON document is written to. */
    private final String outputPath;

    /**
     * Creates a decorator writing to the default file {@code F://area.json}.
     *
     * @param cityParser parser to delegate to
     */
    public JsonCityParserDecorator(ICityParser cityParser) {
        this(cityParser, DEFAULT_OUTPUT_PATH);
    }

    /**
     * Creates a decorator writing to a caller-chosen file.
     *
     * @param cityParser parser to delegate to
     * @param outputPath path of the JSON file; {@code null} selects the default
     */
    public JsonCityParserDecorator(ICityParser cityParser, String outputPath) {
        super(cityParser);
        this.outputPath = outputPath == null ? DEFAULT_OUTPUT_PATH : outputPath;
    }

    /**
     * Runs the wrapped parser, then writes its result as JSON to the output file.
     *
     * @param url request URL passed through to the wrapped parser
     * @return the provinces returned by the wrapped parser, unchanged
     */
    @Override
    public List<Node> parseProvinces(String url) {
        List<Node> provinces = super.parseProvinces(url);
        String jsonStr = JSONUtil.toJsonStr(provinces);
        // touch() creates the file (and missing parent directories) if needed.
        FileWriter fileWriter = new FileWriter(FileUtil.touch(outputPath));
        fileWriter.write(jsonStr);
        return provinces;
    }
}
package com.reptile.area.decorator;

import java.util.List;

import com.reptile.area.jsoup.Node;

import cn.hutool.log.StaticLog;

/**
 * Decorator standing in for a latitude/longitude lookup step. As written it
 * only logs a message after the wrapped parser has finished; the parsed data
 * is returned untouched.
 */
public class LocationCityParserDecorator extends CityParserDecorator {

	/**
	 * @param cityParser parser to delegate to
	 */
	public LocationCityParserDecorator(ICityParser cityParser) {
		super(cityParser);
	}

	/**
	 * Delegates to the wrapped parser and logs once it has returned.
	 *
	 * @param url request URL passed through to the wrapped parser
	 * @return the provinces returned by the wrapped parser, unchanged
	 */
	@Override
	public List<Node> parseProvinces(String url) {
		final List<Node> parsed = super.parseProvinces(url);
		StaticLog.info("查詢出經緯度了. . . ");
		return parsed;
	}
}
package com.reptile.area.decorator;

import java.util.ArrayList;
import java.util.List;

import com.github.stuxuhai.jpinyin.PinyinException;
import com.github.stuxuhai.jpinyin.PinyinFormat;
import com.github.stuxuhai.jpinyin.PinyinHelper;
import com.reptile.area.jsoup.Node;

import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.io.file.FileWriter;
import cn.hutool.core.text.StrFormatter;
import cn.hutool.log.StaticLog;

/**
 * Decorator that turns the parsed area tree into {@code insert} statements for
 * the {@code area} table and writes them, one per line, to a SQL file.
 */
public class SqlCityParserDecorator extends CityParserDecorator {

	private static final String SQL = "insert into area(`name`, `code`, full_spell, easy_spell, initial, parent_code, depth, data_from_url) values ('{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}');";

	/** Output file used when no explicit path is supplied. */
	private static final String DEFAULT_OUTPUT_PATH = "F://area.sql";

	/** Deepest level written: province=1, city=2, county=3, town=4, village=5. */
	private static final int MAX_DEPTH = 5;

	/** File the statements are written to. */
	private final String outputPath;

	/**
	 * Creates a decorator writing to the default file {@code F://area.sql}.
	 *
	 * @param cityParser parser to delegate to
	 */
	public SqlCityParserDecorator(ICityParser cityParser) {
		this(cityParser, DEFAULT_OUTPUT_PATH);
	}

	/**
	 * Creates a decorator writing to a caller-chosen file.
	 *
	 * @param cityParser parser to delegate to
	 * @param outputPath path of the SQL file; {@code null} selects the default
	 */
	public SqlCityParserDecorator(ICityParser cityParser, String outputPath) {
		super(cityParser);
		this.outputPath = outputPath == null ? DEFAULT_OUTPUT_PATH : outputPath;
	}

	/**
	 * Runs the wrapped parser, then writes one insert statement per parsed node.
	 * Nothing is written when no statement could be generated.
	 *
	 * @param url request URL passed through to the wrapped parser
	 * @return the provinces returned by the wrapped parser, unchanged
	 */
	@Override
	public List<Node> parseProvinces(String url) {
		List<Node> provinces = super.parseProvinces(url);

		List<String> buildSql = buildSql(provinces);
		if (CollUtil.isNotEmpty(buildSql)) {
			// Write the generated statements to the SQL file.
			FileWriter fileWriter = new FileWriter(FileUtil.touch(outputPath));
			fileWriter.writeLines(buildSql);
		}
		return provinces;
	}

	/**
	 * Converts the node tree into insert statements, parents before children.
	 *
	 * @param provinces top-level nodes; may be {@code null} or empty
	 * @return the statements in tree order; empty (never {@code null}) when
	 *         there is nothing to write
	 */
	private List<String> buildSql(List<Node> provinces) {
		List<String> sqls = new ArrayList<>();
		// Provinces have no parent, hence the empty parent code.
		appendSql(sqls, provinces, "", 1);
		return sqls;
	}

	/**
	 * Appends the statements for {@code nodes} and, recursively, their children.
	 *
	 * @param sqls       collector for the generated statements
	 * @param nodes      nodes of one level; may be {@code null} or empty
	 * @param parentCode code of the node that owns {@code nodes}
	 * @param depth      level of {@code nodes}, starting at 1 for provinces
	 */
	private void appendSql(List<String> sqls, List<Node> nodes, String parentCode, int depth) {
		if (depth > MAX_DEPTH || CollUtil.isEmpty(nodes)) {
			return;
		}
		for (Node node : nodes) {
			String sql = initSql(node.getName(), node.getCode(), node.getDataFromUrl(), parentCode, depth);
			// initSql yields null when the pinyin conversion failed; skip that row
			// rather than emitting a null line into the script.
			if (sql != null) {
				sqls.add(sql);
			}
			appendSql(sqls, node.getNodes(), node.getCode(), depth + 1);
		}
	}

	/**
	 * Builds the insert statement for a single node.
	 *
	 * @return the statement, or {@code null} when the name could not be
	 *         converted to pinyin (the failure is logged)
	 */
	private String initSql(String name, String code, String dataFromUrl, String parentCode, Integer depth) {
		String insertSql = null;
		try {
			String fullSpell = PinyinHelper.convertToPinyinString(name, "", PinyinFormat.WITHOUT_TONE);
			String easySpell = PinyinHelper.getShortPinyin(name);
			// Guard against an empty short form so substring cannot throw.
			String initial = easySpell.isEmpty() ? "" : easySpell.substring(0, 1);
			insertSql = StrFormatter.format(SQL, escape(name), escape(code), escape(fullSpell), escape(easySpell),
					escape(initial), escape(parentCode), depth, escape(dataFromUrl));
			StaticLog.info(insertSql);
		} catch (PinyinException e) {
			StaticLog.error("拼音解析失敗:{} .", e.getMessage());
		}
		return insertSql;
	}

	/**
	 * Escapes a value for use inside a single-quoted SQL string literal by
	 * doubling backslashes and single quotes. {@code null} is passed through.
	 */
	private static String escape(String value) {
		if (value == null) {
			return null;
		}
		return value.replace("\\", "\\\\").replace("'", "''");
	}
}

數據庫表:

-- ----------------------------
-- Table structure for area
-- One row per administrative division; rows are linked into a tree through
-- parent_code, and depth tells which level a row belongs to.
-- NOTE: the DROP below discards any existing area table and its data.
-- ----------------------------
DROP TABLE IF EXISTS `area`;
CREATE TABLE `area` (
  -- division code; primary key, so every level must produce distinct codes
  `code` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '' COMMENT '城市代碼',
  -- division name
  `name` varchar(64) COLLATE utf8mb4_unicode_ci DEFAULT '' COMMENT '城市名稱',
  -- full pinyin of the name
  `full_spell` varchar(256) COLLATE utf8mb4_unicode_ci DEFAULT '' COMMENT '全拼,北京全拼爲beijing',
  -- pinyin initials of the name
  `easy_spell` varchar(128) COLLATE utf8mb4_unicode_ci DEFAULT '' COMMENT '簡拼,北京簡拼爲bj',
  -- first letter of the pinyin
  `initial` char(8) COLLATE utf8mb4_unicode_ci DEFAULT '' COMMENT '首字母,北京首字母爲b',
  -- code of the parent division; empty for provinces
  `parent_code` varchar(32) COLLATE utf8mb4_unicode_ci DEFAULT '' COMMENT '父級城市代碼',
  -- level: province=1, city=2, county=3, town=4, village=5
  `depth` char(1) COLLATE utf8mb4_unicode_ci DEFAULT '0' COMMENT '等級:省=1,市=2,縣區=3,鄉鎮=4,村=5  ',
  -- URL of the page the row was read from
  `data_from_url` varchar(128) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '數據來源地址',
  PRIMARY KEY (`code`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='省市縣(區)表';

-- ----------------------------

 

#######################################-------完結----------###############################################

測試代碼

package com.reptile.area;

import java.util.List;

import com.reptile.area.decorator.CityParser;
import com.reptile.area.decorator.ICityParser;
import com.reptile.area.decorator.JsonCityParserDecorator;
import com.reptile.area.decorator.LocationCityParserDecorator;
import com.reptile.area.decorator.SqlCityParserDecorator;
import com.reptile.area.jsoup.Node;

import cn.hutool.core.date.DateUtil;
import cn.hutool.core.date.TimeInterval;
import cn.hutool.log.StaticLog;

/**
 * Manual driver: runs the full decorator chain once and logs how long it took.
 */
public class CityParserTest {
	/** Province index page handed to the parser chain. */
	private static final String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html";

	/**
	 * Times one complete crawl and logs the elapsed milliseconds and minutes.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		TimeInterval stopwatch = DateUtil.timer();

		// The actual work: crawl, then emit SQL and JSON.
		cityParserDecorator();

		long elapsedMillis = stopwatch.interval();
		long elapsedMinutes = stopwatch.intervalMinute();
		StaticLog.info("本次程序執行 花費毫秒數: {} ,   花費分鐘數:{} . ", elapsedMillis, elapsedMinutes);
	}

	/**
	 * Builds the decorator chain and runs it. Each decorator calls the one it
	 * wraps first, so the effective order is: crawl, location step, SQL file,
	 * JSON file.
	 *
	 * @return the parsed provinces
	 */
	private static List<Node> cityParserDecorator() {
		ICityParser chain = new JsonCityParserDecorator(
				new SqlCityParserDecorator(
						new LocationCityParserDecorator(
								new CityParser())));

		return chain.parseProvinces(url);
	}
}

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章