前言:
參考文章:
https://www.cnblogs.com/yangzhilong/p/3530700.html
https://www.cnblogs.com/liushaofeng89/p/4873086.html
最近因爲用戶反饋省份數據表單有部分缺失,在百度上搜索了一圈之後,決定還是自己拉取一份。省份數據來源於國家統計局,筆者拉取的是2019年的數據(拉取日期:2020-02-25)。
省份數據來源:國家統計局
http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/
筆者使用的是 Java 的 HTML 解析庫 jsoup,關於 jsoup 的用法,可參考這篇文章:https://www.open-open.com/jsoup/
開始
1.準備一張表 region_directory
-- Region table: one row per region; `pid` stores the id of the parent row.
-- NOTE(review): the column is called `name_CN` but its COMMENT describes an
-- English name -- confirm which one is intended before relying on it.
CREATE TABLE `region_directory` (
  `id`          INT(32)      NOT NULL AUTO_INCREMENT,
  `pid`         INT(32)      DEFAULT NULL COMMENT '父級ID',
  `name`        VARCHAR(64)  DEFAULT NULL COMMENT '地域名稱',
  `name_CN`     VARCHAR(64)  DEFAULT NULL COMMENT '地域英文名',
  -- Both timestamps are filled in by the database; update_time refreshes on every UPDATE.
  `create_time` TIMESTAMP    NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '創建時間',
  `update_time` TIMESTAMP    NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '修改時間',
  `create_user` VARCHAR(255) DEFAULT NULL COMMENT '創建人',
  `update_user` VARCHAR(255) DEFAULT NULL COMMENT '修改人',
  `is_open`     CHAR(2)      DEFAULT NULL COMMENT '是否開啓 (0代表未開啓 1代表開啓)',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=2421 DEFAULT CHARSET=utf8 COMMENT='地域表';
2.需要在pom文件中引入jsoup 的jar 包。
官方上現在有更高版本,我這邊使用的是目前使用人數比較多的版本。
<!-- jsoup: HTML parser used by the crawler (https://jsoup.org/) -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>
3.拉取數據的代碼主要在getRegionDirectory 這個接口中。
4.需要注意的一點是:下圖中的 name 變量代表全國省份一級的名稱。我在這裏加了一個判斷,只先拉取北京市的數據,原因是整體數據量比較大,如果一次性拉取過多,連接會報 502。現在很多網站都會針對這類高頻請求做防惡意攻擊的限制,因此需要分批拉取,這裏需要注意。
4.1 這就是上述圖片中描述的502報錯
5.接下來就可以在瀏覽器上訪問拉取數據的接口:
控制檯打印一下 獲取的數據:
存到數據庫中的數據:
6.文章中涉及的所有代碼
RegionDirectoryController
package com.bos.controller.basic;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.bos.data.model.RegionDirectoryModel;
import com.bos.data.model.vo.basic.RegionVo;
import com.bos.data.repositories.jpa.setting.RegionDirectoryJPARepository;
import com.google.gson.Gson;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.transaction.NoTransactionException;
import org.springframework.transaction.interceptor.TransactionAspectSupport;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * REST controller that crawls the administrative-division pages starting at
 * {@link #webUrl} with jsoup and stores the province / city / county names
 * through {@link RegionDirectoryJPARepository}.
 *
 * @Author tanghh
 * @Date 2020/6/23 10:37
 */
@RestController
@RequestMapping(value = "/region")
public class RegionDirectoryController {
    private static final Logger logger = LoggerFactory.getLogger(RegionDirectoryController.class);

    @Autowired
    private RegionDirectoryJPARepository regionDirectoryJPARepository;

    /** Row CSS classes, ordered from the top level (province) down to the lowest (village). */
    private static final List<String> types = new ArrayList<>();
    /** Cities whose child page lists towns directly, skipping the county level. */
    private static final List<String> specialCitys = new ArrayList<>();

    /** CSS class of province rows. */
    public static final String LEVEL_PROVINCE = "provincetr";
    /** CSS class of city rows. */
    public static final String LEVEL_CITY = "citytr";
    /** CSS class of county / district rows. */
    public static final String LEVEL_COUNTY = "countytr";
    /** CSS class of town / sub-district rows. */
    public static final String LEVEL_TOWN = "towntr";
    /** CSS class of village / neighbourhood-committee rows. */
    public static final String LEVEL_VILLAGE = "villagetr";

    /** Depth is measured by the position of the row class in {@link #types}. */
    public static final int LEVEL_MODE_STRING = 1;
    /** Depth is measured by the numeric {@code level} stored on each {@link RegionVo}. */
    public static final int LEVEL_MODE_NUMBER = 2;

    /** Charset used to decode the crawled pages. */
    public static final String CHARSET = "GBK";

    /**
     * Only this province is crawled per request: pulling too much in one go made the
     * remote site answer HTTP 502.
     */
    private static final String ONLY_PROVINCE = "北京市";

    /** Indentation is only produced for levels 1..5; anything else prints unindented. */
    private static final int MAX_INDENT_LEVEL = 5;

    static {
        types.add(LEVEL_PROVINCE);
        types.add(LEVEL_CITY);
        types.add(LEVEL_COUNTY);
        types.add(LEVEL_TOWN);
        types.add(LEVEL_VILLAGE);
    }

    /*
     * These cities are LEVEL_CITY entries whose next level is LEVEL_TOWN instead of
     * LEVEL_COUNTY. The list is maintained by hand: add a city here whenever one with
     * this layout is discovered.
     */
    static {
        specialCitys.add("東莞市");
        // The crawled pages are presumably written in Simplified Chinese, so the
        // Traditional spelling above would never match; keep both spellings.
        specialCitys.add("东莞市");
        specialCitys.add("中山市");
        specialCitys.add("儋州市");
    }

    //************************** adjust the values below as needed **************************
    /** Index page the crawl starts from. */
    public static final String webUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html";
    /** File the JSON dump is written to by {@link #saveJson(RegionVo)}. */
    public static final String savePath = "C:/project/latestEbo/ebo-web/ebo/src/main/resources/china.json";
    /** Crawl scope: the whole country ("中國") or the name of one first-level region. */
    public static final String AREA = "中國";
    /** Deepest level to crawl; recursion continues while the current level is below it. */
    public static int TARGET_LEVEL = 3;
    /** How the current level is measured: {@link #LEVEL_MODE_STRING} or {@link #LEVEL_MODE_NUMBER}. */
    public static int LEVEL_MODE = LEVEL_MODE_NUMBER;
    //************************** adjust the values above as needed **************************

    /**
     * Crawls the region tree, prints it as JSON and stores the first three levels
     * (province, city, county) in the database.
     *
     * <p>Failures are logged, not rethrown. If a transaction is active it is marked
     * rollback-only; otherwise only a warning is logged.
     */
    @GetMapping(value = "/getRegionDirectory")
    public void getRegionDirectory() {
        try {
            System.out.println("開始抓取,請耐心等待!!!");
            System.out.println("抓取範圍:" + AREA + ",抓取模式(1--字符 2--數字):" + LEVEL_MODE + ",抓取層級:" + TARGET_LEVEL + "(模式爲字符:1--province,2--city,3--county,4--town,5--village;)");
            long starttime = System.currentTimeMillis();

            RegionVo region = new RegionVo("000000000000", "中國", 0);
            region.child = new ArrayList<>();
            Document doc = getDocument(webUrl);
            Elements provinceRows = doc.getElementsByClass(LEVEL_PROVINCE);
            for (Element row : provinceRows) {
                for (Element link : row.getElementsByTag("a")) {
                    // "abs:href" resolves the link against the page URL.
                    String nextUrl = link.attr("abs:href");
                    String[] arr = nextUrl.split("/");
                    // The file name without extension is the 2-digit province code; pad it to 12 digits.
                    String code = arr[arr.length - 1].split("\\.")[0] + "0000000000";
                    String name = link.text();
                    if (!name.equals(ONLY_PROVINCE)) {
                        continue;
                    }
                    if (!(AREA.equals("中國") || AREA.equals(name))) {
                        continue;
                    }
                    System.out.println(name);
                    RegionVo child = new RegionVo(code, name, 1);
                    region.child.add(child);
                    int currentlevel = LEVEL_MODE == LEVEL_MODE_STRING ? getLevel(LEVEL_PROVINCE) : child.level;
                    // Descend only while the target depth has not been reached.
                    if (currentlevel < TARGET_LEVEL) {
                        parseNext(types.get(1), nextUrl, child);
                    }
                }
            }

            System.out.println(new Gson().toJson(region));
            persistRegionTree(region);

            long endtime = System.currentTimeMillis();
            System.out.println("抓取完畢!!!耗時:" + (endtime - starttime) / 1000 / 60 + "min");
        } catch (Exception e) {
            logger.error("獲取省份數據失敗", e);
            markRollbackOnlyIfInTransaction();
        }
    }

    /**
     * Stores the provinces, cities and counties below {@code root}, linking each row to
     * its parent through the generated id. Nodes without children are simply skipped.
     */
    private void persistRegionTree(RegionVo root) {
        for (RegionVo province : childrenOf(root)) {
            RegionDirectoryModel provinceModel = regionDirectoryJPARepository.saveAndFlush(
                    new RegionDirectoryModel(0, province.name, "湯輝紅", "湯輝紅"));
            for (RegionVo city : childrenOf(province)) {
                RegionDirectoryModel cityModel = regionDirectoryJPARepository.saveAndFlush(
                        new RegionDirectoryModel(provinceModel.getId(), city.name, "湯輝紅", "湯輝紅"));
                for (RegionVo county : childrenOf(city)) {
                    regionDirectoryJPARepository.save(
                            new RegionDirectoryModel(cityModel.getId(), county.name, "湯輝紅", "湯輝紅"));
                }
            }
        }
    }

    /** Returns the children of {@code region}, or an empty list when none were crawled. */
    private static List<RegionVo> childrenOf(RegionVo region) {
        return region.child == null ? Collections.<RegionVo>emptyList() : region.child;
    }

    /**
     * Marks the current transaction rollback-only. When no transaction is bound to the
     * call, the lookup throws; that is logged as a warning so it cannot mask the
     * failure that was just logged.
     */
    private void markRollbackOnlyIfInTransaction() {
        try {
            TransactionAspectSupport.currentTransactionStatus().setRollbackOnly();
        } catch (NoTransactionException noTx) {
            logger.warn("No active transaction to mark rollback-only; rows saved so far are not rolled back");
        }
    }

    /**
     * Downloads {@code url} and parses it with {@link #CHARSET}.
     *
     * @param url absolute page URL, also used as the base URI for relative links
     * @return the parsed document
     * @throws IOException if the page cannot be read
     */
    private static Document getDocument(String url) throws IOException {
        // Close the stream ourselves so a failed or finished parse never leaks the connection.
        try (InputStream in = new URL(url).openStream()) {
            return Jsoup.parse(in, CHARSET, url);
        }
    }

    /**
     * @param type one of the LEVEL_ row classes
     * @return the 1-based position of {@code type} in {@link #types}, or 0 if unknown
     */
    private static int getLevel(String type) {
        return types.indexOf(type) + 1;
    }

    /**
     * Writes {@code region} as UTF-8 JSON to {@link #savePath}.
     *
     * @throws IOException if the file cannot be written
     */
    private static void saveJson(RegionVo region) throws IOException {
        try (BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(new File(savePath)), StandardCharsets.UTF_8))) {
            bw.write(new Gson().toJson(region));
        }
    }

    /**
     * Parses the next level below {@code region} and recurses until {@link #TARGET_LEVEL}.
     *
     * @param type   row class to read on the page, one of the LEVEL_ constants
     * @param url    page to crawl
     * @param region parent node; its {@code child} list is replaced with the rows found
     * @throws Exception if a page cannot be downloaded or has an unexpected layout
     */
    public static void parseNext(String type, String url, RegionVo region) throws Exception {
        region.child = new ArrayList<>();
        Document doc = getDocument(url);
        Elements rows = doc.getElementsByClass(type);

        if (LEVEL_VILLAGE.equals(type)) {
            // Village rows hold three plain cells: code, classification code, name.
            for (Element row : rows) {
                Elements tds = row.getElementsByTag("td");
                String code = tds.get(0).text();
                String name = tds.get(2).text();
                RegionVo child = new RegionVo(code, name, region.level + 1);
                region.child.add(child);
                System.out.println(space(child.level) + name);
            }
            return;
        }

        // Rows of the other levels come in two shapes:
        //   1) plain cells, no link:  <td>130101000000</td><td>name</td>
        //   2) both cells are links:  <td><a href="01/130102.html">130102000000</a></td>
        //                             <td><a href="01/130102.html">name</a></td>
        for (Element row : rows) {
            String code;
            String name;
            String nextUrl = null;
            Elements links = row.getElementsByTag("a");
            if (links.isEmpty()) {
                Elements tds = row.getElementsByTag("td");
                code = tds.get(0).text();
                name = tds.get(1).text();
            } else {
                nextUrl = links.get(0).attr("abs:href");
                code = links.get(0).text();
                name = links.get(1).text();
            }

            RegionVo child = new RegionVo(code, name, region.level + 1);
            region.child.add(child);
            System.out.println(space(child.level) + name);

            int currentlevel = LEVEL_MODE == LEVEL_MODE_STRING ? getLevel(type) : child.level;
            if (links.isEmpty() || currentlevel >= TARGET_LEVEL) {
                continue;
            }
            // Special cities jump from LEVEL_CITY straight to LEVEL_TOWN.
            String nextType;
            if (LEVEL_MODE == LEVEL_MODE_NUMBER && specialCitys.contains(name)) {
                nextType = LEVEL_TOWN;
            } else {
                nextType = types.get(types.indexOf(type) + 1);
            }
            parseNext(nextType, nextUrl, child);
        }
    }

    /**
     * Builds the console indentation for a node.
     *
     * @param level depth of the node
     * @return {@code level} spaces for levels 1..5, otherwise an empty string
     */
    private static String space(int level) {
        if (level <= 0 || level > MAX_INDENT_LEVEL) {
            return "";
        }
        StringBuilder indent = new StringBuilder(level);
        for (int i = 0; i < level; i++) {
            indent.append(' ');
        }
        return indent.toString();
    }
}
RegionVo
package com.bos.data.model.vo.basic;
import lombok.Data;
import java.util.List;
/**
 * One node of the crawled region tree.
 *
 * @Author tanghh
 * @Date 2020/6/23 10:41
 */
@Data
public class RegionVo {

    /** Region code. */
    public String code;

    /** Region name. */
    public String name;

    /** Depth of this node in the tree. */
    public int level;

    /** Child regions; stays {@code null} until a list is assigned. */
    public List<RegionVo> child;

    /**
     * Creates a node without children.
     *
     * @param code  region code
     * @param name  region name
     * @param level depth of the node
     */
    public RegionVo(String code, String name, int level) {
        this.level = level;
        this.name = name;
        this.code = code;
    }
}
RegionDirectoryModel
package com.bos.data.model;
import javax.persistence.*;
import java.io.Serializable;
import java.sql.Timestamp;
import java.util.Objects;
/**
 * JPA entity mapped to the {@code region_directory} table.
 *
 * @author luojie 2018/7/4
 */
@Entity
@Table(name = "region_directory", schema = "test", catalog = "")
public class RegionDirectoryModel implements Serializable {
    private Integer id;
    private Integer pid;
    private String name;
    private String nameCn;
    /** Defaults to "0" for every new instance. */
    private String isOpen="0";
    private Timestamp createTime;
    private Timestamp updateTime;
    private String createUser;
    private String updateUser;

    /** Primary key, generated by the database (identity column). */
    @Id
    @Column(name = "id")
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    @Basic
    @Column(name = "name")
    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    @Basic
    @Column(name = "name_CN")
    public String getNameCn() {
        return nameCn;
    }

    public void setNameCn(String nameCn) {
        this.nameCn = nameCn;
    }

    /** Id of the parent row. */
    @Basic
    @Column(name = "pid")
    public Integer getPid() {
        return pid;
    }

    public void setPid(Integer pid) {
        this.pid = pid;
    }

    @Basic
    @Column(name = "is_open")
    public String getIsOpen() {
        return isOpen;
    }

    public void setIsOpen(String isOpen) {
        this.isOpen = isOpen;
    }

    @Basic
    @Column(name = "create_time")
    public Timestamp getCreateTime() {
        return createTime;
    }

    public void setCreateTime(Timestamp createTime) {
        this.createTime = createTime;
    }

    @Basic
    @Column(name = "update_time")
    public Timestamp getUpdateTime() {
        return updateTime;
    }

    public void setUpdateTime(Timestamp updateTime) {
        this.updateTime = updateTime;
    }

    @Basic
    @Column(name = "create_user")
    public String getCreateUser() {
        return createUser;
    }

    public void setCreateUser(String createUser) {
        this.createUser = createUser;
    }

    @Basic
    @Column(name = "update_user")
    public String getUpdateUser() {
        return updateUser;
    }

    public void setUpdateUser(String updateUser) {
        this.updateUser = updateUser;
    }

    /** No-argument constructor. */
    public RegionDirectoryModel() {
    }

    /**
     * Creates a new, not yet persisted region.
     *
     * @param pid        id of the parent row
     * @param name       region name
     * @param createUser user recorded as creator
     * @param updateUser user recorded as last modifier
     */
    public RegionDirectoryModel(Integer pid, String name, String createUser, String updateUser) {
        this.pid = pid;
        this.name = name;
        this.createUser = createUser;
        this.updateUser = updateUser;
    }

    /**
     * Two models are equal when id, name, nameCn and pid are equal.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        RegionDirectoryModel that = (RegionDirectoryModel) o;
        // id is a boxed Integer: compare by value, not by reference. '==' only
        // happens to work for small cached values and fails for larger ids.
        return Objects.equals(id, that.id) &&
                Objects.equals(name, that.name) &&
                Objects.equals(nameCn, that.nameCn) &&
                Objects.equals(pid, that.pid);
    }

    @Override
    public int hashCode() {
        return Objects.hash(id, name, nameCn, pid);
    }
}
本篇文章就到這裏,
如果覺得筆者寫的不錯的話,歡迎評論點贊。
下篇文章貼出所有省份數據。