2020 年省份數據拉取

前言:

參考文章:

https://www.cnblogs.com/yangzhilong/p/3530700.html

https://www.cnblogs.com/liushaofeng89/p/4873086.html

最近因爲用戶反饋省份數據表單有部分缺失,百度了一圈以後決定還是自己拉取一下。省份數據來源於國家統計局,筆者拉取的是 2019 年的數據(2020-02-25 拉取)。

省份數據來源:國家統計局

http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/

筆者用的是 Java 的 jsoup 庫,關於 jsoup 的用法,可參考下面這篇文章:https://www.open-open.com/jsoup/

開始

1.準備一張表 region_directory

-- Region table: one row per administrative region; `pid` stores the id of the parent row.
-- NOTE(review): `name_CN` carries the comment '地域英文名' (English name of the region), which does not
-- match the column name -- confirm which of the two is intended.
-- NOTE(review): AUTO_INCREMENT=2421 makes new ids start at 2421; it looks carried over from an
-- existing table dump -- confirm it is wanted for a fresh table.
CREATE TABLE `region_directory` (
  `id` int(32) NOT NULL AUTO_INCREMENT,
  `pid` int(32) DEFAULT NULL COMMENT '父級ID',
  `name` varchar(64) DEFAULT NULL COMMENT '地域名稱',
  `name_CN` varchar(64) DEFAULT NULL COMMENT '地域英文名',
  `create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '創建時間',
  `update_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '修改時間',
  `create_user` varchar(255) DEFAULT NULL COMMENT '創建人',
  `update_user` varchar(255) DEFAULT NULL COMMENT '修改人',
  `is_open` char(2) DEFAULT NULL COMMENT '是否開啓 (0代表未開啓 1代表開啓)',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=2421 DEFAULT CHARSET=utf8 COMMENT='地域表';

2.需要在pom文件中引入jsoup 的jar 包。

 官方上現在有更高版本,我這邊使用的是目前使用人數比較多的版本。

     <!-- jsoup HTML parser library @ https://jsoup.org/ : used by the crawler to parse the fetched pages -->
     <!-- NOTE(review): the version is pinned to 1.11.3 although newer releases exist; consider upgrading after testing -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>

3.拉取數據的代碼主要在getRegionDirectory 這個接口中。

4.需要注意的一點是:下圖中的 name 代表全國省份一級的名稱。我加了一個判斷(name.equals("北京市")),先只拉取北京市的數據。之所以加這個判斷,是因爲數據量比較大,如果一次性拉取過多,連接會報 502;現在很多網站都會對頻繁請求做防範惡意攻擊的限制,這裏需要注意。

 

4.1 這就是上述圖片中描述的502報錯

5.接下來就可以在瀏覽器上訪問拉取數據的接口:

 控制檯打印一下 獲取的數據:

存到數據庫中的數據:

6.文章中涉及的所有代碼

RegionDirectoryController

package com.bos.controller.basic;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.bos.data.model.RegionDirectoryModel;
import com.bos.data.model.vo.basic.RegionVo;
import com.bos.data.repositories.jpa.setting.RegionDirectoryJPARepository;
import com.google.gson.Gson;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.transaction.interceptor.TransactionAspectSupport;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * REST controller that crawls the administrative-division pages starting at {@link #webUrl}
 * into a {@link RegionVo} tree and stores the first three levels of that tree through
 * {@link RegionDirectoryJPARepository}.
 *
 * @Author tanghh
 * @Date 2020/6/23 10:37
 */
@RestController
@RequestMapping(value = "/region")
public class RegionDirectoryController {
    private static final Logger logger = LoggerFactory.getLogger(RegionDirectoryController.class);

    @Autowired
    private RegionDirectoryJPARepository regionDirectoryJPARepository;

    /**
     * Row CSS classes of the crawled pages, ordered from the outermost level (province)
     * to the innermost one (village). The position in this list defines the "string" level.
     */
    private static final List<String> types = new ArrayList<>();
    /**
     * Cities whose next level is {@link #LEVEL_TOWN} instead of {@link #LEVEL_COUNTY}.
     */
    private static final List<String> specialCitys = new ArrayList<>();
    /**
     * Value passed as both createUser and updateUser for every saved row.
     */
    private static final String OPERATOR = "湯輝紅";

    /**
     * Row class of province entries.
     */
    public static final String LEVEL_PROVINCE = "provincetr";
    /**
     * Row class of city entries.
     */
    public static final String LEVEL_CITY = "citytr";
    /**
     * Row class of county / district entries.
     */
    public static final String LEVEL_COUNTY = "countytr";
    /**
     * Row class of town / street entries.
     */
    public static final String LEVEL_TOWN = "towntr";
    /**
     * Row class of village / neighbourhood-committee entries.
     */
    public static final String LEVEL_VILLAGE = "villagetr";

    /**
     * Level mode: the current level is derived from the row class via {@link #getLevel(String)}.
     */
    public static final int LEVEL_MODE_STRING = 1;
    /**
     * Level mode: the current level is the depth counter stored in {@link RegionVo#level}.
     */
    public static final int LEVEL_MODE_NUMBER = 2;
    /**
     * Charset used to decode the crawled pages.
     */
    public static final String CHARSET = "GBK";


    static {
        types.add(LEVEL_PROVINCE);
        types.add(LEVEL_CITY);
        types.add(LEVEL_COUNTY);
        types.add(LEVEL_TOWN);
        types.add(LEVEL_VILLAGE);
    }


    /*
     * Special cities: they are LEVEL_CITY rows, but their next level skips LEVEL_COUNTY and goes
     * straight to LEVEL_TOWN. The list is maintained by hand; add further cities here when they
     * are found to behave this way.
     */
    static {
        // The names are compared with the text scraped from the pages, which the source site
        // publishes in Simplified Chinese, so the Simplified spelling is required for a match.
        // The Traditional spelling is kept so that existing behaviour is not narrowed.
        specialCitys.add("东莞市");
        specialCitys.add("東莞市");
        specialCitys.add("中山市");
        specialCitys.add("儋州市");
    }

    //************************** adjust the values below to your environment **************************
    /**
     * Index page the crawl starts from.
     */
    public static final String webUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html";
    /**
     * File the JSON dump is written to by {@link #saveJson(RegionVo)}.
     */
    public static final String savePath = "C:/project/latestEbo/ebo-web/ebo/src/main/resources/china.json";

    /**
     * Scope of the crawl: either the whole country ("中國") or the name of one first-level region.
     */
    public static final String AREA = "中國";

    /**
     * Deepest level to crawl; interpreted according to {@link #LEVEL_MODE}.
     */
    public static int TARGET_LEVEL = 3;
    /**
     * How the current level is determined: {@link #LEVEL_MODE_STRING} or {@link #LEVEL_MODE_NUMBER}.
     */
    public static int LEVEL_MODE = LEVEL_MODE_NUMBER;
    //************************** adjust the values above to your environment **************************

    /**
     * Crawls the region tree, prints it as JSON and stores province / city / county rows.
     * Any failure is logged; it is not propagated to the HTTP caller.
     */
    @GetMapping(value = "/getRegionDirectory")
    public void getRegionDirectory() {
        try {
            System.out.println("開始抓取,請耐心等待!!!");
            System.out.println("抓取範圍:" + AREA + ",抓取模式(1--字符 2--數字):" + LEVEL_MODE + ",抓取層級:" + TARGET_LEVEL + "(模式爲字符:1--province,2--city,3--county,4--town,5--village;)");
            long starttime = System.currentTimeMillis();

            RegionVo region = crawlRegionTree();
            String jsonStr = new Gson().toJson(region);
            System.out.println(jsonStr);
            persistRegionTree(region);

            long endtime = System.currentTimeMillis();
            System.out.println("抓取完畢!!!耗時:" + (endtime - starttime) / 1000 / 60 + "min");
        } catch (Exception e) {
            logger.error("獲取省份數據失敗", e);
            markRollbackOnly();
        }

    }

    /**
     * Builds the region tree by reading the province rows of the index page and descending
     * into each selected province while the current level is below {@link #TARGET_LEVEL}.
     *
     * @return the root node (level 0) with the crawled provinces as children
     * @throws Exception if a page cannot be fetched or parsed
     */
    private static RegionVo crawlRegionTree() throws Exception {
        RegionVo region = new RegionVo("000000000000", "中國", 0);
        region.child = new ArrayList<>();
        Document doc = getDocument(webUrl);
        Elements provincetr = doc.getElementsByClass(LEVEL_PROVINCE);
        for (Element e : provincetr) {
            for (Element ea : e.getElementsByTag("a")) {
                // "abs:href" resolves the link against the page URL.
                String nextUrl = ea.attr("abs:href");
                String[] arr = nextUrl.split("/");
                // The province code is the file name of the link without extension, zero-padded.
                String code = arr[arr.length - 1].split("\\.")[0] + "0000000000";
                String name = ea.text();

                // Only Beijing is crawled on purpose: pulling every province in one run makes
                // the source site answer with HTTP 502.
                if (!"北京市".equals(name)) {
                    continue;
                }
                if (!(AREA.equals("中國") || AREA.equals(name))) {
                    continue;
                }
                System.out.println(name);
                RegionVo child = new RegionVo(code, name, 1);
                region.child.add(child);
                int currentlevel = LEVEL_MODE == LEVEL_MODE_STRING ? getLevel(LEVEL_PROVINCE) : child.level;
                // Descend only while the target level has not been reached.
                if (currentlevel < TARGET_LEVEL) {
                    parseNext(types.get(1), nextUrl, child);
                }
            }
        }
        return region;
    }

    /**
     * Stores the first three levels below {@code root}: provinces with parent id 0, cities
     * under their province and counties under their city. Nodes without children are saved
     * without descending further.
     *
     * @param root root of the crawled tree
     */
    private void persistRegionTree(RegionVo root) {
        for (RegionVo province : childrenOf(root)) {
            RegionDirectoryModel provinceModel = regionDirectoryJPARepository.saveAndFlush(
                    new RegionDirectoryModel(0, province.name, OPERATOR, OPERATOR));
            for (RegionVo city : childrenOf(province)) {
                RegionDirectoryModel cityModel = regionDirectoryJPARepository.saveAndFlush(
                        new RegionDirectoryModel(provinceModel.getId(), city.name, OPERATOR, OPERATOR));
                for (RegionVo county : childrenOf(city)) {
                    regionDirectoryJPARepository.save(
                            new RegionDirectoryModel(cityModel.getId(), county.name, OPERATOR, OPERATOR));
                }
            }
        }
    }

    /**
     * Returns the children of a node, or an empty list when the node was never expanded
     * (its {@code child} field is still {@code null}).
     */
    private static List<RegionVo> childrenOf(RegionVo region) {
        return region.child == null ? Collections.<RegionVo>emptyList() : region.child;
    }

    /**
     * Marks the current transaction rollback-only, if there is one.
     * NOTE(review): this method is not declared transactional in this class, so normally no
     * transaction is bound here and the lookup itself throws; that exception is caught so it
     * cannot replace the failure that was just logged. Confirm whether the import is meant
     * to run in a transaction.
     */
    private void markRollbackOnly() {
        try {
            TransactionAspectSupport.currentTransactionStatus().setRollbackOnly();
        } catch (RuntimeException noTransaction) {
            logger.warn("No active transaction to mark rollback-only", noTransaction);
        }
    }

    /**
     * Fetches a page and parses it with {@link #CHARSET}.
     *
     * @param url absolute URL of the page; also used as base URI for relative links
     * @return the parsed document
     * @throws IOException if the page cannot be read
     */
    private static Document getDocument(String url) throws IOException {
        // try-with-resources guarantees the connection stream is closed even if parsing fails.
        try (InputStream in = new URL(url).openStream()) {
            return Jsoup.parse(in, CHARSET, url);
        }
    }

    /**
     * @param type one of the LEVEL_ row classes
     * @return 1-based position of the class in the level list; 0 when the class is unknown
     */
    private static int getLevel(String type) {
        return types.indexOf(type) + 1;
    }

    /**
     * Writes the tree as JSON to {@link #savePath}, replacing an existing file.
     * The file is written as UTF-8 so the output does not depend on the platform charset.
     *
     * @param region root of the tree to dump
     * @throws IOException if the file cannot be written
     */
    private static void saveJson(RegionVo region) throws IOException {
        try (BufferedWriter bw = Files.newBufferedWriter(Paths.get(savePath), StandardCharsets.UTF_8)) {
            bw.write(new Gson().toJson(region));
        }
    }

    /**
     * Parses the next level below {@code region} and recurses while the current level is
     * below {@link #TARGET_LEVEL}.
     *
     * @param type   row class to read on the page, one of the LEVEL_ constants
     * @param url    URL of the page to crawl
     * @param region node that receives the parsed rows as children
     * @throws Exception if a page cannot be fetched or parsed
     */
    public static void parseNext(String type, String url, RegionVo region) throws Exception {
        region.child = new ArrayList<>();
        Document doc = getDocument(url);
        Elements es = doc.getElementsByClass(type);
        if (LEVEL_VILLAGE.equals(type)) {
            // Village rows have three plain cells: <td>code</td><td>class code</td><td>name</td>
            for (Element e : es) {
                Elements tds = e.getElementsByTag("td");
                String code = tds.get(0).text();
                String name = tds.get(2).text();
                RegionVo child = new RegionVo(code, name, region.level + 1);
                region.child.add(child);
                System.out.println(space(child.level) + name);
            }
        } else {
            // Two row shapes have to be handled:
            //   1) plain cells, no link:  <tr class="countytr"><td>CODE</td><td>NAME</td></tr>
            //   2) linked cells:          <tr class="countytr"><td><a href="01/130102.html">CODE</a></td>
            //                                 <td><a href="01/130102.html">NAME</a></td></tr>
            for (Element e : es) {
                String code = null;
                String name = null;
                String nextUrl = null;
                Elements a = e.getElementsByTag("a");
                if (a.isEmpty()) {
                    // Shape 1: leaf row, nothing to follow.
                    Elements tds = e.getElementsByTag("td");
                    code = tds.get(0).text();
                    name = tds.get(1).text();
                } else {
                    // Shape 2: first link carries the code, second link the name.
                    nextUrl = a.get(0).attr("abs:href");
                    code = a.get(0).text();
                    name = a.get(1).text();
                }
                RegionVo child = new RegionVo(code, name, region.level + 1);
                region.child.add(child);
                System.out.println(space(child.level) + name);
                int currentlevel = LEVEL_MODE == LEVEL_MODE_STRING ? getLevel(type) : child.level;
                if (!a.isEmpty() && currentlevel < TARGET_LEVEL) {
                    // Special cities jump from LEVEL_CITY straight to LEVEL_TOWN.
                    String nextType = null;
                    if (LEVEL_MODE == LEVEL_MODE_NUMBER
                            && (specialCitys.contains(name))) {
                        nextType = LEVEL_TOWN;
                    } else {
                        nextType = types.get(types.indexOf(type) + 1);
                    }

                    parseNext(nextType, nextUrl, child);
                }
            }
        }
    }

    /**
     * Returns the indentation used when printing a region name: {@code level} spaces,
     * or no indentation at all for levels above 5.
     */
    private static String space(int level) {
        if (level > 5) {
            return "";
        }
        return "      ".substring(0, level);
    }


}

RegionVo

package com.bos.data.model.vo.basic;

import lombok.Data;

import java.util.List;

/**
 * One node of the crawled region tree: a region with its code, name, depth and children.
 * The fields are public and are assigned directly by the constructor below.
 *
 * @Author tanghh
 * @Date 2020/6/23 10:41
 */
@Data
public class RegionVo {
    /**
     * Region code.
     */
    public String code;
    /**
     * Region name.
     */
    public String name;
    /**
     * Level (depth) of this node.
     */
    public int level;
    /**
     * Child regions; not set by the constructor, so it stays {@code null} until assigned.
     */
    public List<RegionVo> child;

    /**
     * Creates a node without children.
     *
     * @param code  region code
     * @param name  region name
     * @param level level of the node
     */
    public RegionVo(String code, String name, int level) {
        this.code = code;
        this.name = name;
        this.level = level;
    }

}

RegionDirectoryModel

package com.bos.data.model;

import javax.persistence.*;
import java.io.Serializable;
import java.sql.Timestamp;
import java.util.Objects;

/**
 * JPA entity mapped to the {@code region_directory} table. Persistence annotations are
 * placed on the getters, so property access is used.
 *
 * @author luojie 2018/7/4
 */
@Entity
@Table(name = "region_directory", schema = "test", catalog = "")
public class RegionDirectoryModel implements Serializable {
    private Integer id;
    private Integer pid;
    private String name;
    private String nameCn;
    // New instances start with is_open = "0".
    private String isOpen="0";
    private Timestamp createTime;
    private Timestamp updateTime;
    private String createUser;
    private String updateUser;

    /**
     * @return primary key, generated by the database identity column
     */
    @Id
    @Column(name = "id")
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    /**
     * @return region name
     */
    @Basic
    @Column(name = "name")
    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    /**
     * @return value of the {@code name_CN} column
     */
    @Basic
    @Column(name = "name_CN")
    public String getNameCn() {
        return nameCn;
    }

    public void setNameCn(String nameCn) {
        this.nameCn = nameCn;
    }

    /**
     * @return id of the parent region row
     */
    @Basic
    @Column(name = "pid")
    public Integer getPid() {
        return pid;
    }

    public void setPid(Integer pid) {
        this.pid = pid;
    }

    /**
     * @return value of the {@code is_open} column
     */
    @Basic
    @Column(name = "is_open")
    public String getIsOpen() {
        return isOpen;
    }

    public void setIsOpen(String isOpen) {
        this.isOpen = isOpen;
    }

    /**
     * @return creation timestamp
     */
    @Basic
    @Column(name = "create_time")
    public Timestamp getCreateTime() {
        return createTime;
    }

    public void setCreateTime(Timestamp createTime) {
        this.createTime = createTime;
    }

    /**
     * @return last-update timestamp
     */
    @Basic
    @Column(name = "update_time")
    public Timestamp getUpdateTime() {
        return updateTime;
    }

    public void setUpdateTime(Timestamp updateTime) {
        this.updateTime = updateTime;
    }

    /**
     * @return user who created the row
     */
    @Basic
    @Column(name = "create_user")
    public String getCreateUser() {
        return createUser;
    }

    public void setCreateUser(String createUser) {
        this.createUser = createUser;
    }

    /**
     * @return user who last updated the row
     */
    @Basic
    @Column(name = "update_user")
    public String getUpdateUser() {
        return updateUser;
    }

    public void setUpdateUser(String updateUser) {
        this.updateUser = updateUser;
    }

    /**
     * No-argument constructor.
     */
    public RegionDirectoryModel() {

    }

    /**
     * Creates a region row that has not been saved yet (no id).
     *
     * @param pid        id of the parent region
     * @param name       region name
     * @param createUser user recorded as creator
     * @param updateUser user recorded as last modifier
     */
    public RegionDirectoryModel(Integer pid, String name, String createUser, String updateUser) {
        this.pid = pid;
        this.name = name;
        this.createUser = createUser;
        this.updateUser = updateUser;
    }

    /**
     * Two rows are equal when id, name, nameCn and pid are equal.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        RegionDirectoryModel that = (RegionDirectoryModel) o;
        // id is a boxed Integer: "==" would compare references and fail for equal values
        // outside the small-integer cache, so compare by value.
        return Objects.equals(id, that.id) &&
                Objects.equals(name, that.name) &&
                Objects.equals(nameCn, that.nameCn) &&
                Objects.equals(pid, that.pid);
    }

    @Override
    public int hashCode() {
        return Objects.hash(id, name, nameCn, pid);
    }
}

本篇文章就到這裏,

如果覺得筆者寫的不錯的話,歡迎評論點贊。

下篇文章貼出所有省份數據。

https://blog.csdn.net/tangthh123/article/details/106948980

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章