python_爬蟲爬取前瞻產業研究院的全國產業園區polygon範圍面狀圖層shp

總體思路爲:
1、從網站上獲取每個園區的id
2、從每個園區的網頁上獲取地圖的iframe
3、保存加載地圖iframe的網頁,因爲裏面有polygon數據
4、從html網頁中使用正則表達式提取polygon,並將數據處理成arcgis所能識別的數據格式
5、使用arcgis將點轉線,再將線轉面,最後導出爲shp文件

最終成果如下圖:
全國產業園區shp文件

import os
from datetime import datetime
from urllib import request
import pandas as pd
import re
# 瀏覽器的請求頭
from bs4 import BeautifulSoup

from utils.read_write import writeOneCSV, readTXT, writeTXT, writeOneTXT

# Browser request headers attached to every urllib request.
headers = {"User-agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                        "(KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0"}

# Install a global opener so request.urlopen() sends our User-agent.
# BUG FIX: the original did ``opener.add_headers = [headers]`` which merely
# created an unused attribute — urllib reads ``addheaders`` and expects a
# list of (name, value) tuples, so the custom User-agent was never sent.
opener = request.build_opener()
opener.addheaders = list(headers.items())
request.install_opener(opener)

def getCodeList():
    """Read the province-id list (one id per line) from the local id file.

    Returns:
        The ids as returned by ``readTXT`` (used to build listing URLs).
    """
    # Raw string: the original plain literal relied on '\p', '\j', '\d'
    # NOT being escape sequences, which raises SyntaxWarning on modern
    # Python and is an error in future versions.
    return readTXT(r'D:\project\jianguiyuan\data\省份id.txt')


# Send an HTTP request and return the decoded body.
def requerts_url(url,i,start_page):
    """Fetch ``url`` and return its body decoded as UTF-8.

    On failure, logs the context (timestamp, province index ``i``,
    ``start_page``, the failing URL) and restarts the whole crawl via
    ``bian(i, start_page)``; returns None in that case.

    Args:
        url: absolute URL to fetch.
        i: current index into the global ``codeList`` (for logging/restart).
        start_page: pagination start passed through to the restart.
    """
    try:
        return request.urlopen(url).read().decode("utf-8")
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # can still abort the crawler.
        print(datetime.now())
        print(i)
        print(start_page)
        print(url)
        # Best-effort recovery: re-enter the crawl loop at the failing spot.
        bian(i,start_page)


# The site lists about 26,123 parks in total.
def _save_park_pages(listing_soup, i, start_page, verbose=False):
    """Save the map-iframe HTML for every park row of one listing page.

    Each table row links to a park detail page; that page embeds the map in
    an <iframe> whose target HTML contains the polygon data, which is saved
    to ``savePath + href``.
    """
    for row in listing_soup.table.find_all('tr')[1:]:
        # e.g. href = /yuanqu/item/eb42f331cc0f3062.html
        href = row.find_all('a')[0].attrs['href']
        detail = requerts_url("https://f.qianzhan.com" + href, i, start_page)
        detail_soup = BeautifulSoup(detail, 'lxml')
        # NOTE(review): reading attribute 'bus_taxi' looks like a
        # republishing artifact of 'src' — confirm against the live page.
        # Preserved as-is to keep behavior identical.
        src = detail_soup.findAll('iframe')[0].attrs['bus_taxi']
        # e.g. https://f.qianzhan.com/yuanqu/yqmap?center=122,23&zoom=14&yid=...
        map_html = requerts_url("https://f.qianzhan.com" + src, i, start_page)
        writeOneTXT(map_html, savePath + href)
        if verbose:
            print(savePath + href)


def bian(start_x,start_page):
    """Crawl every province listing from index ``start_x`` onward.

    For each province: scrape page 1, derive the page count from the
    "收錄N個" text (20 parks per page), then scrape pages
    ``start_page`` .. ``page-1`` via the ``?pg=`` query parameter.

    Args:
        start_x: starting index into the global ``codeList``.
        start_page: first paginated page number to fetch (page 1 is
            always fetched as the bare listing URL).
    """
    for i in range(start_x,len(codeList)):
        print(codeList[i])
        firsturl = "https://f.qianzhan.com/yuanqu/diqu/"+codeList[i]+'/'
        data = requerts_url(firsturl, i,start_page)
        soup = BeautifulSoup(data, 'lxml')
        _save_park_pages(soup, i, start_page)

        # Total park count drives pagination: ceil(N/20) pages, +1 because
        # range() excludes the end, hence the +2.
        end = re.findall(r"收錄(.+?)個", data)
        if not end:
            # Robustness: skip pagination when the count cannot be parsed
            # (the original crashed with IndexError here).
            continue
        page = int(int(end[0])/20)+2
        for x in range(start_page, page):
            print(x)
            data1 = requerts_url(firsturl + '?pg='+str(x), i,start_page)
            _save_park_pages(BeautifulSoup(data1, 'lxml'), i, start_page,
                             verbose=True)


if __name__ == '__main__':
    # Province ids drive the crawl; paths below are Windows-specific and
    # read as globals by bian()/_save_park_pages().
    codeList = getCodeList()
    # Raw strings prevent backslash-escape surprises in Windows paths
    # (values are identical to the original non-raw literals).
    savePath = r'D:\da\map'
    # NOTE(review): unused in this file; the trailing '\\' in a raw string
    # yields TWO literal backslashes — confirm intent before reusing.
    path = r'D:\dat區\\'
    # Start at province index 0; paginated pages begin at ?pg=2
    # (page 1 is fetched as the bare listing URL inside bian()).
    bian(0,2)

如需數據或幫忙處理數據請私聊我。。。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章