2020 最新 Python 3.8 + MySQL 爬取國家統計局區域代碼:省、市、區、街道鄉鎮區域代碼

 

#coding=utf-8
#-*- coding: utf-8 -*-



import os
import time
import urllib.request

from bs4 import BeautifulSoup
import pymysql
# Let pymysql be used as if it were MySQLdb. This call is optional: without
# it, the code simply uses the pymysql API directly.
pymysql.install_as_MySQLdb()

# Database credentials used by execSql().  Each value can be overridden with
# an environment variable so a real password does not have to live in the
# source file; the fallbacks are the values the script originally hard-coded.
user = os.environ.get('MYSQL_USER', 'root')
pas = os.environ.get('MYSQL_PASSWORD', '123456')
db = os.environ.get('MYSQL_DATABASE', 'testdb')




def execSql(sql, params=None):
    """Run one SQL statement on a fresh connection and commit it.

    A new connection to the local MySQL server is opened for every call,
    using the module-level ``user``, ``pas`` and ``db`` settings, and is
    always closed again before returning.

    Args:
        sql: The statement to execute.  May contain ``%s`` placeholders when
            ``params`` is supplied.
        params: Optional values for the placeholders, passed straight to
            ``cursor.execute`` so the driver does the quoting.  Omit it to
            execute ``sql`` verbatim, as before.

    Returns:
        None.  A failing statement is rolled back and reported on stdout
        rather than raised, so one bad row does not abort the caller.
        Failure to connect still raises, as it always did.
    """
    conn = pymysql.connect(host='localhost', user=user, passwd=pas, db=db,
                           port=3306, charset='utf8')
    try:
        # The cursor context manager closes the cursor on exit.
        with conn.cursor() as cur:
            cur.execute(sql, params)
        conn.commit()
    except Exception as err:
        # Best effort, like the original, but no longer silent.
        print("SQL failed (%s): %s" % (err, sql))
        conn.rollback()
    finally:
        # Previously the connection was only closed on the error path.
        conn.close()

def getHTMLText(url, max_tries=20, timeout=30):
    """Download ``url`` and return its body decoded as GBK text.

    The request is retried immediately after any ordinary error (network
    failure, HTTP error, undecodable body) until ``max_tries`` attempts have
    been made.

    Args:
        url: Address of the page to fetch.
        max_tries: Maximum number of attempts before giving up.
        timeout: Per-attempt timeout in seconds.  ``urlopen`` measures its
            timeout in seconds, so the former value of 30000 meant more than
            eight hours per attempt.

    Returns:
        The decoded page text, or None when every attempt failed (a message
        is printed in that case).
    """
    last_error = None
    for _ in range(max_tries):
        try:
            # The context manager closes the response/socket after reading.
            with urllib.request.urlopen(url, timeout=timeout) as response:
                return response.read().decode('gbk')
        except Exception as err:
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
            # SystemExit are not swallowed by the retry loop.
            last_error = err
    print("Has tried %d times to access url %s, all failed!" % (max_tries, url))
    if last_error is not None:
        print("Last error: %r" % (last_error,))
    return None


indexs = 'index.html'
url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/'


def _fetch_anchors(page_url):
    """Return the <a> tags found on ``page_url``, minus the last one.

    The final anchor of every page is discarded, exactly as the original
    script did (presumably it is a footer link rather than a region entry;
    that cannot be verified from this file).  An empty list is returned when
    the page could not be downloaded or contains no anchors, so the caller
    skips that subtree instead of crashing.
    """
    html = getHTMLText(page_url)
    if not html:
        print("skip " + page_url + ": page could not be downloaded")
        return []
    anchors = BeautifulSoup(html, 'html.parser').find_all('a')
    if anchors:
        anchors.pop()
    return anchors


def _iter_entries(anchors, delay=0):
    """Yield ``(href, code, name)`` for each entry in a list of anchors.

    An entry is read from two consecutive anchors: the text of the first is
    used as the code and the text of the second as the name.  An anchor whose
    text equals the name just consumed is skipped, which is how the second
    anchor of each pair is stepped over.  ``delay`` seconds are slept before
    every anchor is examined (including skipped ones), matching the request
    pacing of the original loops.
    """
    last_name = ''
    for pos in range(len(anchors) - 1):
        if delay:
            time.sleep(delay)
        if anchors[pos].text == last_name:
            continue
        last_name = anchors[pos + 1].text
        yield anchors[pos]['href'], anchors[pos].text, last_name


def _insert_region(region_id, code, name, level, parent_id):
    """Insert one row into ``test2`` via execSql()."""
    # code and name are scraped text, so escape them before splicing them
    # into the statement; a stray quote would otherwise break the insert.
    escape = pymysql.converters.escape_string
    sql = ("insert into test2 (id,code,name,level,pid)values('"
           + str(region_id) + "','" + escape(code) + "','" + escape(name)
           + "','" + level + "','" + str(parent_id) + "');")
    execSql(sql)


def _crawl():
    """Walk the index page and three levels of sub-pages, storing every entry.

    Rows are written with level '0' for index-page entries down to level '3'
    for entries on the deepest pages; ``pid`` is the id of the enclosing
    entry (level-0 rows use their own id as ``pid``).
    """
    last_id = 0
    for idx, top in enumerate(_fetch_anchors(url + indexs)):
        # Top-level ids advance by idx + 1 instead of 1.  This is kept from
        # the original so that the generated ids do not change.
        last_id = last_id + idx + 1
        top_id = last_id
        top_code = top['href'][0:2]
        _insert_region(top_id, top_code, top.text, '0', top_id)
        print("========" + top_code + "," + top.text + "========")
        time.sleep(2)

        second_level = _fetch_anchors(url + top['href'])
        for second_href, second_code, second_name in _iter_entries(second_level, 3):
            last_id += 1
            second_id = last_id
            _insert_region(second_id, second_code, second_name, '1', top_id)
            print(second_href + "," + second_code + "," + second_name)

            third_level = _fetch_anchors(url + second_href)
            for third_href, third_code, third_name in _iter_entries(third_level, 3):
                last_id += 1
                third_id = last_id
                _insert_region(third_id, third_code, third_name, '2', second_id)
                print("   >[" + third_code + "," + third_name + "]")

                # hrefs on this level are relative to the top-level entry's
                # directory.  ``url`` already ends with '/', so no extra
                # slash is inserted (the original produced '//').
                leaves = _fetch_anchors(url + top_code + '/' + third_href)
                print("----->>>>> " + str(len(leaves) / 2) + " <<<<<<------")
                for _leaf_href, leaf_code, leaf_name in _iter_entries(leaves):
                    last_id += 1
                    _insert_region(last_id, leaf_code, leaf_name, '3', third_id)
                    print("   ====[" + leaf_code + "," + leaf_name + "]====")


# Runs immediately when the file is executed, as the original flat script did.
_crawl()







發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章