獲取最新中國行政區劃代碼(抓取自國家統計局網站)

廢話少說,上代碼(Python 2,需先安裝 requests 與 lxml):

import urlparse
from StringIO import StringIO
import datetime
import requests
import lxml
from lxml import etree

def get_latest_url(index_url):
    """Return the absolute URL of the first entry linked from the index page.

    Downloads ``index_url``, looks for the single
    ``<ul class="center_list_contlist">`` element and resolves the ``href`` of
    its first ``li/a`` link against ``index_url``. The first link is
    presumably the most recent release; the page ordering is not verified
    here.

    Args:
        index_url: URL of the listing page to scrape.

    Returns:
        The absolute URL of the first linked page, or ``None`` when the page
        does not contain exactly one matching list or that list has no links.
    """
    response = requests.get(index_url)
    tree = etree.parse(StringIO(response.content), etree.HTMLParser())

    lists = tree.xpath('//ul[@class="center_list_contlist"]')
    if len(lists) != 1:
        return None

    hrefs = lists[0].xpath('li/a/@href')
    if not hrefs:
        # A list without links used to raise IndexError; report it the same
        # way as a missing list so callers only have to check for None.
        return None
    return urlparse.urljoin(index_url, hrefs[0])

def _parent_code(code):
    """Return the code of the division one level above ``code`` ('' if none)."""
    if code.endswith('0000'):
        # Codes ending in '0000' are treated as top level: no parent.
        return ''
    if code.endswith('00'):
        return code[:2] + '0000'
    return code[:4] + '00'


def get_xingzhengquhua_text(latest_url, referer=None):
    """Download a division-code page and return it as UTF-8 CSV-like text.

    The page is expected to contain exactly one ``<div class="xilan_con">``
    whose first ``div/div`` descendant holds ``<p>`` elements; each usable
    paragraph carries the code in its first ``<span>`` and the name in its
    second one. Paragraphs missing either span are skipped.

    Args:
        latest_url: URL of the page to scrape.
        referer: Accepted for interface compatibility; not used by this
            function.

    Returns:
        A UTF-8 encoded string with one ``parent,code,name`` line per entry
        (``parent`` is empty for codes ending in ``0000``), or ``None`` when
        the expected container structure is not found. The text may be empty
        if no paragraph yields both a code and a name.
    """
    response = requests.get(latest_url)
    tree = etree.parse(StringIO(response.content), etree.HTMLParser())
    containers = tree.xpath('//div[@class="xilan_con"]')
    print(containers)
    if len(containers) != 1:
        return None

    inner = containers[0].xpath('div/div')
    if not inner:
        # Unexpected markup: signal failure instead of raising IndexError.
        return None

    rows = []
    for paragraph in inner[0].xpath('.//p'):
        codes = paragraph.xpath('span[1]/text()')
        names = paragraph.xpath('span[2]/text()')
        if not codes or not names:
            # Not a data row (e.g. a heading or an empty paragraph).
            continue
        code = codes[0]
        # Names are padded with ideographic spaces (U+3000) on the page.
        name = names[0].strip(u'\u3000')
        rows.append((_parent_code(code), code, name))

    text = '\n'.join(','.join(row) for row in rows)
    text = text.encode('utf-8')
    print(text)
    return text

if __name__ == '__main__':
    # Scrape the index page for the first listed release, download it and
    # save the extracted "parent,code,name" lines to a text file in the
    # current directory.
    index_url = 'http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/'
    # To fetch one specific release instead of the first one listed, set
    # latest_url directly, e.g.
    # 'http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201401/t20140116_501070.html'
    latest_url = get_latest_url(index_url)
    print(latest_url)
    if latest_url:
        text = get_xingzhengquhua_text(latest_url)
        page_name = latest_url.strip().split('/')[-1]
        print(page_name)
        # Page names look like 't20140116_501070.html': drop the leading
        # letter and keep the part before the first underscore. Splitting and
        # slicing never raise, so validate the result explicitly and fall
        # back to today's date when it is not a run of digits.
        stamp = page_name.split('_')[0][1:]
        if not stamp.isdigit():
            stamp = datetime.datetime.now().strftime('%Y-%m-%d')

        if text:
            # The context manager closes the file even if the write fails.
            with open('latest-xingzhengquhua-%s.txt' % stamp, 'w') as out:
                out.write(text)
        else:
            print('Failed get xingzhengquehua data!')
    else:
        print('Failed get latest data url')
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章