廢話少說,上代碼:
import urlparse
from StringIO import StringIO
import datetime
import requests
import lxml
from lxml import etree
def get_latest_url(index_url, timeout=30):
    """Return the absolute URL of the first entry linked from an index page.

    The page at ``index_url`` is downloaded and parsed as HTML.  The function
    looks for ``<ul class="center_list_contlist">`` and takes the first
    ``li/a/@href`` below it, resolved against ``index_url``.  The first link
    is presumably the most recent publication; the page ordering is assumed,
    not checked.

    Args:
        index_url: URL of the listing page.
        timeout: Seconds handed to ``requests.get`` so that an unresponsive
            server cannot block the script indefinitely.

    Returns:
        The absolute URL of the first linked entry, or ``None`` when the page
        does not contain exactly one matching ``<ul>`` or that list holds no
        links.

    Raises:
        Whatever ``requests.get`` raises (connection errors, timeouts); these
        are not caught here.
    """
    response = requests.get(index_url, timeout=timeout)
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(response.content), parser)
    lists = tree.xpath('//ul[@class="center_list_contlist"]')
    if len(lists) != 1:
        return None
    hrefs = lists[0].xpath('li/a/@href')
    if not hrefs:
        # A list without links used to raise IndexError; report it as
        # "not found" so the caller's None check covers this case too.
        return None
    return urlparse.urljoin(index_url, hrefs[0])
def _parent_code(code):
    """Derive the parent code of ``code`` from its trailing zeros.

    Codes ending in ``0000`` have no parent (empty string); codes ending in
    ``00`` hang under ``<first two chars>0000``; everything else hangs under
    ``<first four chars>00``.  Presumably these are the province / prefecture
    / county levels of the six-digit division codes -- not verified here.
    """
    if code.endswith('0000'):
        return ''
    if code.endswith('00'):
        return code[:2] + '0000'
    return code[:4] + '00'


def get_xingzhengquhua_text(latest_url, referer=None, timeout=30):
    """Download a code table page and return it as ``parent,code,name`` lines.

    The page is parsed as HTML.  Inside the single ``<div class="xilan_con">``
    the first ``div/div`` descendant is searched for ``<p>`` elements; each
    paragraph contributes one row built from the text of its first span
    (code) and second span (name).  Paragraphs lacking either span text are
    skipped.

    Args:
        latest_url: URL of the page holding the table.
        referer: Optional value sent as the ``Referer`` request header.
        timeout: Seconds handed to ``requests.get`` so that an unresponsive
            server cannot block the script indefinitely.

    Returns:
        UTF-8 encoded text with one ``parent,code,name`` row per line (an
        empty string when no row could be extracted), or ``None`` when the
        page does not have the expected container structure.  The encoded
        text is also printed to standard output.

    Raises:
        Whatever ``requests.get`` raises (connection errors, timeouts); these
        are not caught here.
    """
    headers = {'Referer': referer} if referer else None
    response = requests.get(latest_url, headers=headers, timeout=timeout)
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(response.content), parser)
    containers = tree.xpath('//div[@class="xilan_con"]')
    if len(containers) != 1:
        return None
    inner = containers[0].xpath('div/div')
    if not inner:
        # Unexpected layout: report failure instead of raising IndexError.
        return None

    rows = []
    for paragraph in inner[0].xpath('.//p'):
        try:
            code = paragraph.xpath('span[1]/text()')[0]
            name = paragraph.xpath('span[2]/text()')[0]
        except IndexError:
            # Paragraph without both span texts is not a data row.
            continue
        # Surrounding whitespace would defeat the suffix tests used to
        # derive the parent code, so normalise the code first.
        code = code.strip()
        if not code:
            continue
        name = name.strip(u'\u3000')
        rows.append((_parent_code(code), code, name))

    text = '\n'.join(','.join(row) for row in rows)
    text = text.encode('utf-8')
    print(text)
    return text
if __name__ == '__main__':
    # Script entry point: locate the newest code table from the index page,
    # download it and store it as latest-xingzhengquhua-<suffix>.txt in the
    # current working directory.
    index_url = 'http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/'
    latest_url = get_latest_url(index_url)
    print(latest_url)
    if latest_url:
        text = get_xingzhengquhua_text(latest_url)
        filename = latest_url.strip().split('/')[-1]
        print(filename)
        # Pages are expected to be named like "t20140116_501070.html"
        # (e.g. .../xzqhdm/201401/t20140116_501070.html); the digits after
        # the leading "t" become the file suffix.  Slicing never raises, so
        # the pattern is checked explicitly and today's date is used when
        # the name does not match.
        stem = filename.split('_')[0]
        if stem.startswith('t') and stem[1:].isdigit():
            filename = stem[1:]
        else:
            filename = datetime.datetime.now().strftime('%Y-%m-%d')
        if text:
            # The context manager closes the file even if the write fails.
            with open('latest-xingzhengquhua-%s.txt' % filename, 'w') as ff:
                ff.write(text)
        else:
            print('Failed get xingzhengquehua data!')
    else:
        print('Failed get latest data url')