參考鏈接: 從人人網獲取全國中學信息(省市縣)
主要代碼參考並改進自上面博客的原創代碼,用於從人人網的select彈框form中抓取全國高校名單。
主要代碼塊如下
def getProvinceData():
    """Parse ``./cityArray.js`` into a list of provinces with their cities.

    Every line of the file is scanned for ``<digits>:<name>`` pairs.  Pairs
    whose numeric id is exactly four digits long are collected as cities.
    The first id on the line, cut to at most four digits, is used as the
    province id, and the line is kept only when that id (as an ``int``) is a
    key of the module-level ``provinceMap``.

    Returns:
        A list with one dict per kept line, holding ``"id"`` (the id string),
        ``"name"`` (the ``provinceMap`` value) and ``"citys"`` (a list of
        ``{"id": ..., "name": ...}`` dicts whose names were passed through
        ``unescape``).
    """
    # Raw string: the same pattern as before, minus the invalid "\d" / "\w"
    # string escapes that newer Python versions warn about.
    pattern = re.compile(r"(\d+):([\w\d\\]+)")
    province_list = []
    # The context manager closes the file even if parsing raises.
    with open("./cityArray.js", encoding='utf-8') as content:
        for line in content:
            data = pattern.findall(line)
            if not data:
                # Blank or unrelated line: data[0] below would raise.
                continue
            # Four-digit ids are the city entries.
            citys = [
                {"id": city_id, "name": unescape(raw_name)}
                for city_id, raw_name in data
                if len(city_id) == 4
            ]
            # A four-digit id is kept whole, anything longer is truncated;
            # a single slice covers both cases.
            province_id = data[0][0][:4]
            province_key = int(province_id)
            # Only handle the provinces listed in provinceMap.
            if province_key in provinceMap:
                province_list.append({
                    "id": province_id,
                    "name": provinceMap[province_key],
                    "citys": citys,
                })
    return province_list
def getTownHtml(town_id, scoolType, timeout=10):
    """Fetch ``http://support.renren.com/<scoolType>/<town_id>.html``.

    The request is sent with the module-level ``headers``.  The downloaded
    text is also echoed to stdout.

    Args:
        town_id: value substituted for the file-name part of the URL.
        scoolType: value substituted for the path segment before it.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The response body as text, or ``None`` when the request fails.
    """
    url = "http://support.renren.com/{0}/{1}.html".format(scoolType, town_id)
    try:
        # Without a timeout a stalled connection would block forever.
        html = requests.get(url, headers=headers, timeout=timeout).text
    except requests.RequestException:
        # Only network-level failures are treated as best-effort; programming
        # errors and Ctrl-C are no longer swallowed.
        print("網絡錯誤!")
        return None
    print(html)
    return html
def _nodeToHtml(node):
    """Serialise an lxml element to a UTF-8 decoded HTML string."""
    return etree.tostring(
        node, encoding='utf-8', pretty_print=True, method="html"
    ).decode(encoding="utf-8")


def getCitySchool(content):
    """Extract the districts of a page and the schools listed under each.

    Every ``<ul>`` in *content* is classified by its ``id`` attribute:

    * the list whose id is ``schoolCityQuList`` supplies the districts; each
      ``<a>`` inside it becomes ``{"name": ..., "id": ...}``, the id being
      the text that follows ``'city_qu_`` in the serialised link;
    * any other list is read as schools (``{"name": ..., "id": ...}`` with
      the link's ``href`` as id) and stored under ``"schoolList"`` on every
      district whose id equals the list's own id with ``city_qu_`` removed.

    Each serialised school link is echoed to stdout.

    Args:
        content: HTML text to parse.

    Returns:
        The list of district dicts, or ``None`` when the page contains no
        ``schoolCityQuList`` list.
    """
    selector = etree.HTML(content)
    districts = None
    for ul in selector.xpath('//ul'):
        ul_ids = ul.xpath('./@id')
        if not ul_ids:
            # A <ul> without an id can be neither the district list nor a
            # school list tied to a district; indexing [0] would raise.
            continue
        ul_id = ul_ids[0].strip()

        if ul_id == "schoolCityQuList":
            districts = []
            for link in ul.xpath('.//a'):
                markup = _nodeToHtml(link)
                districts.append({
                    "name": re.findall('>(.*?)</a>', markup)[0],
                    "id": re.findall("'city_qu_(.*?)'", markup)[0]
                })
            continue

        schools = []
        for link in ul.xpath('.//a'):
            markup = _nodeToHtml(link)
            print(markup)
            school = {}
            try:
                school['name'] = re.findall('>(.*?)</a>', markup)[0]
            except IndexError:
                # No match on a single line (``.`` does not cross newlines):
                # fall back to the text up to the first line break.
                school['name'] = re.findall('>(.*?)\n', markup)[0]
            school['id'] = re.findall('href="(.*?)"', markup)[0]
            schools.append(school)

        district_id = ul_id.replace('city_qu_', '')
        # If no district list has been seen yet there is nothing to attach to.
        for district in districts or []:
            if district.get('id') == district_id:
                district['schoolList'] = schools
    return districts
def getUnicodeStr(s):
    """Decode a run of ``;``-separated decimal character codes into text.

    *s* is split on ``;``.  For each piece the first two characters are
    dropped (e.g. the ``&#`` of ``&#20013``) and the remainder is read as a
    decimal code point.  Pieces that are not a valid code point are skipped.

    Args:
        s: string such as ``"&#20013;&#23398;"``.

    Returns:
        The decoded characters joined into one string (``""`` if none).
    """
    chars = []
    for piece in s.split(";"):
        try:
            chars.append(chr(int(piece[2:])))
        except (ValueError, OverflowError):
            # ValueError: non-numeric piece or code point outside chr()'s
            # range; OverflowError: number too large for chr() altogether.
            continue
    return "".join(chars)