本文僅供學習參考
1.採用先存入文件、再轉存MySQL數據庫的形式存儲內容
2.爬取速度不宜過快,太快容易失敗
代碼如下:
先爬取並將其全部存儲至文件中
import json, time
import random
import requests
# The four municipalities plus the two SARs: these appear as top-level
# "cities" in the nationwide ('全國') query and are handled by six_city().
six_cities_list = ['北京市', '上海市', '重慶市', '天津市', '香港特別行政區', '澳門特別行政區']
# All remaining provinces / autonomous regions; each is queried separately
# by else_city() to get its per-city counts.
province_list = ['河北省', '山西省', '遼寧省', '吉林省', '黑龍江省', '江蘇省', '浙江省', '安徽省', '福建省', '江西省',
'山東省', '河南省','湖北省', '湖南省', '廣東省', '海南省', '四川省', '貴州省', '雲南省', '陝西省', '甘肅省',
'青海省', '臺灣省', '內蒙古自治區', '廣西壯族自治區', '西藏自治區', '寧夏回族自治區', '新疆維吾爾自治區']
def getjson(loc, page_num=0, _retries=3):
    """Query the Baidu Place API for parks ('公園') in the given region.

    Args:
        loc: Region name (province or city) to search in.
        page_num: Zero-based result page (20 results per page).
        _retries: Remaining retry attempts after a network/decode error.

    Returns:
        The decoded JSON response as a dict, or None once retries run out.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows; U; Windows NT 10.0; en-US; rv:1.9.1.6) Gecko Chrome/63.0.3239.132'
    }
    pa = {
        'query': '公園',
        # 'tag': '',
        'region': loc,
        'scope': '2',
        'page_size': 20,
        'page_num': page_num,
        'output': 'json',
        'ak': '自己的API'
    }
    try:
        # Let requests build the query string. The original hand-built URL
        # contained the mojibake '®ion' where '&region' was intended, and
        # duplicated every field already passed via params=.
        r = requests.get('http://api.map.baidu.com/place/v2/search',
                         params=pa, headers=headers, timeout=10)
        return r.json()
    except Exception as e:
        print('over-requests! Error:', e)
        if _retries > 0:
            # Retry the SAME page and propagate the result (the original
            # retried page 0 and discarded the recursive return value).
            return getjson(loc, page_num, _retries - 1)
        return None
def six_city():
    """Fetch nationwide ('全國') park counts and record the municipalities
    and SARs listed in six_cities_list into cities.txt.

    Appends one 'city<TAB>count' line per matching region.
    """
    decodejson = getjson('全國')
    if not decodejson:
        # getjson may return None after exhausting its retries.
        print('over_cities! Error: empty response')
        return
    # Open the file once instead of re-opening it for every record; the
    # `with` block closes it, so no explicit f.close() is needed.
    with open('cities.txt', 'a+', encoding='UTF-8') as f:
        for eachprovince in decodejson.get('results', []):
            try:
                city = eachprovince['name']
                num = eachprovince['num']
            except KeyError as e:
                print('over_cities! Error:', e)
                continue
            if city in six_cities_list:
                f.write('\t'.join([city, str(num)]) + '\n')
def else_city():
    """Query each province in province_list and append its per-city park
    counts to cities.txt as 'city<TAB>count個' lines.

    Sleeps a random sub-second interval between provinces because fast
    crawling tends to make the API requests fail.
    """
    for eachprovince in province_list:
        decodejson = getjson(eachprovince)
        try:
            # One open per province instead of one per record; `with`
            # closes the handle, so the original f.close() was redundant.
            with open('cities.txt', 'a+', encoding='UTF-8') as f:
                for eachcity in decodejson['results']:
                    try:
                        city = eachcity['name']
                        num = eachcity['num']
                    except KeyError:
                        # Entries without name/num are skipped, as before.
                        continue
                    # NOTE(review): six_city() writes a plain '\n' while this
                    # appends '個\n'; kept as-is since Word() in the second
                    # script only reads the tab-separated city field.
                    f.write('\t'.join([city, str(num)]) + '個\n')
        except Exception as e:
            # Covers decodejson being None or missing 'results'.
            print('over-eachprovince! Error:', e)
        finally:
            time.sleep(random.random())
if __name__ == '__main__':
    # Crawl the nationwide distribution of "公園" (parks) into cities.txt:
    # municipalities/SARs first, then every province in turn.
    print('正在爬取全國各地"公園"分佈數目並存入cities.txt.')
    six_city()
    else_city()
利用上一步得到的全國各大城市列表,逐城查詢並將結果存入MySQL數據庫
import json, time
import random
import pymysql
import requests
# Cities read back from cities.txt; filled by Word(), consumed by Insert_mysql().
city_list = list()
# MySQL connection settings for the baidu_map schema.
# NOTE(review): credentials are hardcoded here; move them to environment
# variables or a config file before sharing/deploying this script.
config = {
'host':'localhost',
'port':3306,
'user':'root',
'password':'113754',
'db':'baidu_map',
'charset':'utf8mb4',
# DictCursor returns rows as dicts keyed by column name.
'cursorclass':pymysql.cursors.DictCursor,
}
# Module-level connection and cursor, shared by Insert_mysql() and closed
# in the __main__ block at the bottom of the script.
conn = pymysql.connect(**config)
cur = conn.cursor()
def Word():
    """Load city names from cities.txt into the module-level city_list.

    Each non-blank line is 'city<TAB>count...'; only the city field (the
    text before the first TAB) is kept.
    """
    with open('cities.txt', 'r', encoding='UTF-8') as txt_file:
        for eachLine in txt_file:
            # Skip blank/whitespace-only lines. (The original also tested
            # `!= ''`, which never matches a line yielded by iteration,
            # and called txt_file.close() inside the `with` — both removed.)
            if eachLine.strip():
                city_list.append(eachLine.split('\t')[0])
def getjson(loc, page_num=0, _retries=3):
    """Query the Baidu Place API for 'CoCo' shops (tag 美食) in a city.

    Args:
        loc: City name to search in.
        page_num: Zero-based result page (20 results per page).
        _retries: Remaining retry attempts after a network/decode error.

    Returns:
        The decoded JSON response as a dict, or None once retries run out.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows; U; Windows NT 10.0; en-US; rv:1.9.1.6) Gecko Chrome/63.0.3239.132'
    }
    pa = {
        'query': 'CoCo',
        'tag': '美食',
        'region': loc,
        'scope': '2',
        'page_size': 20,
        'page_num': page_num,
        'output': 'json',
        'ak': 'yX91zbGwxNxaGWMwo3LPx3MWovVCScHj'
    }
    try:
        # Let requests build the query string. The original hand-built URL
        # contained the mojibake '®ion' where '&region' was intended, and
        # duplicated every field already passed via params=.
        r = requests.get('http://api.map.baidu.com/place/v2/search',
                         params=pa, headers=headers, timeout=10)
        return r.json()
    except Exception as e:
        print('over-requests! Error:', e)
        if _retries > 0:
            # Retry the SAME page and propagate the result (the original
            # retried page 0 and never returned the recursive call).
            return getjson(loc, page_num, _retries - 1)
        return None
def Insert_mysql():
    """Page through Baidu Place results for every city in city_list and
    insert each POI into the baidu_map.city table.

    Pagination for a city stops at the first empty or missing 'results'
    page, or on any error (e.g. getjson() returning None after retries).
    """
    sql = '''INSERT INTO baidu_map.city
        (city, park, location_lat, location_lng, address, street_id, uid)
        VALUES (%s, %s, %s, %s, %s, %s, %s)'''
    for eachcity in city_list:
        page_num = 0
        while True:
            decodejson = getjson(eachcity, page_num)
            time.sleep(random.random())  # throttle between requests
            print(eachcity, page_num)
            try:
                results = decodejson['results']
                if not results:
                    # Past the last page. The original had no terminating
                    # branch for an empty result list and could loop forever.
                    break
                for eachone in results:
                    # .get() yields None for missing fields, replacing six
                    # near-identical try/except blocks in the original.
                    location = eachone.get('location') or {}
                    cur.execute(sql, (
                        eachcity,
                        eachone.get('name'),
                        location.get('lat'),
                        location.get('lng'),
                        eachone.get('address'),
                        eachone.get('street_id'),
                        eachone.get('uid'),
                    ))
                    conn.commit()
                page_num += 1
            except Exception as e:
                # Covers decodejson being None / missing 'results' and any
                # database error; move on to the next city.
                print('Error:', e)
                break
if __name__ == '__main__':
    # Load the city list produced by the first script, crawl and persist
    # the per-city results, then release the database handles.
    Word()
    Insert_mysql()
    cur.close()
    conn.close()
運行截圖如下: