旅遊
爬蟲篇
-
將這部分內容獲取到,主要內容有城市名稱,城市id,城市網頁
-
進一步增加 json 數據量:獲取該目的地下的行程路線與景點的路由。以杭州爲例,其他城市替換對應的城市 id 即可,本文直接進行字符串替換。
城市數據獲取
# Base URL of the site; prepended to relative links found in scraped pages.
host = "http://www.mafengwo.cn"

# HTTP headers sent with every page request made by the scraper functions.
headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    "Host": "www.mafengwo.cn",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
}
def get_city_id_json(city_page):
    """
    Collect the destination cities linked from a listing page.

    Downloads ``city_page``, scans every ``<a>`` tag and keeps the links whose
    href matches ``mafengwo/<digits>.html``.

    :param city_page: URL of the page listing the cities
    :return: list of dicts with the keys ``city_id`` (int), ``city_name``,
        ``url``, ``route`` and ``gonglve``; ``None`` if any step raises
    """
    # Only destination pages carry a numeric id in the href; this filters out
    # the home page and other navigation links.
    regex = r'mafengwo/(\d+).html'
    # Captures (href, link text) for every anchor tag.
    regex_a = r'<a.+?href="(.+?)".*>(.+)</a>'
    try:
        # Pause before each request (presumably to throttle the crawl).
        time.sleep(3)
        res = requests.get(city_page, headers=headers)
        city_id_json = []
        for url, city_name in re.findall(regex_a, res.text):
            ids = re.findall(regex, url)
            if not ids:
                continue
            # The capture group is digits only, so int() is sufficient; the
            # scraped text must never be passed to eval().
            city_id = int(ids[0])
            city_id_json.append(
                {
                    "city_id": city_id,
                    "city_name": city_name,
                    "url": "http://www.mafengwo.cn" + url,
                    "route": 'http://www.mafengwo.cn/mdd/route/%d.html' % city_id,
                    "gonglve": 'http://www.mafengwo.cn/jd/%d/gonglve.html' % city_id,
                }
            )
        return city_id_json
    except Exception as e:
        print(e)
        return None
路線
- 主要內容爲紅色框內內容
如杭州4日經典線路中有4條路線
我們獲取這四條路線
def rote_link(url):
    """
    Fetch a route page and list the link texts found in each overview paragraph.

    :param url: URL of the route page
    :return: list of ``{"rote_id": index, "rote_link": [link texts]}``, one
        entry per ``<p>`` under the ``J_overview`` element; ``None`` when the
        response status is not 200 or an exception is raised
    """
    try:
        time.sleep(3)
        response = requests.get(url=url, headers=headers)
        if response.status_code != 200:
            print("rote_link !=200")
            return None
        tree = etree.HTML(response.text)
        paragraphs = tree.xpath('//*[@class="J_overview"]/p')
        stops_by_paragraph = []
        for index, _ in enumerate(paragraphs):
            # XPath positions are 1-based, hence index + 1.
            query = '//*[@class="J_overview"]/p[%d]/a/text()' % (index + 1)
            stops_by_paragraph.append(
                {
                    "rote_id": index,
                    "rote_link": tree.xpath(query),
                }
            )
        return stops_by_paragraph
    except Exception as e:
        print("rote_link", e)
        return None
景點
- 主要獲取內容爲景點名稱
def gonlve_list(url):
    """
    Fetch a sights page and return the titles of the listed sights.

    :param url: URL of the sights ("gonglve") page
    :return: list of ``title`` attribute values selected from the page, or
        ``None`` when the response status is not 200 or an exception is raised
    """
    try:
        time.sleep(3)
        response = requests.get(url=url, headers=headers)
        if response.status_code != 200:
            print("gonlve_list !=200")
            return None
        tree = etree.HTML(response.text)
        return tree.xpath('//*[@class="item clearfix"]/div/div/h3/a[2]/@title')
    except Exception as e:
        print("gonlve_list", e)
        return None
路線 景點整合
def demo_hz(hz):
    """
    Gather the routes and the sights of one city.

    :param hz: city dict providing the ``route`` and ``gonglve`` page URLs
    :return: ``{"rote": [...], "gonlve": [...]}`` where each ``rote`` entry
        has ``rote_name``, ``rote_url`` and ``rote_link`` (the result of
        ``rote_link`` for that URL); ``None`` when the route page does not
        answer with status 200 or an exception is raised
    """
    try:
        time.sleep(3)
        req = requests.get(url=hz.get("route"), headers=headers)
        if req.status_code != 200:
            print("demo_hz !=200")
            return None

        html = etree.HTML(req.text)
        rote_url_list = html.xpath('//*[@class="lp-detail"]/dt/a/@href')
        rote_name_list = html.xpath('//*[@class="lp-detail"]/dt/a/h2/text()')

        # Detailed data of every route listed on the page.
        city_rote_data = []
        for url, name in zip(rote_url_list, rote_name_list):
            # hrefs are appended to the site host to form the full URL.
            ad_rote_url = host + url
            city_rote_data.append(
                {
                    "rote_name": name,
                    "rote_url": ad_rote_url,
                    "rote_link": rote_link(ad_rote_url),
                }
            )

        # Sights data.
        gonlve_list_data = gonlve_list(url=hz.get("gonglve"))
        return {
            "rote": city_rote_data,
            "gonlve": gonlve_list_data,
        }
    except Exception as e:
        # Report the failure like the other scraper functions do instead of
        # silently discarding it.
        print("demo_hz", e)
        return None
部分數據結果
[
{
"city_id": 10099,
"city_name": "上海",
"url": "http://www.mafengwo.cn/travel-scenic-spot/mafengwo/10099.html",
"route": "http://www.mafengwo.cn/mdd/route/10099.html",
"gonglve": "http://www.mafengwo.cn/jd/10099/gonglve.html",
"ly": {
"rote": [
{
"rote_name": "上海3日經典線路",
"rote_url": "http://www.mafengwo.cn/mdd/cityroute/10099_232.html",
"rote_link": [
{
"rote_id": 0,
"rote_link": [
"上海城隍廟(2小時)",
"豫園(1小時)",
"南京路步行街(3小時)",
"外灘(1小時)"
]
},
{
"rote_id": 1,
"rote_link": [
"中華藝術宮(4小時)",
"田子坊(2小時)",
"上海新天地(2小時)"
]
},
{
"rote_id": 2,
"rote_link": [
"上海杜莎夫人蠟像館(3小時)",
"陸家嘴(2小時)",
"東方明珠廣播電視塔(2小時)"
]
}
]
},
{
"rote_name": "上海4日經典線路",
"rote_url": "http://www.mafengwo.cn/mdd/cityroute/10099_99.html",
"rote_link": [
{
"rote_id": 0,
"rote_link": [
"上海城隍廟(2小時)",
"豫園(1小時)",
"南京路步行街(3小時)",
"外灘(1小時)"
]
},
{
"rote_id": 1,
"rote_link": [
"中華藝術宮(4小時)",
"田子坊(2小時)",
"上海新天地(2小時)"
]
},
{
"rote_id": 2,
"rote_link": [
"上海迪士尼樂園(1天)"
]
},
{
"rote_id": 3,
"rote_link": [
"上海杜莎夫人蠟像館(3小時)",
"陸家嘴(2小時)",
"東方明珠廣播電視塔(2小時)"
]
}
]
},
{
"rote_name": "上海、蘇州、烏鎮、杭州9日線路",
"rote_url": "http://www.mafengwo.cn/mdd/route/10099_289.html",
"rote_link": []
}
],
"gonlve": [
"上海迪士尼樂園",
"外灘",
"田子坊",
"上海野生動物園",
"南京路步行街"
]
}
}
]
地圖處理篇
處理內容
- 利用高德地圖API進行旅遊景點的座標獲取
- 利用高德地圖駕車API對爬蟲獲取的路線進行路線設計
- 儲存到mongodb
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# __file__: lv_data
import requests
import json
import geojson
from geomet import wkt
import pymongo
import geojsonio
# Amap (Gaode) API key, sent as the `key` parameter by do_gd_location and gd_driver.
# NOTE(review): the credential is hard-coded in source; consider loading it
# from configuration or the environment instead.
GD_API_KEY = '93a19a8f482c4d60d98c180425e4967f'
def do_gd_location(address, city):
    """
    Look up the coordinates of an address with the Amap geocoding API.

    Only the text before the first "(" of ``address`` is sent, which drops a
    trailing "(...)" suffix from the name.

    :param address: place name, optionally followed by a "(...)" suffix
    :param city: city passed to the API together with the address
    :return: the ``location`` field of the first geocode split on ",", as a
        list of strings; ``None`` when the response is not valid JSON, the
        status is not "1" or no geocode was returned
    """
    url = "https://restapi.amap.com/v3/geocode/geo"
    # Let requests build the query string so that special characters in the
    # address or city are URL-encoded correctly.
    params = {
        "key": GD_API_KEY,
        "address": address.split("(")[0],
        "city": city,
    }
    response = requests.get(url=url, params=params)
    try:
        # Parse the body as JSON; never eval() text received from the network.
        jd = json.loads(response.text)
    except ValueError as e:
        print("do_gd_location", e)
        return None

    if jd.get("status") != "1":
        return None
    geocodes = jd.get('geocodes') or []
    if not geocodes:
        return None
    # Take the first match by default.
    location = geocodes[0].get('location')
    if not isinstance(location, str) or not location:
        return None
    return location.split(",")
def do_gonlve(gonlve, city):
    """
    Attach coordinates to every sight name.

    :param gonlve: iterable of sight names; ``None`` is treated as empty
    :param city: city passed to ``do_gd_location`` for each lookup
    :return: list of ``{"addr", "location", "wkt"}`` dicts, one per sight;
        sights that cannot be located get ``["0", "0"]`` / ``POINT(0 0)``
    """
    result = []
    for item in gonlve or []:
        location = do_gd_location(item, city)
        if location is None:
            # Placeholder coordinates for sights the geocoder cannot resolve.
            location = ["0", "0"]
        result.append(
            {
                "addr": item,
                "location": location,
                "wkt": "POINT(" + " ".join(location) + ")",
            }
        )
    return result
def gd_driver(start_point, end_point, waypoints, city):
    """
    Request a driving route from the Amap API and return it as WKT.

    :param start_point: name of the starting place
    :param end_point: name of the destination
    :param waypoints: names of the places to pass through
    :param city: city used when geocoding every place name
    :return: a ``LINESTRING(...)`` string built from the ``polyline`` of each
        step of the first returned path; ``None`` when the start or the end
        cannot be geocoded, no waypoint can be geocoded, or the response
        contains no usable route
    """
    url = "https://restapi.amap.com/v3/direction/driving"
    origin = do_gd_location(start_point, city)
    destination = do_gd_location(end_point, city)
    if origin is None or destination is None:
        return None

    waypoints_data = []
    for name in waypoints:
        location = do_gd_location(name, city)
        if location:
            waypoints_data.append(",".join(location))
    if not waypoints_data:
        # NOTE(review): a route without any resolvable waypoint yields no
        # path at all; kept as-is, confirm whether a direct
        # origin-to-destination request is wanted here.
        return None

    params = {
        'key': GD_API_KEY,
        'origin': ",".join(origin),
        'destination': ",".join(destination),
        # Waypoints are joined with ";" (a single one is sent unchanged).
        'waypoints': ";".join(waypoints_data),
    }
    req = requests.get(url=url, params=params)
    try:
        # Parse the body as JSON; never eval() text received from the network.
        dic = json.loads(req.text)
    except ValueError as e:
        print(e)
        return None

    if dic.get("status") != "1":
        return None
    paths = (dic.get("route") or {}).get('paths') or []
    if not paths:
        return None
    steps = paths[0].get('steps')
    if not steps:
        return None

    try:
        point_list = [step.get('polyline') for step in steps]
        # Each polyline is "x,y;x,y;..."; WKT wants "x y,x y,...".
        a = ';'.join(point_list).replace(",", ' ').replace(";", ',')
    except (TypeError, AttributeError) as e:
        # A step without a string polyline makes the route unusable.
        print(e)
        return None
    return 'LINESTRING(' + a + ')'
def rote_link_location(rote_link, city):
    """
    Plan a driving path through the stops of one route.

    The text before the first "(" of every entry is used as the stop name.
    The first stop is the origin, the last one the destination and the ones
    in between are waypoints.

    :param rote_link: list of stop labels; ``None`` is treated as empty
    :param city: city passed on to ``gd_driver``
    :return: ``{"path": [stop names], "wkt": <gd_driver result>}``; an empty
        stop list gives ``{"path": [], "wkt": None}`` without any lookup
    """
    ls = [x.split("(")[0] for x in rote_link or []]
    if not ls:
        # Nothing to route; indexing ls[0] below would raise IndexError.
        return {
            "path": ls,
            "wkt": None,
        }
    path = gd_driver(
        start_point=ls[0],
        end_point=ls[-1],
        waypoints=ls[1:-1],
        city=city,
    )
    return {
        "path": ls,
        "wkt": path,
    }
def do_rote_link(rote_link_list, city):
    """
    Compute a driving path for every sub-route of every route.

    :param rote_link_list: list of route dicts with ``rote_name``,
        ``rote_url`` and ``rote_link`` (a list of ``{"rote_link": [...]}``
        items, or ``None`` / empty when the route has none)
    :param city: city passed on to ``rote_link_location``
    :return: list of ``{"rote_name", "rote_url", "rote_link"}`` dicts where
        ``rote_link`` holds one ``rote_link_location`` result per item
    """
    res = []
    for items in rote_link_list:
        # The stored value may be None when the route page could not be
        # scraped, so fall back to an empty list.
        sub_routes = items.get('rote_link') or []
        res.append(
            {
                "rote_name": items.get('rote_name'),
                "rote_url": items.get('rote_url'),
                "rote_link": [
                    rote_link_location(item.get("rote_link"), city)
                    for item in sub_routes
                ],
            }
        )
    return res
def city_gonglve_data(load_dict):
    """
    Build the geocoded record of one city (main processing function).

    :param load_dict: scraped city dict with ``city_id``, ``city_name``,
        ``url``, ``route``, ``gonglve`` and ``ly`` (``{"rote", "gonlve"}``)
    :return: dict with the original city fields plus ``gonlve`` (the result
        of ``do_gonlve``) and ``rotes`` (the result of ``do_rote_link``)
    """
    city_name = load_dict.get('city_name')
    # Any of these may be None when scraping failed; treat that as "no data".
    ly = load_dict.get('ly') or {}

    # Coordinates of the sights.
    gonlve_data = do_gonlve(ly.get('gonlve') or [], city=city_name)
    # Driving paths of the routes.
    rote_link_list_data = do_rote_link(ly.get('rote') or [], city=city_name)

    one_data = {
        "city_id": load_dict.get('city_id'),
        "city_name": city_name,
        "url": load_dict.get('url'),
        "route": load_dict.get('route'),
        "gonglve": load_dict.get('gonglve'),
        "gonlve": gonlve_data,
        "rotes": rote_link_list_data,
    }
    print("數據生成完成")
    return one_data
def get_mg_client():
    """Return the ``lvyou`` database of a MongoDB client on localhost:27017."""
    return pymongo.MongoClient(host='localhost', port=27017).lvyou
def save_mg_gdshuju(l):
    """
    Store a coordinate record in the ``gdshj`` collection unless it exists.

    :param l: document to store; it is also used as the query that checks
        for an existing match
    :return: None
    """
    collection = get_mg_client().gdshj
    # find_one stops at the first match instead of loading every matching
    # document just to test for existence.
    if collection.find_one(l) is None:
        collection.insert_one(l)
def save_mg_gdllx(l):
    """
    Store an Amap route document in the ``gdlx`` collection unless it exists.

    :param l: document to store; it is also used as the query that checks
        for an existing match
    :return: None
    """
    collection = get_mg_client().gdlx
    # find_one stops at the first match instead of loading every matching
    # document just to test for existence.
    if collection.find_one(l) is None:
        collection.insert_one(l)
def gd_geojson_mg(load_dict):
    """
    Build a GeoJSON FeatureCollection from a geocoded city record.

    Every sight becomes a feature with marker styling properties; every
    sub-route that has a ``wkt`` value becomes a feature named
    ``<rote_name><1-based position>``.

    :param load_dict: dict with ``gonlve`` (list of ``{"addr", "wkt"}``) and
        ``rotes`` (list of ``{"rote_name", "rote_link": [{"path", "wkt"}]}``);
        missing or ``None`` lists are treated as empty
    :return: ``{"type": "FeatureCollection", "features": [...]}``
    """
    feature_list = []

    for spot in load_dict.get('gonlve') or []:
        # Round-trip through JSON (not eval) so the geometry is made of plain
        # JSON types only.
        geom = json.loads(json.dumps(wkt.loads(spot.get("wkt"))))
        atr = {
            "addr": spot.get('addr'),
            "marker-color": "#e0b91d",
            "marker-size": "medium",
            "marker-symbol": "circle",
        }
        feature_list.append(
            dict(type="Feature", geometry=geom, properties=atr)
        )

    for route in load_dict.get('rotes') or []:
        rote_name = route.get("rote_name")
        # enumerate gives each sub-route its own position even when two
        # entries are equal, and avoids a list search per item.
        for position, leg in enumerate(route.get('rote_link') or [], start=1):
            if not leg.get("wkt"):
                continue
            geom = json.loads(json.dumps(wkt.loads(leg.get("wkt"))))
            atr = {
                "rote_name": rote_name + str(position),
                "rote_path": ",".join(leg.get("path")),
            }
            feature_list.append(
                dict(type="Feature", geometry=geom, properties=atr)
            )

    return {"type": "FeatureCollection", "features": feature_list}
def run_mg_ins_gd():
    """
    Process every stored city that has route data and save the results.

    Reads the ``lsa`` collection, keeps the documents whose ``ly`` field is
    not null, and for each one saves the geocoded record and its GeoJSON.
    """
    # Collection holding the scraped routes.
    source = get_mg_client().lsa
    # Query all travel routes: ly != null.
    for record in source.find({'ly': {"$ne": None}}):
        city_record = city_gonglve_data(record)
        city_geojson = gd_geojson_mg(city_record)
        save_mg_gdshuju(city_record)
        save_mg_gdllx(city_geojson)
if __name__ == "__main__":
    # Script entry point: run the MongoDB processing pipeline.
    run_mg_ins_gd()
- 最終成果