[python] 旅遊

旅遊

爬蟲篇

城市數據獲取

# Base URL of the scraped site; relative links found on its pages are
# appended to it to form absolute URLs.
host = "http://www.mafengwo.cn"

# HTTP headers sent with every page request. They present the client as an
# XMLHttpRequest issued by a desktop Chrome browser.
headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    "Host": "www.mafengwo.cn",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
}


def get_city_id_json(city_page):
    """
    Fetch a city index page and collect every destination link on it.

    :param city_page: URL of the page that lists the cities.
    :return: list of dicts such as
        ``{'city_id': 10065, 'city_name': '北京', 'url': ..., 'route': ...,
        'gonglve': ...}``; ``None`` when the request or the parsing fails.
    """
    # Keeps only destination links (".../mafengwo/<digits>.html") and thereby
    # filters out the home page and other navigation anchors.
    city_link_re = re.compile(r'mafengwo/(\d+).html')
    # Captures (href, inner text) for the anchor tags in the page.
    anchor_re = re.compile('<a.+?href="(.+?)".*>(.+)</a>')

    try:
        time.sleep(3)  # pause before each request
        res = requests.get(city_page, headers=headers)

        city_id_json = []
        for url, city_name in anchor_re.findall(res.text):
            match = city_link_re.search(url)
            if not match:
                continue
            # int() instead of eval(): the digits come from a remote page, and
            # eval() also rejects ids written with leading zeros.
            city_id = int(match.group(1))
            city_id_json.append(
                {
                    "city_id": city_id,
                    "city_name": city_name,
                    "url": "http://www.mafengwo.cn" + url,
                    "route": 'http://www.mafengwo.cn/mdd/route/%d.html' % city_id,
                    "gonglve": 'http://www.mafengwo.cn/jd/%d/gonglve.html' % city_id,
                }
            )
        return city_id_json
    except Exception as e:
        # Best-effort scraper: report the problem and let the caller see None.
        print(e)
        return None

路線

  • 主要抓取的內容爲下圖紅色框內的部分

1555054615501

例如「杭州4日經典線路」中包含4條路線(見圖 1555054674108),
我們要獲取的就是這四條路線。

def rote_link(url):
    """
    Fetch one route page and list the stop names of each of its segments.

    :param url: absolute URL of the route page.
    :return: list of ``{"rote_id": index, "rote_link": [stop text, ...]}``,
        one entry per ``<p>`` directly under the ``J_overview`` element, in
        page order; ``None`` on a non-200 response or on any error.
    """
    try:
        time.sleep(3)  # pause before each request
        req = requests.get(url=url, headers=headers)
        if req.status_code != 200:
            print("rote_link  !=200")
            return None

        html = etree.HTML(req.text)
        paragraphs = html.xpath('//*[@class="J_overview"]/p')
        # Query each paragraph relative to itself. Re-running an indexed
        # absolute XPath per paragraph rescans the whole document and mixes
        # results if more than one J_overview element is present.
        return [
            {
                "rote_id": index,
                "rote_link": paragraph.xpath('./a/text()'),
            }
            for index, paragraph in enumerate(paragraphs)
        ]
    except Exception as e:
        # Best-effort scraper: report the problem and let the caller see None.
        print("rote_link", e)
        return None

景點

  • 主要獲取內容爲景點名稱

1555054801455

def gonlve_list(url):
    """
    Fetch an attractions page and return the attraction names found on it.

    :param url: URL of the attractions ("gonglve") page.
    :return: list of ``title`` attribute values selected from the page;
        ``None`` on a non-200 response or on any error.
    """
    try:
        time.sleep(3)  # pause before each request
        response = requests.get(url=url, headers=headers)
        if response.status_code != 200:
            print("gonlve_list  !=200")
            return None

        tree = etree.HTML(response.text)
        return tree.xpath('//*[@class="item clearfix"]/div/div/h3/a[2]/@title')
    except Exception as err:
        print("gonlve_list", err)
        return None

路線 景點整合

def demo_hz(hz):
    """
    Collect the routes and the attraction list for one city.

    :param hz: city record providing the ``"route"`` and ``"gonglve"`` URLs.
    :return: ``{"rote": [route dict, ...], "gonlve": attraction list}`` where
        each route dict holds ``rote_name``, ``rote_url`` and ``rote_link``;
        ``None`` on a non-200 response or on any error.
    """
    try:
        time.sleep(3)  # pause before each request
        req = requests.get(url=hz.get("route"), headers=headers)
        if req.status_code != 200:
            print("demo_hz  !=200")
            return None

        html = etree.HTML(req.text)
        rote_url_list = html.xpath('//*[@class="lp-detail"]/dt/a/@href')
        rote_name_list = html.xpath('//*[@class="lp-detail"]/dt/a/h2/text()')

        # Detailed data for every route listed on the page.
        city_rote_data = []
        for url, name in zip(rote_url_list, rote_name_list):
            ad_rote_url = host + url
            city_rote_data.append(
                {
                    "rote_name": name,
                    "rote_url": ad_rote_url,
                    "rote_link": rote_link(ad_rote_url),
                }
            )

        # Attraction names for the city.
        gonlve_list_data = gonlve_list(url=hz.get("gonglve"))

        return {
            "rote": city_rote_data,
            "gonlve": gonlve_list_data,
        }
    except Exception as e:
        # Report the failure like the sibling scrapers do instead of
        # discarding it silently.
        print("demo_hz", e)
        return None

部分數據結果

[
    {
    "city_id": 10099,
    "city_name": "上海",
    "url": "http://www.mafengwo.cn/travel-scenic-spot/mafengwo/10099.html",
    "route": "http://www.mafengwo.cn/mdd/route/10099.html",
    "gonglve": "http://www.mafengwo.cn/jd/10099/gonglve.html",
    "ly": {
      "rote": [
        {
          "rote_name": "上海3日經典線路",
          "rote_url": "http://www.mafengwo.cn/mdd/cityroute/10099_232.html",
          "rote_link": [
            {
              "rote_id": 0,
              "rote_link": [
                "上海城隍廟(2小時)",
                "豫園(1小時)",
                "南京路步行街(3小時)",
                "外灘(1小時)"
              ]
            },
            {
              "rote_id": 1,
              "rote_link": [
                "中華藝術宮(4小時)",
                "田子坊(2小時)",
                "上海新天地(2小時)"
              ]
            },
            {
              "rote_id": 2,
              "rote_link": [
                "上海杜莎夫人蠟像館(3小時)",
                "陸家嘴(2小時)",
                "東方明珠廣播電視塔(2小時)"
              ]
            }
          ]
        },
        {
          "rote_name": "上海4日經典線路",
          "rote_url": "http://www.mafengwo.cn/mdd/cityroute/10099_99.html",
          "rote_link": [
            {
              "rote_id": 0,
              "rote_link": [
                "上海城隍廟(2小時)",
                "豫園(1小時)",
                "南京路步行街(3小時)",
                "外灘(1小時)"
              ]
            },
            {
              "rote_id": 1,
              "rote_link": [
                "中華藝術宮(4小時)",
                "田子坊(2小時)",
                "上海新天地(2小時)"
              ]
            },
            {
              "rote_id": 2,
              "rote_link": [
                "上海迪士尼樂園(1天)"
              ]
            },
            {
              "rote_id": 3,
              "rote_link": [
                "上海杜莎夫人蠟像館(3小時)",
                "陸家嘴(2小時)",
                "東方明珠廣播電視塔(2小時)"
              ]
            }
          ]
        },
        {
          "rote_name": "上海、蘇州、烏鎮、杭州9日線路",
          "rote_url": "http://www.mafengwo.cn/mdd/route/10099_289.html",
          "rote_link": []
        }
      ],
      "gonlve": [
        "上海迪士尼樂園",
        "外灘",
        "田子坊",
        "上海野生動物園",
        "南京路步行街"
      ]
    }
  }
]

地圖處理篇

處理內容

  1. 利用高德地圖 API 獲取旅遊景點的座標
  2. 利用高德地圖駕車路線規劃 API,爲爬蟲獲取的路線生成行車路徑
  3. 將結果儲存到 MongoDB
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# __file__: lv_data

import json
import os

import geojson
import geojsonio
import pymongo
import requests
from geomet import wkt

# AMap (Gaode) web-service API key. The GD_API_KEY environment variable takes
# precedence when it is set and non-empty; the literal is kept only as a
# backward-compatible fallback.
# NOTE(review): a credential committed to source should be rotated and removed.
GD_API_KEY = os.environ.get('GD_API_KEY') or '93a19a8f482c4d60d98c180425e4967f'


def do_gd_location(address, city):
    """
    Geocode a place name with the AMap (Gaode) geocoding web service.

    :param address: place name; everything from the first "(" onwards (for
        example a visit duration suffix) is dropped before the lookup.
    :param city: city passed to the service together with the address.
    :return: the first result's ``location`` split on "," into a list of two
        strings, or ``None`` when the service reports failure or has no
        usable result.
    """
    url = "https://restapi.amap.com/v3/geocode/geo"
    # Let requests build the query string so that non-ASCII text and reserved
    # characters in the address are URL-encoded correctly.
    params = {
        "key": GD_API_KEY,
        "address": address.split("(")[0],
        "city": city,
    }
    # Parse the body as JSON rather than eval(): eval() executes whatever the
    # remote side returns and fails on the JSON literals null/true/false.
    jd = requests.get(url=url, params=params).json()

    if jd.get("status") != "1":
        return None

    geocodes = jd.get('geocodes') or []
    if not geocodes:
        return None

    # Only the first candidate is used.
    location = geocodes[0].get('location')
    if not location:
        return None
    return location.split(",")


def do_gonlve(gonlve, city):
    """
    Attach coordinates and a WKT point to every attraction name.

    :param gonlve: iterable of attraction names.
    :param city: city passed to the geocoder for each lookup.
    :return: list of ``{"addr", "location", "wkt"}`` dicts in input order;
        an attraction that cannot be geocoded gets the placeholder
        ``["0", "0"]`` / ``"POINT(0 0)"``.
    """

    def _as_record(name):
        coords = do_gd_location(name, city)
        if coords is None:
            coords = ["0", "0"]
        return {
            "addr": name,
            "location": coords,
            "wkt": "POINT(" + " ".join(coords) + ")",
        }

    return [_as_record(name) for name in gonlve]


def gd_driver(start_point, end_point, waypoints, city):
    """
    Plan a driving route with the AMap (Gaode) driving-direction service.

    :param start_point: name of the starting place.
    :param end_point: name of the destination.
    :param waypoints: names of the intermediate stops; stops that cannot be
        geocoded are skipped.
    :param city: city used when geocoding every place name.
    :return: WKT ``LINESTRING(...)`` built from the polylines of the first
        returned path, or ``None`` when an endpoint cannot be geocoded, no
        waypoint could be geocoded, or the service returns no usable path.
    """
    url = "https://restapi.amap.com/v3/direction/driving"
    origin = do_gd_location(start_point, city)
    destination = do_gd_location(end_point, city)
    if origin is None or destination is None:
        return None

    waypoints_data = []
    for name in waypoints:
        location = do_gd_location(name, city)
        if location:
            waypoints_data.append(",".join(location))

    if not waypoints_data:
        # NOTE(review): routes without any geocoded waypoint are skipped, as
        # before; confirm whether direct start-to-end routes should be planned.
        return None

    params = {
        'key': GD_API_KEY,
        'origin': ",".join(origin),
        'destination': ",".join(destination),
        # ";".join covers the single-waypoint case as well.
        'waypoints': ";".join(waypoints_data),
    }

    req = requests.get(url=url, params=params)
    # Parse the body as JSON rather than eval(): eval() executes whatever the
    # remote side returns and fails on the JSON literals null/true/false.
    dic = req.json()
    if dic.get("status") != "1":
        return None

    paths = (dic.get("route") or {}).get('paths') or []
    if not paths:
        return None
    steps = paths[0].get('steps')
    if not steps:
        return None

    try:
        point_list = [step.get('polyline') for step in steps]
        # Polylines are "x,y;x,y;..."; WKT wants "x y,x y,...".
        a = ';'.join(point_list).replace(",", ' ').replace(";", ',')
    except (AttributeError, TypeError) as e:
        # A step that is malformed or carries no polyline string.
        print(e)
        return None
    return 'LINESTRING(' + a + ')'


def rote_link_location(rote_link, city):
    """
    Plan the driving path through the stops of one route segment.

    :param rote_link: ordered stop names; anything from the first "(" onwards
        in a name is dropped. The first stop is the origin, the last one the
        destination, and the ones in between are the waypoints.
    :param city: city used when geocoding the stops.
    :return: ``{"path": [clean stop names], "wkt": LINESTRING string or None}``.
        A missing or empty stop list yields ``{"path": [], "wkt": None}``
        without calling the routing service.
    """
    if not rote_link:
        # Nothing to route; indexing the first/last stop would raise IndexError.
        return {
            "path": [],
            "wkt": None,
        }

    ls = [x.split("(")[0] for x in rote_link]
    path = gd_driver(
        start_point=ls[0],
        end_point=ls[-1],
        waypoints=ls[1:-1],
        city=city
    )

    return {
        "path": ls,
        "wkt": path,
    }


def do_rote_link(rote_link_list, city):
    """
    Compute the driving path for every segment of every route.

    :param rote_link_list: route dicts with ``rote_name``, ``rote_url`` and
        ``rote_link`` (a list of ``{"rote_link": [stop, ...]}`` segments, or
        ``None`` when the segments could not be scraped).
    :param city: city used when geocoding the stops.
    :return: list of ``{"rote_name", "rote_url", "rote_link"}`` dicts where
        ``rote_link`` holds one ``rote_link_location`` result per segment.
    """
    res = []
    for items in rote_link_list:
        # A failed scrape leaves rote_link as None; treat it as "no segments"
        # instead of raising TypeError while iterating.
        segments = items.get('rote_link') or []
        res.append(
            {
                "rote_name": items.get('rote_name'),
                "rote_url": items.get('rote_url'),
                "rote_link": [
                    rote_link_location(segment.get("rote_link"), city)
                    for segment in segments
                ],
            }
        )
    return res


def city_gonglve_data(load_dict):
    """
    Build the geocoded record for one scraped city.

    :param load_dict: scraped city record with ``city_id``, ``city_name``,
        ``url``, ``route``, ``gonglve`` and ``ly`` (``{"rote": ..., "gonlve": ...}``).
    :return: dict with the keys ``city_id``, ``city_name``, ``url``, ``route``,
        ``gonglve``, ``gonlve`` (geocoded attractions) and ``rotes`` (routes
        with their planned paths).
    """
    city_name = load_dict.get('city_name')
    # Either part of "ly" can be None when its scrape failed; fall back to an
    # empty list so the helpers are not handed None to iterate.
    ly = load_dict.get('ly') or {}

    # Attraction coordinates.
    gonlve_data = do_gonlve(ly.get('gonlve') or [], city=city_name)
    # Route paths.
    rote_link_list_data = do_rote_link(ly.get('rote') or [], city=city_name)

    one_data = {
        "city_id": load_dict.get('city_id'),
        "city_name": city_name,
        "url": load_dict.get('url'),
        "route": load_dict.get('route'),
        "gonglve": load_dict.get('gonglve'),
        "gonlve": gonlve_data,
        "rotes": rote_link_list_data,
    }

    print("數據生成完成")
    return one_data


def get_mg_client():
    """
    Open a connection to the MongoDB server on localhost:27017.

    :return: handle to the ``lvyou`` database.
    """
    mongo = pymongo.MongoClient(host='localhost', port=27017)
    return mongo["lvyou"]


def save_mg_gdshuju(l):
    """
    Store a geocoded city record in the ``gdshj`` collection.

    The record is inserted only when no document matching it exists yet.

    :param l: document to store.
    :return: None
    """
    collection = get_mg_client().gdshj
    # find_one() stops at the first match; there is no need to load every
    # matching document just to learn whether one exists.
    if collection.find_one(l) is None:
        collection.insert_one(l)


def save_mg_gdllx(l):
    """
    Store an AMap route document in the ``gdlx`` collection.

    The document is inserted only when no document matching it exists yet.

    :param l: document to store.
    :return: None
    """
    collection = get_mg_client().gdlx
    # find_one() stops at the first match; there is no need to load every
    # matching document just to learn whether one exists.
    if collection.find_one(l) is None:
        collection.insert_one(l)


def gd_geojson_mg(load_dict):
    """
    Build a GeoJSON FeatureCollection from a geocoded city record.

    :param load_dict: record with ``gonlve`` (attractions carrying ``addr``
        and ``wkt``) and ``rotes`` (routes whose ``rote_link`` segments carry
        ``path`` and ``wkt``).
    :return: ``{"type": "FeatureCollection", "features": [...]}`` with one
        point feature per attraction followed by one feature per route
        segment that has a WKT path.
    """
    gonlve = load_dict.get('gonlve') or []
    rotes = load_dict.get('rotes') or []

    feature_list = []
    for spot in gonlve:
        # JSON round-trip yields plain dicts/lists, as before, without eval().
        geom = json.loads(json.dumps(wkt.loads(spot.get("wkt"))))
        atr = {
            "addr": spot.get('addr'),
            "marker-color": "#e0b91d",
            "marker-size": "medium",
            "marker-symbol": "circle",
        }
        feature_list.append(
            dict(type="Feature", geometry=geom, properties=atr)
        )

    for route in rotes:
        rote_name = route.get("rote_name")
        # enumerate() numbers segments by position; looking the segment up
        # with list.index() is quadratic and gives equal segments the same
        # number.
        for number, segment in enumerate(route.get('rote_link') or [], start=1):
            if not segment.get("wkt"):
                continue  # no path could be planned for this segment
            geom = json.loads(json.dumps(wkt.loads(segment.get("wkt"))))
            atr = {
                "rote_name": rote_name + str(number),
                "rote_path": ",".join(segment.get("path")),
            }
            feature_list.append(
                dict(type="Feature", geometry=geom, properties=atr)
            )

    data = {"type": "FeatureCollection", "features": feature_list}

    return data


def run_mg_ins_gd():
    """
    Process every scraped city stored in MongoDB and save the results.

    Reads the documents of the ``lsa`` collection whose ``ly`` field is not
    null, builds the geocoded record and its GeoJSON for each one, and stores
    both.
    """
    scraped_cities = get_mg_client().lsa
    # Only cities that actually have route data ("ly" is not null).
    for record in scraped_cities.find({'ly': {"$ne": None}}):
        geocoded = city_gonglve_data(record)
        feature_collection = gd_geojson_mg(geocoded)

        save_mg_gdshuju(geocoded)
        save_mg_gdllx(feature_collection)


if __name__ == '__main__':
    # Script entry point: process all stored cities.
    run_mg_ins_gd()
  • 最終成果

1555055180366

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章