[Archived] Crawling Mafengwo Attraction Data (with Source Code)

  • Crawl popular destination info: MafengwoCrawler()._get_mdd()
  • Crawl the attraction listings within a destination: MafengwoCrawler().crawler_mdd()
  • Crawl attraction detail pages: MafengwoCrawler().crawler_detail()

The source file and the database schema file are available in my resources (which also include a database dump of roughly 100,000 domestic attractions). A minimal setup sketch follows below.
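The schema file itself is not reproduced in this post, so the following is only a sketch under stated assumptions: the table and column names are taken from the INSERT statements in crawler.py, while the column types, keys and charset are my guesses and may differ from the actual schema file in the resource download.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Schema setup sketch (assumption): column names come from the INSERT statements
# in crawler.py below; the column types are guesses, not the author's schema file.
import pymysql

SCHEMA = [
    """CREATE TABLE IF NOT EXISTS poi (
        poi_id BIGINT PRIMARY KEY,
        name VARCHAR(255),
        image VARCHAR(512),
        link VARCHAR(512),
        lat DECIMAL(10, 6) NULL,
        lng DECIMAL(10, 6) NULL,
        type INT NULL,
        is_cnmain TINYINT,
        country_mddid INT NULL
    ) DEFAULT CHARSET=utf8mb4""",
    """CREATE TABLE IF NOT EXISTS poi_detail (
        poi_id BIGINT PRIMARY KEY,
        name VARCHAR(255),
        mdd VARCHAR(255),
        enName VARCHAR(255),
        commentCount VARCHAR(32),
        description TEXT,
        tel VARCHAR(255),
        site VARCHAR(255),
        time VARCHAR(255),
        traffic TEXT,
        ticket TEXT,
        openingTime TEXT,
        location VARCHAR(512)
    ) DEFAULT CHARSET=utf8mb4""",
]

if __name__ == '__main__':
    # Adjust these credentials to match DB_HOST / DB_USER / DB_PASSWORD / DB_NAME in crawler.py.
    db = pymysql.connect(host='localhost', user='root',
                         password='your-password', database='mafengwo',
                         charset='utf8mb4')
    with db.cursor() as cursor:
        for statement in SCHEMA:
            cursor.execute(statement)
    db.commit()
    db.close()

With the tables in place, MafengwoCrawler().crawler_mdd() populates poi, and MafengwoCrawler().crawler_detail() then fills poi_detail.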

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time         : 2019/04/24
# @Author       : AIsland
# @Email        : [email protected]
# @File         : crawler.py
# @Description  : Crawl attraction data for Mafengwo destinations across China's provinces and cities

import requests
import re
import time
import json
import hashlib
import logging
import threading
import pymysql
from bs4 import BeautifulSoup


class MafengwoCrawler:
    # URL for listing destinations
    # a destination contains attractions (POIs)
    URL_MDD = 'http://www.mafengwo.cn/mdd/'
    # URL for listing attractions
    # returns each attraction's detail-page link, image and name
    URL_ROUTE = 'http://www.mafengwo.cn/ajax/router.php'
    # URL for querying an attraction's coordinates
    # longitude: lng
    # latitude:  lat
    URL_POI = 'http://pagelet.mafengwo.cn/poi/pagelet/poiLocationApi'

    # Common request headers
    HEADERS = {
        'Referer': 'http://www.mafengwo.cn/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    }

    # MySQL connection settings
    DB_HOST = 'localhost'
    DB_USER = 'root'
    DB_PASSWORD = '[email protected]'
    DB_NAME = 'mafengwo'

    # Salt string used to sign request data, fetched by _get_md5_encrypted_string()
    encrypted_string = ''

    # Pages that can be skipped, i.e. pages already crawled successfully
    success_pages = []

    def __init__(self, log_file=None):
        # logging usage notes: https://www.cnblogs.com/nancyzhu/p/8551506.html
        logging.basicConfig(level=logging.DEBUG,
                            filename='mafengwo.'+str(int(time.time()))+'.log',
                            format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'
                            )
        # initialize the HTTP session
        self.REQ = requests.session()
        # set the common headers
        self.REQ.headers.update(self.HEADERS)

        # fetch the salt string needed to sign request data
        self._get_md5_encrypted_string()

        # if log files are passed in, skip the pages they record as already crawled
        if log_file is not None:
            self.success_pages = self._read_log_file_get_success_page(log_file)
            print('Pages already crawled successfully: ' + str(len(self.success_pages)))
            print('Resuming in 5 seconds')
            time.sleep(5)

    def crawler_mdd(self, mdd_id=21536):
        '''
        Crawl the attraction data for a single destination
        Default: 21536, China
        '''
        # mdd_id = 12522  # Gulangyu, 16 pages, handy test data

        # start timing
        start = int(time.time())

        # fetch the total page count first
        res = self._get_route(mdd_id)
        page_total = res['pagecount']
        # work out how many pages each thread should crawl
        page_range = round(page_total/20)
        if page_range == 0:
            page_range = 1

        logging.info(str(page_total)+' pages in total, '+str(page_range)+' pages per thread')
        print(str(page_total)+' pages in total, '+str(page_range)+' pages per thread')

        # spawn worker threads
        thread = []
        for i in range(1, page_total+1, page_range):
            page_start = i
            page_end = i + page_range
            if page_end > page_total + 1:
                page_end = page_total + 1

            t = threading.Thread(target=self.crawler,
                                 args=(mdd_id, page_start, page_end))
            thread.append(t)

        for i in range(0, len(thread)):
            thread[i].start()

        for i in range(0, len(thread)):
            thread[i].join()

        end = int(time.time())

        logging.info('Total time: '+str(end-start)+' seconds')
        print('Total time: '+str(end-start)+' seconds')

    def crawler(self, mdd_id, start_page, end_page):
        '''
        The actual crawler worker: fetches a range of pages and stores them
        '''
        # connect to the database
        db = pymysql.connect(
            host=self.DB_HOST,
            user=self.DB_USER,
            password=self.DB_PASSWORD,
            database=self.DB_NAME)
        for page in range(start_page, end_page):
            if page in self.success_pages:
                print('Skipping page ' + str(page))
                continue
            page_pass = False
            page_retry = 0
            while not page_pass and page_retry < 11:
                result = None
                try:
                    print('Crawling page ' + str(page))
                    result = self._get_route(mdd_id, page=page)['list']
                    # store into the database
                    sql = "INSERT IGNORE INTO poi(poi_id, name, image, link, lat, lng, type, is_cnmain, country_mddid) \
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s);"
                    params = []
                    for item in result:
                        params.append((
                            item['poi_id'],
                            item['name'],
                            item['image'],
                            item['link'],
                            item['lat'],
                            item['lng'],
                            item['type'],
                            item['is_cnmain'],
                            item['country_mddid']
                        ))
                    try:
                        cursor = db.cursor()
                        cursor.executemany(sql, params)
                        db.commit()
                        # success
                        logging.info('page success: ' + str(page))
                        print('page success: ' + str(page))
                        page_pass = True
                    except Exception as e:
                        page_retry += 1
                        logging.error(e)
                        # roll back on error
                        db.rollback()
                except Exception as e:
                    page_retry += 1
                    logging.error(e)
                    logging.error(result)
        # close the database connection
        db.close()

    def crawler_detail(self):
        '''
        Crawl attraction details into the database
        The poi table must already be populated before calling this method

        Spawns crawler_detail_worker threads
        '''
        # count the rows in the poi table
        db = pymysql.connect(
            host=self.DB_HOST,
            user=self.DB_USER,
            password=self.DB_PASSWORD,
            database=self.DB_NAME)
        sql = 'SELECT COUNT(*) as total from poi;'
        cursor = db.cursor()
        cursor.execute(sql)
        result = cursor.fetchall()
        # total number of rows
        total = result[0][0]
        db.close()

        # start timing
        start = int(time.time())
        # work out how many rows each thread should crawl
        range_count = round(total/20)
        if range_count == 0:
            range_count = 1
        # log the plan
        logging.info(str(total)+' rows in total, '+str(range_count)+' rows per thread')
        print(str(total)+' rows in total, '+str(range_count)+' rows per thread')
        # spawn worker threads
        thread = []
        for i in range(0, total, range_count):
            # i, range_count: SQL query offset and row count
            t = threading.Thread(target=self.crawler_detail_worker,
                                 args=(i, range_count))
            thread.append(t)

        for i in range(0, len(thread)):
            thread[i].start()

        for i in range(0, len(thread)):
            thread[i].join()

        end = int(time.time())

        logging.info('Total time: '+str(end-start)+' seconds')
        print('Total time: '+str(end-start)+' seconds')
        return

    def crawler_detail_worker(self, offset, limit):
        '''Worker thread'''
        db = pymysql.connect(
            host=self.DB_HOST,
            user=self.DB_USER,
            password=self.DB_PASSWORD,
            database=self.DB_NAME)
        sql = 'SELECT poi_id, name, link FROM poi ORDER BY poi_id LIMIT ' + \
            str(offset) + ', ' + str(limit) + ';'
        cursor = db.cursor()
        cursor.execute(sql)
        # result set
        result = cursor.fetchall()
        detail_list = []
        c_count = 0
        save_count = 100  # flush to the database every N rows, default 100
        for item in result:
            poi_id = item[0]
            name = item[1]
            link = item[2]
            # check whether this attraction already has a detail row
            sql_select = 'SELECT poi_id FROM poi_detail WHERE poi_id=' + \
                str(poi_id) + ';'
            cursor.execute(sql_select)
            result_select = cursor.fetchall()
            # skip rows that have already been crawled
            if len(result_select) != 0 and len(detail_list) != c_count:
                continue

            # otherwise crawl the detail page
            poi_detail = self._get_poi_detail(link)
            # buffer the crawled data
            poi_detail['name'] = name
            poi_detail['poi_id'] = poi_id
            detail_list.append(poi_detail)
            logging.info('Detail crawled: ' + str(poi_id) + ' ' + name)
            print('Detail crawled: ' + str(poi_id) + ' ' + name)
            c_count += 1
            # throttle requests so we are not blocked for sending them too fast
            time.sleep(0.3)
            # flush the buffer to the database once it is large enough
            if len(detail_list) >= save_count or len(detail_list) == c_count:
                sql = "INSERT IGNORE INTO poi_detail(poi_id, name, mdd, enName, commentCount, description, tel, site, time, traffic, ticket, openingTime, location) \
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"
                params = []
                for det in detail_list:
                    params.append((
                        det['poi_id'],
                        det['name'],
                        det['mdd'],
                        det['enName'],
                        det['commentCount'],
                        det['description'],
                        det['tel'],
                        det['site'],
                        det['time'],
                        det['traffic'],
                        det['ticket'],
                        det['openingTime'],
                        det['location'],
                    ))
                try:
                    cursor.executemany(sql, params)
                    db.commit()
                    print('Saved ' + str(len(params)) + ' rows')
                except Exception as e:
                    logging.error(e)
                    # roll back on error
                    db.rollback()
                # clear the buffer
                detail_list = []
        # close the database connection
        db.close()
    def _get_route(self, mdd_id, page=1):
        '''
        Fetch the attraction list for a destination
        '''
        post_data = self._md5({
            'sAct': 'KMdd_StructWebAjax|GetPoisByTag',
            'iMddid': mdd_id,
            'iTagId': 0,
            'iPage': page
        })
        r = self.REQ.post(self.URL_ROUTE, data=post_data)
        if r.status_code == 403:
            exit('Access denied')
        response = r.json()
        list_data = response['data']['list']
        page_data = response['data']['page']
        # parse the attraction list HTML fragment
        soup = BeautifulSoup(list_data, "html.parser")
        route_list = soup.find_all('a')
        result = []
        for route in route_list:
            link = route['href']
            route_id = re.findall(r'/poi/(.*?)\.html', link)
            name = route['title']
            image = route.find('img')['src'].split('?')[0]
            result.append({
                'poi_id': int(route_id[0]),
                'name': name,
                'image': image,
                'link': 'http://www.mafengwo.cn'+link,
            })
        # parse the pagination HTML fragment
        soup_page = BeautifulSoup(page_data, "html.parser")
        page = int(soup_page.find('span', class_='count').find('span').text)

        for i in result:
            poi = self._get_poi(i['poi_id'])
            retry = 0
            while ('lat' not in poi or 'lng' not in poi) and retry < 6:
                # coordinates missing from this response; wait briefly and retry
                logging.debug('Wait 0.3s. Get poi info fail. ' + i['name'])
                time.sleep(0.3)
                poi = self._get_poi(i['poi_id'])
                retry += 1
            i['lat'] = poi['lat'] if 'lat' in poi else None
            i['lng'] = poi['lng'] if 'lng' in poi else None
            i['type'] = poi['type'] if 'type' in poi else None
            i['is_cnmain'] = 1 if 'is_cnmain' in poi and poi['is_cnmain'] else 0
            i['country_mddid'] = poi['country_mddid'] if 'country_mddid' in poi else None

            logging.info(i)
            print(i['poi_id'], i['name'])

        # return the current page's list and the total page count
        return {
            'list': result,
            'pagecount': page
        }

    def _get_poi(self, poi_id):
        '''
        Fetch an attraction's coordinates
        '''
        payload = self._md5({
            'params': {
                'poi_id': poi_id
            }
        })
        # request the data
        r = self.REQ.get(self.URL_POI, params=payload)
        if r.status_code == 403:
            exit('Access denied')
        try:
            controller_data = r.json()['data']['controller_data']
            poi = controller_data['poi']
            return poi
        except Exception:
            return {}

    def _get_poi_detail(self, url):
        '''
        Fetch an attraction's detail page
        !! Note: the attraction url passed in must be of type 3

        Fields crawled:
        - destination ✅ mdd
        - English name ✅ enName
        - review count ✅ commentCount
        - description ✅ description
        - phone, website, suggested visit time ✅ tel site time
        - transport, tickets, opening hours ✅ traffic ticket openingTime
        - location ✅ location

        '''
        # fetch the page
        r = self.REQ.get(url)
        if r.status_code == 403:
            exit('Access denied')
        # parse the HTML
        soup = BeautifulSoup(r.text, "html.parser")
        # destination
        try:
            _mdd = soup.find('div', attrs={'class': 'crumb'}).find_all('a')[
                1].text
        except Exception:
            _mdd = 'fetch failed'
        # English name
        try:
            _en_name = soup.find('div', attrs={'class': 'en'}).text
        except Exception:
            _en_name = 'fetch failed'
        # review count (the selector and stripped characters match the Chinese page text)
        try:
            _comment_count = soup.find('a', attrs={'title': '蜂蜂點評'}).find(
                'span').text.replace('(', '').replace(')', '').replace('條', '')
        except Exception:
            _comment_count = 'fetch failed'
        # description
        try:
            _description = soup.find(
                'div', attrs={'class': 'summary'}).get_text("\n", strip=True)
        except Exception:
            _description = 'fetch failed'
        # phone, website, suggested visit time
        try:
            _tel = soup.find('li', attrs={'class': 'tel'}).find(
                'div', attrs={'class': 'content'}).text
            _site = soup.find(
                'li', attrs={'class': 'item-site'}).find('div', attrs={'class': 'content'}).text
            _time = soup.find(
                'li', attrs={'class': 'item-time'}).find('div', attrs={'class': 'content'}).text
        except Exception:
            _tel = 'fetch failed'
            _site = 'fetch failed'
            _time = 'fetch failed'
        # transport, tickets, opening hours
        try:
            detail = soup.find(
                'div', attrs={'class': 'mod mod-detail'}).find_all('dd')
            _traffic = detail[0].get_text("\n", strip=True)
            _ticket = detail[1].get_text("\n", strip=True)
            _opening = detail[2].get_text("\n", strip=True)
        except Exception:
            _traffic = 'fetch failed'
            _ticket = 'fetch failed'
            _opening = 'fetch failed'
        # location
        try:
            _location = soup.find(
                'div', attrs={'class': 'mod mod-location'}).find('p').text
        except Exception:
            _location = 'fetch failed'

        return {
            'mdd': _mdd,
            'enName': _en_name,
            'commentCount': _comment_count,
            'description': _description,
            'tel': _tel,
            'site': _site,
            'time': _time,
            'traffic': _traffic,
            'ticket': _ticket,
            'openingTime': _opening,
            'location': _location
        }

    def _get_md5_encrypted_string(self):
        '''
        Fetch the salt string used when computing the _sn MD5 signature
        Called once per instance
        '''
        # use the Beijing attractions page to locate the signing js file
        url = 'http://www.mafengwo.cn/jd/10065/gonglve.html'
        r = self.REQ.get(url)
        if r.status_code == 403:
            exit('Access denied; check whether your IP address has been banned')
        param = re.findall(
            r'src="http://js.mafengwo.net/js/hotel/sign/index.js(.*?)"', r.text)
        param = param[0]
        # build the full index.js URL
        url_indexjs = 'http://js.mafengwo.net/js/hotel/sign/index.js' + param
        # fetch index.js
        r = self.REQ.get(url_indexjs)
        if r.status_code == 403:
            exit('Access denied')
        response_text = r.text
        # locate the salt inside the obfuscated string table
        result = re.findall(r'var __Ox2133f=\[(.*?)\];', response_text)[0]
        byteslike_encrypted_string = result.split(',')[46].replace('"', '')
        # decode the \xNN escape sequences into byte values
        strTobytes = []
        for item in byteslike_encrypted_string.split('\\x'):
            if item != '':
                num = int(item, 16)
                strTobytes.append(num)
        # convert the bytes to a string
        encrypted_string = bytes(strTobytes).decode('utf8')
        self.encrypted_string = encrypted_string
        return encrypted_string

    def _stringify(self, data):
        """
        Convert every value in a dict to a string
        """
        data = sorted(data.items(), key=lambda d: d[0])
        new_dict = {}
        for item in data:
            if type(item[1]) == dict:
                # recurse into nested dicts
                new_dict[item[0]] = json.dumps(
                    self._stringify(item[1]), separators=(',', ':'))
            else:
                if type(item[1]) == list:
                    # stringify each element of a list
                    new_list = []
                    for i in item[1]:
                        new_list.append(self._stringify(i))
                    new_dict[item[0]] = new_list
                else:
                    if item[1] is None:
                        new_dict[item[0]] = ''
                    else:
                        new_dict[item[0]] = str(item[1])
        return new_dict

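    # How the request signature is produced (as implemented below): the payload
    # gets a millisecond timestamp (_ts), is key-sorted and stringified, then
    # JSON-encoded and concatenated with the salt extracted from index.js;
    # characters [2:12] of the MD5 hex digest become the _sn parameter.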
    def _md5(self, data):
        '''
        Build the signed request parameters, _ts and _sn
        '''
        _ts = int(round(time.time() * 1000))
        data['_ts'] = _ts
        # sort and stringify the payload
        orderd_data = self._stringify(data)
        # md5 hash
        m = hashlib.md5()
        m.update((json.dumps(orderd_data, separators=(',', ':')) +
                  self.encrypted_string).encode('utf8'))
        _sn = m.hexdigest()
        # _sn is a slice of the hex digest
        orderd_data['_sn'] = _sn[2:12]
        return orderd_data

    def _get_mdd(self):
        '''
        Fetch destination info; only covers popular domestic destinations
        Not used elsewhere for now
        '''
        # fetch the page source
        r = self.REQ.get(self.URL_MDD)
        if r.status_code == 403:
            exit('Access denied')
        response_text = r.text
        # parse the HTML
        soup = BeautifulSoup(response_text, "html.parser")
        # popular domestic destinations
        hot_mdd_homeland = soup.find('div', class_='hot-list clearfix')
        # destination links
        hot_mdd_homeland_list = hot_mdd_homeland.find_all('a')
        # collect each destination's link, ID and name
        result = []
        for mdd in hot_mdd_homeland_list:
            link = mdd['href']
            mdd_id = re.findall(
                r'/travel-scenic-spot/mafengwo/(.*?)\.html', link)
            if len(mdd_id) == 1 and mdd_id[0] != '':
                # skip entries without an ID
                result.append({
                    'mdd_id': int(mdd_id[0]),
                    'name': mdd.text,
                    'link': 'http://www.mafengwo.cn'+link,
                })
        return result

    @classmethod
    def _read_log_file_get_success_page(cls, log_file):
        '''Read log files and collect the page numbers that were crawled successfully'''
        result = []
        for file_name in log_file:
            with open(file_name) as f:
                for line in f:
                    res = re.findall(r'page success: (.*?)$', line)
                    if len(res) > 0:
                        result.append(int(res[0]))
        # return the successfully crawled page numbers, de-duplicated and sorted
        return sorted(set(result))


if __name__ == '__main__':
    # # Normal crawl
    # ins = MafengwoCrawler()
    # ins.crawler_mdd()

    # # Skip the pages that were crawled successfully last time
    # # Find the log files in the working directory and add them to the list yourself
    # ins = MafengwoCrawler(log_file=[...])
    # ins.crawler_mdd()

    # Crawl attraction details into the database
    ins = MafengwoCrawler()
    ins.crawler_detail()

    pass
