Crawling Mafengwo Scenic Spot Data (Source Code Included)
- Crawl popular destination info:
MafengwoCrawler()._get_mdd()
- Crawl the scenic spots within a destination:
MafengwoCrawler().crawler_mdd()
- Crawl scenic spot details:
MafengwoCrawler().crawler_detail()
The source file and the database schema file are available in my resources (they also include a database file covering about 100,000 domestic scenic spots).
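For reference, here is a minimal sketch of the two tables the crawler writes to, reconstructed from the INSERT statements in the code below. The column names come from the source; the column types and lengths are my assumptions, not the original schema (which ships with the resource download).

import pymysql

# Hypothetical schema inferred from the INSERT columns; adjust types as needed.
POI_SCHEMA = '''
CREATE TABLE IF NOT EXISTS poi (
    poi_id INT PRIMARY KEY,
    name VARCHAR(255),
    image VARCHAR(512),
    link VARCHAR(512),
    lat DOUBLE NULL,
    lng DOUBLE NULL,
    type INT NULL,
    is_cnmain TINYINT,
    country_mddid INT NULL
);
'''

POI_DETAIL_SCHEMA = '''
CREATE TABLE IF NOT EXISTS poi_detail (
    poi_id INT PRIMARY KEY,
    name VARCHAR(255),
    mdd VARCHAR(255),
    enName VARCHAR(255),
    commentCount VARCHAR(32),
    description TEXT,
    tel VARCHAR(128),
    site VARCHAR(255),
    time VARCHAR(255),
    traffic TEXT,
    ticket TEXT,
    openingTime TEXT,
    location TEXT
);
'''

db = pymysql.connect(host='localhost', user='root',
                     password='your-password', database='mafengwo')
with db.cursor() as cursor:
    cursor.execute(POI_SCHEMA)
    cursor.execute(POI_DETAIL_SCHEMA)
db.commit()
db.close()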
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/04/24
# @Author : AIsland
# @Email : [email protected]
# @File : crawler.py
# @Description : Crawl scenic spot data for Chinese provinces and cities from Mafengwo
import requests
import re
import time
import json
import hashlib
import logging
import threading
import pymysql
from bs4 import BeautifulSoup
class MafengwoCrawler:
    # URL for querying destinations
    # a destination (mdd) contains scenic spots
    URL_MDD = 'http://www.mafengwo.cn/mdd/'
    # URL for querying scenic spots
    # returns each spot's detail link, image and name
    URL_ROUTE = 'http://www.mafengwo.cn/ajax/router.php'
    # URL for querying a spot's coordinates
    # longitude: lng
    # latitude: lat
    URL_POI = 'http://pagelet.mafengwo.cn/poi/pagelet/poiLocationApi'
    # common headers
HEADERS = {
'Referer': 'http://www.mafengwo.cn/',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}
    # MySQL connection settings
DB_HOST = 'localhost'
DB_USER = 'root'
DB_PASSWORD = '[email protected]'
DB_NAME = 'mafengwo'
    # string needed to sign request data, fetched by _get_md5_encrypted_string()
    encrypted_string = ''
    # pages that can be skipped, i.e. pages already crawled successfully
    success_pages = []
    def __init__(self, log_file=None):
        # usage notes: https://www.cnblogs.com/nancyzhu/p/8551506.html
        logging.basicConfig(level=logging.DEBUG,
                            filename='mafengwo.'+str(int(time.time()))+'.log',
                            format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'
                            )
        # initialize the request session
        self.REQ = requests.session()
        # set the common headers
        self.REQ.headers.update(self.HEADERS)
        # fetch the string needed to sign request data
        self._get_md5_encrypted_string()
        # if log files are passed in, skip the pages that already succeeded
        if log_file is not None:
            self.success_pages = self._read_log_file_get_success_page(log_file)
            print('Pages already crawled successfully: ' + str(len(self.success_pages)))
            print('Resuming in 5 seconds')
            time.sleep(5)
    def crawler_mdd(self, mdd_id=21536):
        '''
        Crawl the scenic spot info of a single destination.
        Default: 21536 (China).
        '''
        # mdd_id = 12522  # Gulangyu, 16 pages, test data
        # start crawling
        start = int(time.time())
        # fetch the total page count first
        res = self._get_route(mdd_id)
        page_total = res['pagecount']
        # compute how many pages each thread should crawl
        page_range = round(page_total/20)
        if page_range == 0:
            page_range = 1
        logging.info(str(page_total)+' pages in total, '+str(page_range)+' pages per thread')
        print(str(page_total)+' pages in total, '+str(page_range)+' pages per thread')
        # spawn the worker threads
        thread = []
        for i in range(1, page_total+1, page_range):
            page_start = i
            page_end = i + page_range
            if page_end > page_total + 1:
                page_end = page_total + 1
            t = threading.Thread(target=self.crawler,
                                 args=(mdd_id, page_start, page_end))
            thread.append(t)
        for i in range(0, len(thread)):
            thread[i].start()
        for i in range(0, len(thread)):
            thread[i].join()
        end = int(time.time())
        logging.info('Total time: '+str(end-start)+'s')
        print('Total time: '+str(end-start)+'s')
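    # For illustration: with page_total = 16 (e.g. the Gulangyu test data),
    # page_range = round(16/20) = 1, so 16 threads are spawned and each
    # crawls exactly one page.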
    def crawler(self, mdd_id, start_page, end_page):
        '''
        The actual crawler worker for a range of pages.
        '''
        # connect to the database
        db = pymysql.connect(
            host=self.DB_HOST,
            user=self.DB_USER,
            password=self.DB_PASSWORD,
            database=self.DB_NAME)
        for page in range(start_page, end_page):
            if page in self.success_pages:
                print('Skipping page: '+str(page))
                continue
            page_pass = False
            page_retry = 0
            while not page_pass and page_retry < 11:
                try:
                    print('Crawling page: '+str(page))
                    result = self._get_route(mdd_id, page=page)['list']
                    # save to the database
                    sql = "INSERT IGNORE INTO poi(poi_id, name, image, link, lat, lng, type, is_cnmain, country_mddid) \
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s);"
                    params = []
                    for item in result:
                        params.append((
                            item['poi_id'],
                            item['name'],
                            item['image'],
                            item['link'],
                            item['lat'],
                            item['lng'],
                            item['type'],
                            item['is_cnmain'],
                            item['country_mddid']
                        ))
                    try:
                        cursor = db.cursor()
                        cursor.executemany(sql, params)
                        db.commit()
                        # success; this exact string is what
                        # _read_log_file_get_success_page looks for
                        logging.info('page success: ' + str(page))
                        print('page success: ' + str(page))
                        page_pass = True
                    except Exception as e:
                        logging.error(e)
                        # roll back on error
                        db.rollback()
                except Exception as e:
                    page_retry += 1
                    logging.error(e)
                    logging.error('page failed: ' + str(page))
        # close the database connection
        db.close()
    def crawler_detail(self):
        '''
        Crawl scenic spot details into the database.
        The poi table must already be populated (see crawler_mdd) before
        calling this method. The actual crawling is done by
        crawler_detail_worker threads.
        '''
        # count the rows in the poi table
        db = pymysql.connect(
            host=self.DB_HOST,
            user=self.DB_USER,
            password=self.DB_PASSWORD,
            database=self.DB_NAME)
        sql = 'SELECT COUNT(*) as total from poi;'
        cursor = db.cursor()
        cursor.execute(sql)
        result = cursor.fetchall()
        # total number of rows
        total = result[0][0]
        db.close()
        # start crawling
        start = int(time.time())
        # compute how many rows each thread should crawl
        range_count = round(total/20)
        if range_count == 0:
            range_count = 1
        logging.info(str(total)+' rows in total, '+str(range_count)+' rows per thread')
        print(str(total)+' rows in total, '+str(range_count)+' rows per thread')
        # spawn the worker threads
        thread = []
        for i in range(0, total, range_count):
            # i, range_count: SQL query offset and row count
            t = threading.Thread(target=self.crawler_detail_worker,
                                 args=(i, range_count))
            thread.append(t)
        for i in range(0, len(thread)):
            thread[i].start()
        for i in range(0, len(thread)):
            thread[i].join()
        end = int(time.time())
        logging.info('Total time: '+str(end-start)+'s')
        print('Total time: '+str(end-start)+'s')
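    # For illustration (hypothetical numbers): with total = 1000 rows,
    # range_count = round(1000/20) = 50, so 20 workers are spawned with
    # offsets 0, 50, 100, ..., 950, each selecting 50 rows.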
    def crawler_detail_worker(self, offset, limit):
        '''Worker thread for crawler_detail.'''
        db = pymysql.connect(
            host=self.DB_HOST,
            user=self.DB_USER,
            password=self.DB_PASSWORD,
            database=self.DB_NAME)
        sql = 'SELECT poi_id, name, link FROM poi ORDER BY poi_id LIMIT ' + \
            str(offset) + ', ' + str(limit) + ';'
        cursor = db.cursor()
        cursor.execute(sql)
        # the slice of rows this worker is responsible for
        result = cursor.fetchall()
        detail_list = []
        save_count = 100  # flush the buffer to the database every N rows

        def save_details(details):
            '''Insert the buffered details into poi_detail.'''
            sql = "INSERT IGNORE INTO poi_detail(poi_id, name, mdd, enName, commentCount, description, tel, site, time, traffic, ticket, openingTime, location) \
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"
            params = []
            for det in details:
                params.append((
                    det['poi_id'],
                    det['name'],
                    det['mdd'],
                    det['enName'],
                    det['commentCount'],
                    det['description'],
                    det['tel'],
                    det['site'],
                    det['time'],
                    det['traffic'],
                    det['ticket'],
                    det['openingTime'],
                    det['location'],
                ))
            try:
                cursor.executemany(sql, params)
                db.commit()
                print('Saved ' + str(len(params)) + ' rows')
            except Exception as e:
                logging.error(e)
                # roll back on error
                db.rollback()

        for item in result:
            poi_id = item[0]
            name = item[1]
            link = item[2]
            # check whether this spot has already been crawled
            sql_select = 'SELECT poi_id FROM poi_detail WHERE poi_id=' + \
                str(poi_id) + ';'
            cursor.execute(sql_select)
            result_select = cursor.fetchall()
            # skip spots that are already in the database
            if len(result_select) != 0:
                continue
            # otherwise crawl the detail page
            poi_detail = self._get_poi_detail(link)
            # buffer the crawled details
            poi_detail['name'] = name
            poi_detail['poi_id'] = poi_id
            detail_list.append(poi_detail)
            logging.info('detail success ' + str(poi_id) + ' ' + name)
            print('detail success ' + str(poi_id) + ' ' + name)
            # throttle so requests are not rejected for being too fast
            time.sleep(0.3)
            # flush the buffer once it is full
            if len(detail_list) >= save_count:
                save_details(detail_list)
                detail_list = []
        # flush whatever is left in the buffer
        if detail_list:
            save_details(detail_list)
        db.close()
    def _get_route(self, mdd_id, page=1):
        '''
        Fetch one page of a destination's scenic spot list.
        '''
        post_data = self._md5({
            'sAct': 'KMdd_StructWebAjax|GetPoisByTag',
            'iMddid': mdd_id,
            'iTagId': 0,
            'iPage': page
        })
        r = self.REQ.post(self.URL_ROUTE, data=post_data)
        if r.status_code == 403:
            exit('Access denied')
        response = r.json()
        list_data = response['data']['list']
        page_data = response['data']['page']
        # parse the scenic spot list
        soup = BeautifulSoup(list_data, "html.parser")
        route_list = soup.find_all('a')
        result = []
        for route in route_list:
            link = route['href']
            route_id = re.findall(r'/poi/(.*?)\.html', link)
            name = route['title']
            image = route.find('img')['src'].split('?')[0]
            result.append({
                'poi_id': int(route_id[0]),
                'name': name,
                'image': image,
                'link': 'http://www.mafengwo.cn'+link,
            })
        # parse the pagination data
        soup_page = BeautifulSoup(page_data, "html.parser")
        page_count = int(soup_page.find('span', class_='count').find('span').text)
        for i in result:
            poi = self._get_poi(i['poi_id'])
            retry = 0
            while ('lat' not in poi or 'lng' not in poi) and retry < 6:
                # if this request did not return the info, wait and retry
                logging.debug('Wait 0.3s. Get poi info fail. ' + i['name'])
                time.sleep(0.3)
                poi = self._get_poi(i['poi_id'])
                retry += 1
            i['lat'] = poi['lat'] if 'lat' in poi else None
            i['lng'] = poi['lng'] if 'lng' in poi else None
            i['type'] = poi['type'] if 'type' in poi else None
            i['is_cnmain'] = 1 if 'is_cnmain' in poi and poi['is_cnmain'] else 0
            i['country_mddid'] = poi['country_mddid'] if 'country_mddid' in poi else None
            logging.info(i)
            print(i['poi_id'], i['name'])
        # return this page's list and the total page count
        return {
            'list': result,
            'pagecount': page_count
        }
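    # For illustration, each element of the returned 'list' has this shape
    # (values hypothetical):
    # {'poi_id': 3474, 'name': '...', 'image': 'http://...jpg',
    #  'link': 'http://www.mafengwo.cn/poi/3474.html', 'lat': 24.45,
    #  'lng': 118.07, 'type': 3, 'is_cnmain': 1, 'country_mddid': 21536}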
    def _get_poi(self, poi_id):
        '''
        Fetch a spot's latitude/longitude info.
        '''
        payload = self._md5({
            'params': {
                'poi_id': poi_id
            }
        })
        # fetch the data
        r = self.REQ.get(self.URL_POI, params=payload)
        if r.status_code == 403:
            exit('Access denied')
        try:
            controller_data = r.json()['data']['controller_data']
            poi = controller_data['poi']
            return poi
        except Exception:
            return {}
    def _get_poi_detail(self, url):
        '''
        Fetch a scenic spot's detail page.
        !! Note: the poi url passed in must be of type 3.
        Fields crawled:
        - destination                    mdd
        - English name                   enName
        - review count                   commentCount
        - description                    description
        - phone, website, visit time     tel, site, time
        - transport, tickets, hours      traffic, ticket, openingTime
        - location                       location
        '''
        # fetch the page
        r = self.REQ.get(url)
        if r.status_code == 403:
            exit('Access denied')
        # parse the HTML
        soup = BeautifulSoup(r.text, "html.parser")
        # destination
        try:
            _mdd = soup.find('div', attrs={'class': 'crumb'}).find_all('a')[
                1].text
        except Exception:
            _mdd = 'fetch failed'
        # English name
        try:
            _en_name = soup.find('div', attrs={'class': 'en'}).text
        except Exception:
            _en_name = 'fetch failed'
        # review count (the selector strings must stay in Chinese to match the page)
        try:
            _comment_count = soup.find('a', attrs={'title': '蜂蜂點評'}).find(
                'span').text.replace('(', '').replace(')', '').replace('條', '')
        except Exception:
            _comment_count = 'fetch failed'
        # description
        try:
            _description = soup.find(
                'div', attrs={'class': 'summary'}).get_text("\n", strip=True)
        except Exception:
            _description = 'fetch failed'
        # phone, website, suggested visit time
        try:
            _tel = soup.find('li', attrs={'class': 'tel'}).find(
                'div', attrs={'class': 'content'}).text
            _site = soup.find(
                'li', attrs={'class': 'item-site'}).find('div', attrs={'class': 'content'}).text
            _time = soup.find(
                'li', attrs={'class': 'item-time'}).find('div', attrs={'class': 'content'}).text
        except Exception:
            _tel = 'fetch failed'
            _site = 'fetch failed'
            _time = 'fetch failed'
        # transport, tickets, opening hours
        try:
            detail = soup.find(
                'div', attrs={'class': 'mod mod-detail'}).find_all('dd')
            _traffic = detail[0].get_text("\n", strip=True)
            _ticket = detail[1].get_text("\n", strip=True)
            _opening = detail[2].get_text("\n", strip=True)
        except Exception:
            _traffic = 'fetch failed'
            _ticket = 'fetch failed'
            _opening = 'fetch failed'
        # location
        try:
            _location = soup.find(
                'div', attrs={'class': 'mod mod-location'}).find('p').text
        except Exception:
            _location = 'fetch failed'
        return {
            'mdd': _mdd,
            'enName': _en_name,
            'commentCount': _comment_count,
            'description': _description,
            'tel': _tel,
            'site': _site,
            'time': _time,
            'traffic': _traffic,
            'ticket': _ticket,
            'openingTime': _opening,
            'location': _location
        }
    def _get_md5_encrypted_string(self):
        '''
        Fetch the salt string used when computing the _sn MD5 signature.
        Called once per instance.
        '''
        # use the Beijing scenic-spot page to locate the signing js file
        url = 'http://www.mafengwo.cn/jd/10065/gonglve.html'
        r = self.REQ.get(url)
        if r.status_code == 403:
            exit('Access denied; check whether your IP address has been banned')
        param = re.findall(
            r'src="http://js.mafengwo.net/js/hotel/sign/index.js(.*?)"', r.text)
        param = param[0]
        # build the full index.js url
        url_indexjs = 'http://js.mafengwo.net/js/hotel/sign/index.js' + param
        # fetch index.js
        r = self.REQ.get(url_indexjs)
        if r.status_code == 403:
            exit('Access denied')
        response_text = r.text
        # locate the salt string
        result = re.findall(r'var __Ox2133f=\[(.*?)\];', response_text)[0]
        byteslike_encrypted_string = result.split(',')[46].replace('"', '')
        # decode the \x.. escapes into bytes
        str_to_bytes = []
        for item in byteslike_encrypted_string.split('\\x'):
            if item != '':
                num = int(item, 16)
                str_to_bytes.append(num)
        # convert the bytes into a string
        encrypted_string = bytes(str_to_bytes).decode('utf8')
        self.encrypted_string = encrypted_string
        return encrypted_string
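    # For illustration (hypothetical token): if index.js contained
    # "\x61\x62\x63", split('\\x') yields ['', '61', '62', '63'], each hex
    # pair parses to 97, 98, 99, and bytes([97, 98, 99]) decodes to 'abc'.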
    def _stringify(self, data):
        """
        Turn every value of a dict into a string.
        """
        data = sorted(data.items(), key=lambda d: d[0])
        new_dict = {}
        for item in data:
            if type(item[1]) == dict:
                # recurse into nested dicts
                new_dict[item[0]] = json.dumps(
                    self._stringify(item[1]), separators=(',', ':'))
            elif type(item[1]) == list:
                # stringify every element of a list
                new_list = []
                for i in item[1]:
                    new_list.append(self._stringify(i))
                new_dict[item[0]] = new_list
            elif item[1] is None:
                new_dict[item[0]] = ''
            else:
                new_dict[item[0]] = str(item[1])
        return new_dict
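    # For illustration: _stringify({'iPage': 1, 'params': {'poi_id': 5}})
    # returns {'iPage': '1', 'params': '{"poi_id":"5"}'}: keys sorted,
    # nested dicts JSON-encoded compactly, scalars turned into strings.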
    def _md5(self, data):
        '''
        Add the signed request parameters _ts and _sn to the data.
        '''
        _ts = int(round(time.time() * 1000))
        data['_ts'] = _ts
        # sort and stringify the data
        ordered_data = self._stringify(data)
        # md5 signature
        m = hashlib.md5()
        m.update((json.dumps(ordered_data, separators=(',', ':')) +
                  self.encrypted_string).encode('utf8'))
        _sn = m.hexdigest()
        # _sn is a slice of the md5 digest
        ordered_data['_sn'] = _sn[2:12]
        return ordered_data
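    # For illustration: _md5({'iMddid': 21536, 'iPage': 1}) first adds the
    # millisecond timestamp '_ts', stringifies all values, then sets '_sn'
    # to the 10 hex characters at positions 2 to 11 of
    # md5(compact_json + encrypted_string).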
    def _get_mdd(self):
        '''
        Fetch destination info. Only some popular domestic destinations
        are available; currently unused by the other methods.
        '''
        # fetch the page source
        r = self.REQ.get(self.URL_MDD)
        if r.status_code == 403:
            exit('Access denied')
        response_text = r.text
        # parse the HTML
        soup = BeautifulSoup(response_text, "html.parser")
        # popular domestic destinations
        hot_mdd_homeland = soup.find('div', class_='hot-list clearfix')
        # destination links
        hot_mdd_homeland_list = hot_mdd_homeland.find_all('a')
        # collect each destination's link, ID and name
        result = []
        for mdd in hot_mdd_homeland_list:
            link = mdd['href']
            mdd_id = re.findall(
                r'/travel-scenic-spot/mafengwo/(.*?)\.html', link)
            if len(mdd_id) == 1 and mdd_id[0] != '':
                # skip entries without an ID
                result.append({
                    'mdd_id': int(mdd_id[0]),
                    'name': mdd.text,
                    'link': 'http://www.mafengwo.cn'+link,
                })
        return result
    @classmethod
    def _read_log_file_get_success_page(cls, log_file):
        '''Read the log files and collect the pages that were crawled successfully.'''
        result = []
        for file_name in log_file:
            with open(file_name) as f:
                for line in f:
                    res = re.findall(r'page success: (.*?)$', line)
                    if len(res) > 0:
                        result.append(int(res[0]))
        # return the successfully crawled pages, deduplicated and sorted
        return sorted(set(result))
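    # A matching log line, given the format configured in __init__, looks
    # roughly like (path, timestamp and line number illustrative):
    # 2019-04-24 12:00:00,000 - /path/to/crawler.py[line:145] - INFO: page success: 12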
if __name__ == '__main__':
    # # normal crawl
    # ins = MafengwoCrawler()
    # ins.crawler_mdd()
    # # skip the pages that succeeded in a previous run;
    # # find the log files in the working directory and list them here
    # ins = MafengwoCrawler(log_file=[...])
    # ins.crawler_mdd()
    # crawl scenic spot details into the database
    ins = MafengwoCrawler()
    ins.crawler_detail()