# url: https://hotels.ctrip.com/Domestic/tool/AjaxHotelFaqLoad.aspx
# Parameters:
#   hotelid: hotel id
#   page: page number
# coding=utf-8
import requests
import json
from time import sleep
from pprint import pprint
# NOTE: the hotel id lives in the module-level name `hotel`, which is assigned
# under the `__main__` guard at the bottom of the file. A `global` statement
# has no effect at module scope, so none is needed here.

# FAQ endpoint; the `hotelid` and `page` query parameters are added per request.
url = 'https://hotels.ctrip.com/Domestic/tool/AjaxHotelFaqLoad.aspx?'

# HTTP headers sent with every request to the endpoint.
header = {
    'Host': 'hotels.ctrip.com',
    'Referer': 'https://hotels.ctrip.com/hotel/346412.html',
    'User-Agent': 'chrome',
}
def save_to_file(data, path='q_ans_rec.txt'):
    """Append one crawled Q&A record to a local text file as JSON.

    Args:
        data: JSON-serialisable object to store.
        path: Destination file. Defaults to ``q_ans_rec.txt`` in the current
            working directory.

    Raises:
        TypeError: If ``data`` cannot be serialised by ``json.dumps``.
        OSError: If the file cannot be opened or written.
    """
    # Serialise before opening the file so a failure leaves the file untouched.
    # indent=0 puts each key on its own line without leading spaces;
    # ensure_ascii=False keeps non-ASCII text readable in the output.
    json_str = json.dumps(data, ensure_ascii=False, indent=0)
    # Append-only: records from earlier runs are preserved. Each record is
    # followed by a blank line as a separator.
    with open(path, 'a', encoding='utf-8') as f1:
        f1.write(json_str + '\n\n')
def get_json(hotel, page, timeout=10):
    """Fetch one page of the FAQ list for a hotel.

    Args:
        hotel: Hotel id, sent as the ``hotelid`` query parameter.
        page: Page number, sent as the ``page`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body, or ``None`` when the request fails, the
        response status is not OK, or the body is not valid JSON.
    """
    params_ = {
        'hotelid': hotel,
        'page': page,
    }
    try:
        # Without a timeout, requests may block indefinitely on a stalled
        # connection, which would hang the whole crawl.
        ht = requests.get(url, headers=header, params=params_, timeout=timeout)
        if ht.ok:
            return ht.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers network/HTTP errors; ValueError covers a
        # body that is not valid JSON. The crawl is best-effort, so report
        # the problem and let the caller skip this page.
        print('Error {}:\n'.format(e))
    return None
def json_parser(json, hotel_id=None):
    """Yield one flattened dict per question found in a FAQ response.

    This is a generator: nothing is parsed until it is iterated, and an empty
    or missing response simply yields nothing.

    Args:
        json: Decoded response (a mapping with an ``AskList`` entry), or
            ``None``. Note that this parameter shadows the ``json`` module
            inside the function.
        hotel_id: Hotel id to record in each result. When omitted, the
            module-level ``hotel`` variable is used, as before.

    Yields:
        dict with the keys ``id``, ``hotel``, ``question``, ``reply_num``,
        ``reply`` (a list of ``(replier, content, time)`` tuples) and
        ``createtime``, in that order.

    Raises:
        NameError: If ``hotel_id`` is omitted, the response contains
            questions, and no module-level ``hotel`` has been assigned.
    """
    if json is None:
        return
    asks_list = dict(json).get('AskList')
    if not asks_list:
        return
    if hotel_id is None:
        # Legacy path: fall back to the global set by the script entry point.
        hotel_id = hotel
    for ask in asks_list:
        # A missing or empty ReplyList results in an empty reply list.
        replies = [
            (
                reply_item.get('ReplierText'),
                reply_item.get('ReplyContentTitle'),
                reply_item.get('ReplyTime'),
            )
            for reply_item in ask.get('ReplyList') or ()
        ]
        yield {
            'id': ask.get('AskId'),                    # question id
            'hotel': hotel_id,                         # hotel id
            'question': ask.get('AskContentTitle'),    # question text
            'reply_num': ask.get('ReplyCount'),        # number of replies
            'reply': replies,                          # list of replies
            'createtime': ask.get('CreateTime'),       # question timestamp
        }
def solve(hotel, max_page_num=None):
    """Crawl FAQ pages 1..max_page_num for a hotel and save every record.

    Args:
        hotel: Hotel id passed to ``get_json``.
        max_page_num: Last page to fetch (inclusive). When omitted, the user
            is prompted for it on standard input, as before.

    Raises:
        ValueError: If the value typed at the prompt is not an integer.
    """
    if max_page_num is None:
        max_page_num = int(input('input max page num: '))
    for page in range(1, max_page_num + 1):
        print('page: {} \n'.format(page))  # page being crawled
        js1 = get_json(hotel, page)
        # json_parser yields nothing for a failed or empty page, so the loop
        # below is simply skipped in that case.
        for res in json_parser(js1):
            save_to_file(res)
        # Pause between requests to avoid hammering the server.
        # NOTE(review): the original indentation was lost; the pause is
        # assumed to be once per page rather than once per record - confirm.
        sleep(1)
if __name__ == '__main__':
    # Script entry point: ask for the hotel id and start the crawl.
    # The id is bound to the module-level name `hotel`, which json_parser
    # also reads.
    hotel = int(input('input the hotel id : '))
    solve(hotel)