python crawler - 分析AJAX(根據json)直接爬取攜程酒店問答存入本地

(原文此處有兩張截圖,擷取時已遺失。)
請求 URL:https://hotels.ctrip.com/Domestic/tool/AjaxHotelFaqLoad.aspx
請求參數:
- hotelid:酒店 id
- page:頁數
(原文此處有一張截圖,擷取時已遺失。)

# coding=utf-8
import requests
import json

from time import sleep
from pprint import pprint

# NOTE: a `global` statement at module scope has no effect. The name `hotel`
# is only bound when this file is run as a script (in the `__main__` block
# at the bottom), and json_parser() reads it as a module-level global.
global hotel  # hotel id


# AJAX endpoint queried by get_json(), which appends the `hotelid` and `page`
# query parameters and decodes the response body as JSON.
url = 'https://hotels.ctrip.com/Domestic/tool/AjaxHotelFaqLoad.aspx?'

# Request headers sent with every call made by get_json(). The Referer is
# hard-coded to the page of hotel 346412, whatever hotel id is being queried.
header = {
    'Host': 'hotels.ctrip.com',
    'Referer': 'https://hotels.ctrip.com/hotel/346412.html',
    'User-Agent': 'chrome'
}
# Persist one crawled Q&A record to a local file
def save_to_file(data):
    """Append *data* as JSON text to ``q_ans_rec.txt`` in the working directory.

    The record is serialized with non-ASCII characters kept as-is and with
    ``indent=0`` (one element per line, no leading spaces), then followed by
    a blank line so consecutive records stay visually separated.

    Args:
        data: Any JSON-serializable object.
    """
    with open('q_ans_rec.txt', 'a+', encoding='utf-8') as out_file:
        serialized = json.dumps(data, ensure_ascii=False, indent=0)
        out_file.write(serialized + '\n\n')

def get_json(hotel, page, timeout=10):
    """Fetch one page of a hotel's Q&A listing and return the decoded JSON.

    Args:
        hotel: Hotel id, sent as the ``hotelid`` query parameter.
        page: Page number, sent as the ``page`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole crawl.

    Returns:
        The decoded JSON body when the response status is OK. ``None`` when
        the status is not OK, the request fails (including a timeout), or
        the body is not valid JSON; failures are reported on stdout.
    """
    params_ = {
        'hotelid': hotel,
        'page': page,
    }
    try:
        ht = requests.get(url, headers=header, params=params_, timeout=timeout)
        if ht.ok:
            return ht.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection errors and timeouts; ValueError
        # covers a body that cannot be decoded as JSON.
        print('Error {}:\n'.format(e))
    return None

# Parse the decoded JSON and yield each result as a dictionary
def json_parser(json, hotel_id=None):
    """Yield one flattened record per question that has at least one reply.

    This is a generator: it never returns a list, and for missing or empty
    input it simply yields nothing.

    Args:
        json: Decoded response mapping (or ``None``). The question entries
            are read from its ``'AskList'`` key. The parameter name shadows
            the ``json`` module inside this function only.
        hotel_id: Hotel id to store in each record. When omitted, the
            module-level ``hotel`` global is used, which is only bound when
            the file runs as a script; pass this explicitly when calling
            the function from other code.

    Yields:
        dict with keys ``id``, ``hotel``, ``question``, ``reply_num``,
        ``reply`` (list of ``(replier, content, time)`` tuples) and
        ``createtime``.
    """
    if json is None:
        return
    asks_list = dict(json).get('AskList')
    if not asks_list:
        return

    # Resolve the id only once there is something to emit, so empty input
    # never touches the (possibly undefined) module global.
    owner_id = hotel if hotel_id is None else hotel_id

    for ask in asks_list:
        replies = ask.get('ReplyList')
        if not replies:
            # NOTE(review): questions without replies are skipped, exactly as
            # before this change. Confirm this is intended and not an
            # indentation slip on the original `yield`.
            continue
        yield {
            'id': ask.get('AskId'),                    # question id
            'hotel': owner_id,                         # hotel id
            'question': ask.get('AskContentTitle'),    # question text
            'reply_num': ask.get('ReplyCount'),        # number of replies
            'reply': [                                 # reply list
                (
                    item.get('ReplierText'),
                    item.get('ReplyContentTitle'),
                    item.get('ReplyTime'),
                )
                for item in replies
            ],
            'createtime': ask.get('CreateTime'),       # question timestamp
        }

def solve(hotel):
    """Crawl the Q&A pages of one hotel and append every record to disk.

    Prompts on stdin for the highest page number, then walks pages 1 through
    that number: each page is fetched with get_json(), parsed with
    json_parser(), and every yielded record is written with save_to_file().
    The loop pauses one second after each page.

    Args:
        hotel: Hotel id passed to get_json() for every page.
    """
    last_page = int(input('input max page num: '))

    for page_no in range(1, last_page + 1):
        # Progress marker: the page currently being crawled.
        print('page: {} \n'.format(page_no))

        for record in json_parser(get_json(hotel, page_no)):
            save_to_file(record)

        # Pause between page requests.
        sleep(1)

if __name__ == '__main__':
    # Script entry point. The id is stored in the module-level name `hotel`
    # because json_parser() reads that global when building each record.
    hotel = int(input('input the hotel id : '))
    solve(hotel=hotel)




發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章