python爬蟲獲取攜程旅遊景點評分和評論

寫在前面:酒店和旅遊景點的數據獲取方式不同,酒店不能用本文的寫法。如果要獲取酒店數據,請參考:https://blog.csdn.net/qq_34774456/article/details/89885296

旅遊景點代碼(地址要用手機版的攜程搜自己想搜的景點):

import urllib.request
from bs4 import BeautifulSoup
import pymysql.cursors
import requests
import json
import datetime


def deal_json_invaild(data):
    """Rewrite a JSON-like string so stray double quotes no longer break it.

    The text is pushed through three ordered stages of plain ``str.replace``
    calls (order matters, later rules see the output of earlier ones):

    1. Line breaks and tabs are dropped and ``\\"`` is unescaped to ``"``.
    2. Quote sequences that sit next to JSON punctuation (``":"``, ``","``,
       ``{"``, ``"}``, ...) are swapped for placeholder tokens.
    3. Every double quote still present is turned into ``”``, the
       placeholders are turned back into their punctuation, and a final
       clean-up runs. In that stage the ``":"{"`` placeholder comes back as
       ``":{"`` and ``}"`` is collapsed to ``}``, so an object that was
       wrapped in a string value ends up as a nested object.

    Args:
        data: The raw response text.

    Returns:
        The rewritten text.
    """
    # Stage 1: remove line breaks / tabs and unescape \" .
    strip_and_unescape = (
        ("\n", ""),
        ("\r", ""),
        ("\n\r", ""),
        ("\r\n", ""),
        ("\t", ""),
        ("\\\"", "\""),
        ("				", ""),
    )

    # Stage 2: hide the quotes that belong to JSON structure behind tokens.
    # Longer patterns come before the shorter ones they contain.
    mask_structural_quotes = (
        ('":"{"', "**testPasswors**5"),
        ('":"', '&&testPassword&&'),
        ('","', "$$testPassword$$"),
        ('":{"', "**testPasswors**1"),
        ('"},"', "**testPasswors**2"),
        (',"', "**testPasswors**3"),
        ('{"', "@@testPassword@@"),
        ('"}', "**testPassword**"),
        ('":', "**testPasswors**4"),
    )

    # Stage 3: neutralise the leftover quotes, bring the structure back,
    # then repeat the whitespace / escape clean-up on the result.
    unmask_and_tidy = (
        ('"', '”'),
        ("**testPasswors**5", "\":{\""),
        ('&&testPassword&&', '":"'),
        ('$$testPassword$$', '","'),
        ('**testPasswors**1', '":{"'),
        ('**testPasswors**2', '"},"'),
        ('@@testPassword@@', '{"'),
        ('**testPassword**', '"}'),
        ('**testPasswors**3', ',"'),
        ('**testPasswors**4', '":'),
        ('\\"', '\"'),
        (' ', ''),
        ("resourceExtraInfo ", "resourceExtraInfo"),
        ("\n", ""),
        ("\r", ""),
        ("\n\r", ""),
        ("\r\n", ""),
        ("\t", ""),
        (r"\"", "\""),
        ("				", ""),
        ("}\"", "}"),
    )

    for stage in (strip_and_unescape, mask_structural_quotes, unmask_and_tidy):
        for needle, substitute in stage:
            data = data.replace(needle, substitute)
    return data

# HTTP headers sent with every comment request: a browser-style User-Agent
# string and a JSON content type for the POST body.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko",
    "Content-Type": "application/json",
}



# Value sent as "CommentTagId" to pick which comment list is fetched:
# 0 = overall, -11 = positive reviews, -12 = negative reviews.
COMMENT_TAG_ID = 0

# Pages to fetch, as a half-open range(FIRST_PAGE, LAST_PAGE); 10 comments per page.
FIRST_PAGE, LAST_PAGE = 302, 303

# Seconds to wait for the server before giving up, so a stalled connection
# cannot hang the script forever.
REQUEST_TIMEOUT = 30

COMMENT_URL = 'https://m.ctrip.com/restapi/soa2/13444/json/GetCommentListAndHotTagList?_fxpcqlniredt=09031081211299101374&__gw_appid=99999999&__gw_ver=1.0&__gw_from=10650019636&__gw_platform=H5'

# The context manager guarantees data.txt is flushed and closed even when a
# request or the JSON parsing raises part-way through.
with open("data.txt", "w", encoding='utf8') as f:
    for x in range(FIRST_PAGE, LAST_PAGE):
        payload = {
            "CommentResultInfoEntity": {"BusinessId": "16588","BusinessType": 11,"ChannelType": 7,"CommentTagId": COMMENT_TAG_ID,"ImageFilter": 'false',"PageIndex": x,"PageSize": 10,"PoiId": 0,"SortType": 3,"StarType": 0,"TouristType": 0,"VideoImageHeight": 392,"VideoImageWidth": 700},"contentType": "json",
            "head": {"auth": "","cid": "09031081211299101374","ctok": "","cver": "1.0","extension": [{"name": "protocal", "value": "https"}],"lang": "01","sid": "8888","syscode": "09"}
        }
        r = requests.post(COMMENT_URL, json=payload, headers=headers, timeout=REQUEST_TIMEOUT)

        # NOTE(review): the body is parsed as JSON below, yet it is first
        # round-tripped through html.parser exactly as before; confirm whether
        # r.text could be passed to deal_json_invaild directly.
        soup = str(BeautifulSoup(r.text, 'html.parser'))
        a = json.loads(deal_json_invaild(soup))
        # print(a)
        for j, comment in enumerate(a["CommentResult"]["CommentInfo"]):
            # PublishTime arrives in "/Date(...)/" form wrapping a timestamp;
            # characters 6..15 are read as the seconds part.
            # NOTE(review): presumably a millisecond epoch, so its first ten
            # digits are seconds -- confirm against a live response.
            datetime1 = int(comment["PublishTime"][6:16])
            # Timezone-aware replacement for the deprecated utcfromtimestamp();
            # the formatted text is the same UTC wall-clock time.
            dateArray = datetime.datetime.fromtimestamp(datetime1, datetime.timezone.utc)
            otherStyleTime = dateArray.strftime("%Y-%m-%d %H:%M:%S")

            # Both fields stay at the text 'None' when the comment has no user info.
            address, level = 'None', 'None'
            user_info = comment["UserInfoModel"]
            # Trace output: True when the user info is missing or empty.
            print(not user_info)
            if user_info:
                address = str(user_info["UserDistrictName"])  # user's district
                level = str(user_info["MedalName"])  # user's membership level

            dd = "總評分:" +str(comment["TotalStar"]) + "\t評論時間:"+ otherStyleTime +" \t用戶地址:"+address+"  \t用戶會員級別:"+level+" \t 評論:"+ comment["Content"]
            print("第{0}列 第{1}個 {2}".format(x,j,dd))
            f.write(dd+'\n')


    

中間一大堆 replace 是因為攜程返回的 json 有很多不規則的地方,需要替換,像 data:"{}" 這種結構。

發現只能搜到前3000條評論,後面的就搜不到了,不知道是什麼情況。

另外,綜合評價、好評和差評是分開獲取的。

用網頁模擬點擊:

CommentTagId 這個參數:0 是綜合,-11 是好評,-12 是差評

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章