前言:
因爲疫情原因,在家閒得無聊,看了一下機票,發現廈門飛往武漢只要200元,於是我萌生了爬取機票信息的念頭
目標:
爬取未來40天全國飛往廈門的機票價格及航班信息
Ajax介紹:
AJAX = Asynchronous JavaScript and XML(異步的 JavaScript 和 XML)。
AJAX 是一種在無需重新加載整個網頁的情況下,能夠更新部分網頁的技術。
起始URL:
https://flights.ctrip.com/itinerary/oneway/sjw-xmn?date=2020-03-25
明確爬取數據信息:
通過 網頁檢查–> network --> XHR 頭部 可以看到機票信息的 api
https://flights.ctrip.com/itinerary/api/12808/products
這個api網址無法直接在瀏覽器中打開,不過它依舊有數據傳送,我們只能通過模擬請求頭部並發送 payload 來獲取(payload 是請求中以 JSON 格式傳輸的數據)
發現有一個products 的文件 (如果沒有 請嘗試 F5 刷新頁面)
我們可以看到Json數據都在response當中,也就是我們想要的航班信息,機票價格都是通過這個文件傳送的.
我們複製這行Json數據在其他軟件展開便於觀察內容,(或者網頁搜索:Json在線解析)
首先是航班信息:
往下有機票價格:
我們看到機票價格分了很多種類,有售價,打折的價格,我們就爬取第一個price就好了,經過觀察大部分價格都是一樣的
我們已經明確的要爬取的信息了, 接下來就開始抓取數據吧
先獲取一天一個城市的航班信息
獲取Json 信息:
import requests
import json
from fake_useragent import UserAgent
if __name__ == "__main__":
    # Fetch one day of SJW -> XMN flights and print the raw JSON response.
    # The URL must be written out in full (scheme and host included).
    url = "https://flights.ctrip.com/itinerary/api/12808/products/oneway/sjw,sjw-xmn?date=2020-03-25"
    headers = {
        "User-Agent": UserAgent().random,  # random User-Agent for this run
        "Referer": "https://flights.ctrip.com/itinerary/oneway/sjw-xmn?date=2020-03-25",
        "Content-Type": "application/json",
    }
    request_payload = {
        "flightWay": "Oneway",
        "classType": "ALL",
        "hasChild": False,
        "hasBaby": False,
        "searchIndex": 1,
        "airportParams": [
            {"dcity": "SJW", "acity": "XMN", "dcityname": "石家莊", "acityname": "廈門",
             "date": "2020-03-25", "dcityid": 428},
        ],
        # Placeholder: replace this string with the token copied from the
        # request payload shown in the browser's network panel.
        "token": "從頭部獲取的token,寫在這裏",
    }
    # json.dumps encodes the payload dict as a JSON string for the POST body;
    # the timeout keeps a stalled connection from hanging the script.
    response = requests.post(url, data=json.dumps(request_payload), headers=headers, timeout=30).text
    print(response)
Json 提取信息
# POST the search payload; the timeout keeps a stalled connection from hanging the script.
# NOTE: no delay is applied here; when looping over many requests, pause between
# them (e.g. time.sleep) to avoid hitting the site too quickly.
response = requests.post(url, data=json.dumps(request_payload), headers=headers, timeout=30).text
# json.loads decodes the JSON text into Python objects (json.dumps did the reverse).
# "data" or "routeList" can be missing when nothing comes back, so fall back to empties.
routeList = (json.loads(response).get('data') or {}).get('routeList') or []
# Walk through every returned route.
for route in routeList:
    legs = route.get('legs') or []
    # Only routes with exactly one leg are reported; anything else is skipped.
    if len(legs) != 1:
        continue
    flight = legs[0].get('flight')
    if not flight:
        continue
    departure_info = flight.get('departureAirportInfo') or {}
    arrival_info = flight.get('arrivalAirportInfo') or {}
    # Pull out the fields of interest.
    airlineName = flight.get('airlineName')
    flightNumber = flight.get('flightNumber')
    craftTypeName = flight.get('craftTypeName')
    departureDate = flight.get('departureDate')
    arrivalDate = flight.get('arrivalDate')
    departureCityName = departure_info.get('cityName')
    departureAirportName = departure_info.get('airportName')
    departureterminal = (departure_info.get('terminal') or {}).get('name')
    arrivalCityName = arrival_info.get('cityName')
    arrivalAirportName = arrival_info.get('airportName')
    arrivalterminal = (arrival_info.get('terminal') or {}).get('name')
    # Only the first cabin's price is taken.
    cabins = legs[0].get('cabins') or []
    price = (cabins[0].get('price') or {}).get('price') if cabins else None
    print(airlineName, "\t",
          flightNumber, "\t",
          price, "\t",
          departureDate, "\t",
          arrivalDate, "\t",
          craftTypeName, "\t",
          departureCityName, "\t",
          departureAirportName, "\t",
          departureterminal, "\t",
          arrivalCityName, "\t",
          arrivalAirportName, "\t",
          arrivalterminal)
然而只獲取一天的數據並沒有什麼參考價值,接下來我們獲取未來40天的航班信息
未來40天爬取
首先我們先構造一個日期遍歷的方法
def gen_dates(start_date, day_counts):
    """Yield ``day_counts`` consecutive values one day apart, starting at ``start_date``.

    :param start_date: first value to yield (a date or datetime)
    :param day_counts: how many values to yield; zero or less yields nothing
    :return: generator producing start_date, start_date + 1 day, ...
    """
    # timedelta represents the difference between two points in time;
    # days=1 is a gap of exactly one day.
    one_day = timedelta(days=1)
    for offset in range(day_counts):
        yield start_date + one_day * offset
def get_date_list(start_date, day_counts=40):
    """Return ``day_counts`` consecutive dates as "YYYY-MM-DD" strings.

    :param start_date: first day wanted, as a naive datetime.datetime; a value
        earlier than the current time is replaced by the current time
    :param day_counts: number of consecutive days to return (default 40)
    :return: list of date strings, earliest first
    """
    # Never start in the past: use whichever is later, the requested start or now.
    start = max(start_date, datetime.datetime.now())
    return [d.strftime("%Y-%m-%d") for d in gen_dates(start, day_counts)]
if __name__ == "__main__":
    # strptime returns a datetime.datetime, which get_date_list compares with now().
    start_date = datetime.datetime.strptime("2020-03-25", "%Y-%m-%d")
    # Build the list of date strings and show it.
    date_data = get_date_list(start_date)
    print(date_data)
返回的data是一個日期列表,這樣我們就能自定義起始日期,獲取未來40天的航班信息!
當然,一個城市怎麼夠呢, 我們可以根據不同出發城市的頭部獲取更多的航班信息
多個城市航班信息
我們只需要把各個城市的信息放在一個列表裏,後面通過 format 自動填入即可
# One entry per departure city; every entry flies to XMN and carries its own token.
# The "date" value is a "{}" placeholder.
cities_data = [
    {
        "dcity": "SJW",
        "acity": "XMN",
        "dcityname": "石家莊",
        "acityname": "廈門",
        "date": "{}",
        "dcityid": 428,
        "token": "*********************",
    },
    {
        "dcity": "BJS",
        "acity": "XMN",
        "dcityname": "北京",
        "acityname": "廈門",
        "date": "{}",
        "dcityid": 1,
        "token": "**********************",
    },
]
寫入 CSV
# Name the file after the start date only; formatting the datetime itself would
# put "00:00:00" (a space and colons) into the file name.
path = "/home/liuyang/Spider/Scrapy_Project/ScrapyS/Airtickets/TO_XMN{}.csv".format(start_date.date())
# Append so earlier rows are kept. newline="" lets the csv module control line
# endings, and utf-8-sig writes a BOM at the start of a new file.
# The with-statement closes the file, so no explicit close() is needed.
with open(path, "a", encoding="utf-8-sig", newline="") as f:
    csv_write = csv.writer(f, dialect="excel")
    csv_data = [airlineName, flightNumber,
                departureCityName, departureAirportName, departureterminal, departureDate,
                arrivalDate, arrivalCityName, arrivalAirportName, arrivalterminal,
                price, craftTypeName]
    csv_write.writerow(csv_data)
完整代碼
import csv
import requests
import json
import datetime
from datetime import timedelta
from fake_useragent import UserAgent
def gen_dates(start_date, day_counts):
    """Generate ``day_counts`` consecutive days beginning with ``start_date``.

    :param start_date: first value to yield (a date or datetime)
    :param day_counts: number of values to yield; zero or less yields nothing
    :return: generator of start_date, start_date + 1 day, start_date + 2 days, ...
    """
    # A timedelta is the difference between two moments; days=1 means one day apart.
    step = timedelta(days=1)
    for i in range(day_counts):
        yield start_date + step * i
def get_date_list(start_date, day_counts=40):
    """Return ``day_counts`` consecutive dates as "YYYY-MM-DD" strings.

    :param start_date: first day wanted, as a naive datetime.datetime; a value
        earlier than the current time is replaced by the current time
    :param day_counts: number of consecutive days to return (default 40)
    :return: list of date strings, earliest first
    """
    # Never start in the past: use whichever is later, the requested start or now.
    start = max(start_date, datetime.datetime.now())
    return [d.strftime("%Y-%m-%d") for d in gen_dates(start, day_counts)]
# Departure cities to crawl. Each entry holds the city codes and names sent in
# the request payload plus the token to use for that city.
cities_data = [
    {
        "dcity": "SJW",
        "acity": "XMN",
        "dcityname": "石家莊",
        "acityname": "廈門",
        "date": "{}",
        "dcityid": 428,
        "token": "********你的token令牌************",
    },
    {
        "dcity": "BJS",
        "acity": "XMN",
        "dcityname": "北京",
        "acityname": "廈門",
        "date": "{}",
        "dcityid": 1,
        "token": "***********************",
    },
]
# Fields written to each CSV row, in column order.
CSV_COLUMNS = (
    "airlineName", "flightNumber",
    "departureCityName", "departureAirportName", "departureterminal", "departureDate",
    "arrivalDate", "arrivalCityName", "arrivalAirportName", "arrivalterminal",
    "price", "craftTypeName",
)
# The same fields in the order they are echoed to the console.
PRINT_COLUMNS = (
    "airlineName", "flightNumber", "price", "departureDate", "arrivalDate", "craftTypeName",
    "departureCityName", "departureAirportName", "departureterminal",
    "arrivalCityName", "arrivalAirportName", "arrivalterminal",
)


def extract_flight_info(route):
    """Flatten one route of the products response into a dict of flight fields.

    :param route: one element of the response's ``routeList``
    :return: dict keyed by the names in CSV_COLUMNS, or None when the route
        does not have exactly one leg or that leg carries no flight data.
        Fields absent from the response are returned as None.
    """
    legs = route.get('legs') or []
    if len(legs) != 1:
        return None
    leg = legs[0]
    flight = leg.get('flight')
    if not flight:
        return None
    departure = flight.get('departureAirportInfo') or {}
    arrival = flight.get('arrivalAirportInfo') or {}
    # Only the first cabin's price is recorded.
    cabins = leg.get('cabins') or []
    price = (cabins[0].get('price') or {}).get('price') if cabins else None
    return {
        "airlineName": flight.get('airlineName'),
        "flightNumber": flight.get('flightNumber'),
        "craftTypeName": flight.get('craftTypeName'),
        "departureCityName": departure.get('cityName'),
        "departureAirportName": departure.get('airportName'),
        # "terminal" may be missing or None, so guard before reading its name.
        "departureterminal": (departure.get('terminal') or {}).get('name'),
        "departureDate": flight.get('departureDate'),
        "arrivalCityName": arrival.get('cityName'),
        "arrivalAirportName": arrival.get('airportName'),
        "arrivalterminal": (arrival.get('terminal') or {}).get('name'),
        "arrivalDate": flight.get('arrivalDate'),
        "price": price,
    }


if __name__ == "__main__":
    start_date = datetime.datetime.strptime("2020-03-25", "%Y-%m-%d")  # a datetime.datetime
    date_data = get_date_list(start_date)
    # Name the file after the start date only; formatting the datetime itself
    # would put "00:00:00" (a space and colons) into the file name.
    path = "/home/liuyang/Spider/Scrapy_Project/ScrapyS/Airtickets/TO_XMN{}.csv".format(start_date.date())
    # Build the User-Agent source once; a fresh random value is drawn per request.
    user_agent = UserAgent()
    # Open the output once for the whole crawl. Append mode keeps earlier rows,
    # newline="" lets the csv module control line endings, and utf-8-sig writes
    # a BOM at the start of a new file.
    with open(path, "a", encoding='utf-8-sig', newline='') as f:
        csv_write = csv.writer(f, dialect="excel")
        for city_data in cities_data:
            dcity = city_data.get('dcity')
            for day in date_data:
                # The URL must be complete (scheme and host), not just a path.
                route_path = "oneway/{},{}-xmn?date={}".format(dcity, dcity, day)
                url = "https://flights.ctrip.com/itinerary/api/12808/products/" + route_path
                headers = {
                    'User-Agent': user_agent.random,
                    'Referer': "https://flights.ctrip.com/itinerary/" + route_path,
                    "Content-Type": "application/json",
                }
                request_payload = {
                    "flightWay": "Oneway",
                    "classType": "ALL",
                    "hasChild": False,
                    "hasBaby": False,
                    "searchIndex": 1,
                    "airportParams": [
                        {"dcity": "{}".format(dcity),
                         "acity": "XMN",
                         "dcityname": "{}".format(city_data.get('dcityname')),
                         "acityname": "廈門",
                         "date": "{}".format(day),
                         # Kept as a string, matching the payload this script has always sent.
                         "dcityid": "{}".format(city_data.get('dcityid'))},
                    ],
                    "token": "{}".format(city_data.get('token')),
                }
                # One failed request or unparsable body should not abort the
                # whole crawl: report it and move on to the next day.
                try:
                    # json.dumps encodes the payload as JSON text for the POST body.
                    response = requests.post(url, data=json.dumps(request_payload),
                                             headers=headers, timeout=30).text
                    # json.loads decodes the JSON response into Python objects.
                    data = json.loads(response).get('data') or {}
                except (requests.RequestException, ValueError) as err:
                    print("skipped {} {}: {}".format(dcity, day, err))
                    continue
                # "routeList" can be missing when no flights are returned.
                for route in data.get('routeList') or []:
                    flight_info = extract_flight_info(route)
                    if flight_info is None:
                        continue
                    csv_write.writerow([flight_info[name] for name in CSV_COLUMNS])
                    print(*(flight_info[name] for name in PRINT_COLUMNS), sep=" \t ")
結果展示
這裏還有簡單的數據分析啦
https://blog.csdn.net/weixin_44355591/article/details/105008238
最後歡迎各位訪問我的博客網站:wangwanghub.com