背景:SARIMA,簡單說就是AR+MA+差分+季節性因素+趨勢。所以在statsmodels.tsa.statespace.sarimax.SARIMAX裏邊,用3個參數涵蓋核心設定:order(p,d,q)、seasonal_order(P,D,Q,s)和trend。
SARIMAX的全稱:Seasonal AutoRegressive Integrated Moving Average with eXogenous regressors model
一、步驟的文字描述:
"""準備階段"""
# 第一、定義一個待傳入參數的模型,及模型評分
# 第二、定義一組要測試的參數組合
"""開始"""
# 第三、定義一個函數,記錄各個參數組合及傳入模型後的評分
# 第四、選擇評分最優的一組參數組合,組成模型
# 第五、使用模型
二、代碼:(在原文基礎上有調整)
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
import pandas as pd
import warnings
import joblib
from multiprocessing import cpu_count
def walk_forward_validation(data, n_test, cfg):
    """Score one SARIMA parameter set with walk-forward validation.

    The last ``n_test`` observations of ``data`` are held out. For each
    held-out point a SARIMAX model is fitted on everything seen so far, a
    one-step-ahead forecast is made, and the true observation is then
    appended to the history before the next step.

    Args:
        data: Sequence of observations, oldest first.
        n_test: Number of trailing observations used as the test set.
        cfg: Triple ``(order, seasonal_order, trend)`` passed to SARIMAX.

    Returns:
        The mean squared error between the held-out observations and the
        one-step-ahead forecasts.

    Raises:
        ValueError: If ``n_test`` does not leave both a non-empty training
            set and a non-empty test set.
    """
    # The configuration does not change between steps, so unpack it once.
    order, sorder, trend = cfg
    if not 0 < n_test < len(data):
        # n_test == 0 would make data[:-n_test] empty and data[-n_test:] the
        # whole series, which silently inverts the split.
        raise ValueError('n_test must be between 1 and len(data) - 1')

    train, test = data[:-n_test], data[-n_test:]
    history = list(train)
    predictions = []
    for observed in test:
        model = SARIMAX(history,
                        order=order, seasonal_order=sorder, trend=trend,
                        enforce_stationarity=False,
                        enforce_invertibility=False)
        # fit() returns the results object; predictions come from it, not
        # from the unfitted model.
        model_fit = model.fit(disp=False)
        # start == end == len(history) requests exactly one out-of-sample step.
        yhat = model_fit.predict(len(history), len(history))
        predictions.append(yhat[0])
        history.append(observed)
    return mean_squared_error(test, predictions)
def score_model(data, n_test, cfg, debug=False):
    """Evaluate one parameter set and return it together with its score.

    Args:
        data: Sequence of observations, oldest first.
        n_test: Number of trailing observations used as the test set.
        cfg: Triple ``(order, seasonal_order, trend)`` to evaluate.
        debug: When true, let exceptions and warnings from the evaluation
            propagate instead of suppressing them.

    Returns:
        A ``(key, error)`` tuple where ``key`` is ``str(cfg)`` and ``error``
        is the walk-forward score, or ``None`` if the evaluation failed
        while ``debug`` was false.
    """
    key = str(cfg)
    if debug:
        error = walk_forward_validation(data, n_test, cfg)
    else:
        try:
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore')
                error = walk_forward_validation(data, n_test, cfg)
        except Exception:
            # Best-effort on purpose: a failing parameter set is skipped.
            # ``Exception`` (not a bare except) keeps Ctrl-C working.
            error = None
    if error is not None:
        print(f'> Model{key} {error:.3f}')
    return key, error
def grid_search(data, cfg_list, n_test, parallel=True):
    """Score every parameter set and return the results, best first.

    Args:
        data: Sequence of observations, oldest first.
        cfg_list: Iterable of ``(order, seasonal_order, trend)`` triples.
        n_test: Number of trailing observations used as the test set.
        parallel: When true, evaluate the parameter sets through joblib with
            one job per CPU; otherwise evaluate them sequentially.

    Returns:
        A list of ``(key, error)`` tuples sorted by ascending error.
        Parameter sets whose evaluation failed (error ``None``) are omitted.
    """
    if parallel:
        executor = joblib.Parallel(n_jobs=cpu_count(),
                                   backend='multiprocessing')
        tasks = (joblib.delayed(score_model)(data, n_test, cfg)
                 for cfg in cfg_list)
        scores = executor(tasks)
    else:
        scores = [score_model(data, n_test, cfg) for cfg in cfg_list]
    # Drop failed configurations before sorting; None cannot be ordered
    # against floats.
    scores = [r for r in scores if r[1] is not None]
    scores.sort(key=lambda tup: tup[1])
    return scores
def sarima_config():
    """Build the list of SARIMA parameter sets to try in the grid search.

    Returns:
        A list of ``[(p, d, q), (P, D, Q, s), t]`` entries covering every
        combination of the candidate values below. The trend ``t`` varies
        fastest and ``p`` slowest.
    """
    from itertools import product

    p_params = [0, 1, 2]
    d_params = [0, 1]
    q_params = [0, 1, 2]
    P_params = [0, 1, 2]
    D_params = [0, 1]
    Q_params = [0, 1, 2]
    # Candidate lists get their own names: reusing ``s`` / ``t`` as the loop
    # variables would overwrite the lists after the first pass.
    s_params = [2, 4, 12]
    t_params = ['n', 'c', 't', 'ct']

    return [
        [(p, d, q), (P, D, Q, s), t]
        for p, d, q, P, D, Q, s, t in product(
            p_params, d_params, q_params,
            P_params, D_params, Q_params,
            s_params, t_params,
        )
    ]
if __name__ == '__main__':
    # Entry point: load the series, grid-search the SARIMA parameter sets and
    # print the five best ones. The guard is required because grid_search
    # uses the multiprocessing backend by default.
    df = pd.read_csv('filepath+filename.csv')
    # NOTE(review): df.values keeps every CSV column; presumably the file
    # holds a single value column — confirm against the real data.
    data = df.values
    # The original referenced an undefined ``number_of_test``. 12 is only a
    # placeholder; set it to the number of trailing observations to hold out.
    n_test = 12
    cfg_list = sarima_config()
    scores = grid_search(data, cfg_list, n_test)
    print('Done')
    # Show the five best parameter sets with their scores.
    for cfg, error in scores[:5]:
        print(cfg, error)
三、其他
pmdarima.arima.auto_arima()
可以自動使用訓練數據集得到參數。但是很多人都不用它,應該是因爲它找到的參數並不怎麼好的緣故。
例如,如下的原始數據使用pmdarima.arima.auto_arima(),沒有手工測試得到的好:
直接使用的話並不好
import numpy as np
import pmdarima as pm
import matplotlib.pyplot as plt
from pmdarima.model_selection import train_test_split
import pandas as pd
import warnings
# Demo: let pmdarima pick the SARIMA parameters automatically, then plot the
# forecast for the held-out tail against the full series.
warnings.filterwarnings('ignore')
df = pd.read_csv('path+filename')
y = df.values
train, test = train_test_split(y, train_size=0.7)
# m is the seasonal period, i.e. the s in seasonal_order (P, D, Q, s).
# m=10 was chosen by eyeballing the line plot (x axis of the raw data is time).
model = pm.arima.auto_arima(train, seasonal=True, m=10)
print(model.summary())
forecasts = model.predict(len(test))
# The x axis must span the whole series: y is plotted in full, and the
# forecast occupies the positions after the training part.
x = np.arange(len(y))
plt.plot(x, y, c='black')
plt.plot(x[len(train):], forecasts, c='blue')
plt.show()
結果&圖如下:
作爲對比,從AIC/BIC/HQIC看,使用最上邊網格搜索得到的參數貌似更好些:
代碼如下
from statsmodels.tsa.statespace.sarimax import SARIMAX
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pmdarima.model_selection import train_test_split
import warnings
# Demo: fit SARIMAX with the parameter set found by the manual grid search
# and plot its predictions against the full series.
df = pd.read_csv('path+filename')
y = df.values
x = np.arange(len(y))
warnings.filterwarnings('ignore')
train, test = train_test_split(y, train_size=0.7)
model = SARIMAX(train,
                order=(0, 1, 0),
                seasonal_order=(2, 1, 0, 10),
                trend='n')  # best parameter set from the manual grid search
model_fit = model.fit(disp=False)
print(model_fit.summary())
# Predict from index 1 through 10 steps beyond the end of the full series.
f = model_fit.predict(1, end=len(y) + 10)
plt.plot(x, y, c='black')
# Predictions start at index 1 and ``end`` is inclusive, so the matching
# x positions are 1 .. len(y) + 10.
plt.plot(np.arange(1, len(y) + 11), f, c='blue')
plt.show()