日萌社
人工智能AI:Keras PyTorch MXNet TensorFlow PaddlePaddle 深度學習實戰(不定時更新)
In [1]:
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso
from sklearn.svm import LinearSVR, SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
In [2]:
#沒有用bagging和boosting
#stacking 先用幾個不同的模型做預測 輸出預測值 然後將這幾個模型輸出的預測值作爲特徵來訓練一個新的模型
獲取數據
In [3]:
# Load the engineered one-hot-ready training set and the submission feature set.
# Both reads now use the same "./data/..." path style (the original mixed
# "data/..." and "./data/...", which resolve identically but read inconsistently).
data = pd.read_csv("./data/onehot_feature.csv")
data_test = pd.read_csv("./data/onehot_feature_test.csv")
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 150518 entries, 0 to 150517 Data columns (total 34 columns): Unnamed: 0 150518 non-null int64 時間 150518 non-null int64 小區名 150518 non-null int64 小區房屋出租數量 150518 non-null float64 樓層 150518 non-null int64 總樓層 150518 non-null float64 房屋面積 150518 non-null float64 房屋朝向 150518 non-null object 居住狀態 150518 non-null float64 臥室數量 150518 non-null int64 廳的數量 150518 non-null int64 衛的數量 150518 non-null int64 出租方式 150518 non-null float64 區 150518 non-null float64 位置 150518 non-null float64 地鐵線路 150518 non-null float64 地鐵站點 150518 non-null float64 距離 150518 non-null float64 裝修情況 150518 non-null float64 月租金 150518 non-null float64 log_rent 150518 non-null float64 新朝向 150518 non-null object 房+衛+廳 150518 non-null int64 房/總 150518 non-null float64 衛/總 150518 non-null float64 廳/總 150518 non-null float64 臥室面積 150518 non-null float64 樓層比 150518 non-null float64 戶型 150518 non-null int64 有地鐵 150518 non-null int64 小區線路數 150518 non-null int64 位置線路數 150518 non-null int64 新小區名 150518 non-null int64 小區條數大於100 150518 non-null int64 dtypes: float64(18), int64(14), object(2) memory usage: 39.0+ MB
In [4]:
# Cast the discrete (categorical) feature columns to string so that
# DictVectorizer later one-hot encodes them instead of treating them as numeric.
discrete_cols = ['時間', '新小區名', '居住狀態', '出租方式', '區',
                 '位置', '地鐵線路', '地鐵站點', '裝修情況', '戶型']
data[discrete_cols] = data[discrete_cols].astype(str)
In [5]:
# Feature columns fed to every base model, and the log1p-transformed target.
x_columns = [
    '小區房屋出租數量', '新小區名', '樓層', '總樓層', '房屋面積', '居住狀態', '臥室數量',
    '衛的數量', '位置', '地鐵站點', '距離', '裝修情況',
    '新朝向', '房+衛+廳', '房/總', '衛/總', '廳/總', '臥室面積', '樓層比', '戶型',
    '有地鐵', '小區線路數', '位置線路數', '小區條數大於100',
]
y_label = 'log_rent'

x = data[x_columns]
y = data[y_label]
# Submission set restricted to the same feature columns (upper-case = final test).
X_TEST = data_test[x_columns]
In [6]:
# Split into training / hold-out sets (75/25); fixed random_state for reproducibility.
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.25, random_state=12)
In [7]:
# One-hot encode the (string-typed) categorical features with DictVectorizer.
# Fit on the training split only; the hold-out and submission sets reuse the
# fitted vocabulary so all three matrices share the same feature space.
vector = DictVectorizer(sparse=True)

def _as_records(frame):
    """Return the list-of-dicts representation DictVectorizer consumes."""
    return frame.to_dict(orient='records')

x_train = vector.fit_transform(_as_records(train_x))
x_test = vector.transform(_as_records(test_x))
X_TEST = vector.transform(_as_records(X_TEST))
In [8]:
# Sanity check: all three encoded matrices must share the same column count.
print(x_train.shape, x_test.shape, X_TEST.shape)
(112888, 826) (37630, 826) (46000, 826)
In [9]:
# Dimensionality reduction: keep enough principal components to explain 98%
# of the variance. PCA needs dense input, hence .toarray() on the sparse
# matrices (memory-heavy, but acceptable at this scale).
pca=PCA(0.98)
pca_x_train=pca.fit_transform(x_train.toarray())
pca_x_test=pca.transform(x_test.toarray())
PCA_X_TEST = pca.transform(X_TEST.toarray())
In [10]:
print(pca_x_train.shape, pca_x_test.shape, PCA_X_TEST.shape)
(112888, 361) (37630, 361) (46000, 361)
In [68]:
def rmse(y_true, y_pred):
    """Root-mean-squared error in the original rent scale.

    Both inputs are log1p-transformed rents; np.expm1 (numerically stabler
    than np.exp(x) - 1 for values near zero) maps them back to actual rents
    before scoring.
    """
    y_pred = np.expm1(y_pred)  # back to the real rent scale
    y_true = np.expm1(y_true)
    return np.sqrt(np.mean((y_true - y_pred) ** 2))
構建子模型
構建嶺迴歸模型
In [69]:
%%time
# 1. Grid-search the Ridge regularization strength alpha with 5-fold CV.
# NOTE(review): normalize=True was deprecated in scikit-learn 0.24 and removed
# in 1.2 — on modern versions use a StandardScaler + Ridge pipeline instead.
ridge = Ridge(normalize=True)
params = {
    "alpha": [0.005, 0.01, 1, 5, 10, 20, 50]
}
model1 = GridSearchCV(ridge, param_grid=params, cv=5, n_jobs=-1)
model1.fit(pca_x_train, train_y)
model1.best_params_
# best found: {'alpha': 50, 'fit_intercept': True}
CPU times: user 1.78 s, sys: 705 ms, total: 2.48 s Wall time: 21.5 s
In [70]:
# Refit Ridge with the best alpha found by the grid search above.
ridge = Ridge(alpha=50, normalize=True)
ridge.fit(pca_x_train, train_y)
Out[70]:
Ridge(alpha=50, copy_X=True, fit_intercept=True, max_iter=None, normalize=True, random_state=None, solver='auto', tol=0.001)
In [71]:
# Evaluate the ridge model on both splits (RMSE in the original rent scale).
y_pred_test=ridge.predict(pca_x_test)
y_pred_train=ridge.predict(pca_x_train)
print("訓練集rmse:",rmse(train_y,y_pred_train))
print("測試集rmse:",rmse(test_y,y_pred_test))
訓練集rmse: 6.342657781238426 測試集rmse: 6.493947602276618
構建lasso迴歸
In [72]:
%%time
# 1. Grid-search Lasso hyper-parameters (alpha, intercept) with 5-fold CV.
# NOTE(review): normalize=True is removed in scikit-learn >= 1.2 (see ridge cell).
lasso = Lasso(normalize=True)
params = {
    "alpha": [0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10],
    "fit_intercept": [True, False]
}
model2 = GridSearchCV(lasso, param_grid=params, cv=5, n_jobs=-1)
model2.fit(pca_x_train, train_y)
print(model2.best_params_)
# best found: {'alpha': 0.001, 'fit_intercept': True}
{'alpha': 0.001, 'fit_intercept': True} CPU times: user 1.68 s, sys: 551 ms, total: 2.23 s Wall time: 49.6 s
In [73]:
# Refit Lasso with the best alpha found by the grid search above.
lasso=Lasso(alpha=0.001, normalize=True)
lasso.fit(pca_x_train,train_y)
Out[73]:
Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000, normalize=True, positive=False, precompute=False, random_state=None, selection='cyclic', tol=0.0001, warm_start=False)
In [74]:
%%time
# Evaluate the lasso model on both splits.
y_pred_test=lasso.predict(pca_x_test)
y_pred_train=lasso.predict(pca_x_train)
print("訓練集rmse:",rmse(train_y,y_pred_train))
print("測試集rmse:",rmse(test_y,y_pred_test))
訓練集rmse: 6.385065714494761 測試集rmse: 6.53676743372339 CPU times: user 393 ms, sys: 47.4 ms, total: 440 ms Wall time: 87.1 ms
構建隨機森林
In [75]:
%%time
# 1. Grid-search the random forest over split/leaf sizes.
rf = RandomForestRegressor(max_features='sqrt') # max_features='sqrt' keeps the search tractable
params = {
    "n_estimators": [200], # earlier, wider grid: [200, 500, 700]
    "max_depth": [50], # earlier, wider grid: [40, 50, 60]
    "min_samples_split": [20, 50, 100],
    "min_samples_leaf": [10, 20, 30]
}
model3 = GridSearchCV(rf, param_grid=params, cv=5, n_jobs=-1, verbose=2)
model3.fit(pca_x_train, train_y)
print(model3.best_params_)
# best found: {'max_depth': 50,
#              'min_samples_leaf': 10,
#              'min_samples_split': 20,
#              'n_estimators': 200}
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers. [Parallel(n_jobs=-1)]: Done 25 tasks | elapsed: 55.7min [Parallel(n_jobs=-1)]: Done 45 out of 45 | elapsed: 81.1min finished
{'max_depth': 50, 'min_samples_leaf': 10, 'min_samples_split': 20, 'n_estimators': 200} CPU times: user 10min 4s, sys: 8.96 s, total: 10min 13s Wall time: 1h 31min 30s
In [76]:
%%time
# Fit the final forest with the searched depth/split/leaf values.
# NOTE(review): the grid search above ran with max_features='sqrt', but the
# final model uses max_features=0.8 — confirm this was intentional.
rf=RandomForestRegressor(n_estimators=200,
                         max_features=0.8,
                         max_depth=50,
                         min_samples_split=20,
                         min_samples_leaf=10,
                         n_jobs=-1)
rf.fit(pca_x_train,train_y)
CPU times: user 3h 34min 3s, sys: 1min 29s, total: 3h 35min 32s Wall time: 33min 4s
In [77]:
%%time
# Evaluate the random forest on both splits.
y_pred_test=rf.predict(pca_x_test)
y_pred_train=rf.predict(pca_x_train)
print("訓練集rmse:",rmse(train_y,y_pred_train))
print("測試集rmse:",rmse(test_y,y_pred_test))
訓練集rmse: 2.133144119124377 測試集rmse: 2.7950254213867094 CPU times: user 24.4 s, sys: 465 ms, total: 24.9 s Wall time: 4.53 s
構建決策樹
In [78]:
%%time
# Grid-search a single decision tree (grids narrowed after earlier runs).
tree=DecisionTreeRegressor()
params={
    "max_depth":[60], # earlier, wider grid: [40,50,60,70]
    "min_samples_split":[5], # earlier, wider grid: [5,10,20,30,40,50]
    "min_samples_leaf":[5], # earlier, wider grid: [2,3,5,7,9,11]
}
model4=GridSearchCV(tree,param_grid=params,cv=5,n_jobs=-1)
model4.fit(pca_x_train,train_y)
print(model4.best_params_)
# best from an earlier, wider run: {'max_depth': 60, 'min_samples_leaf': 2, 'min_samples_split': 5}
{'max_depth': 60, 'min_samples_leaf': 5, 'min_samples_split': 5} CPU times: user 1min 34s, sys: 2.06 s, total: 1min 36s Wall time: 3min 26s
In [79]:
%%time
from sklearn.tree import DecisionTreeRegressor
#利用搜索出的最優參數構建模型
tree=DecisionTreeRegressor(max_depth=60,min_samples_leaf=2,min_samples_split=5)
tree.fit(pca_x_train,train_y)
CPU times: user 1min 36s, sys: 1.48 s, total: 1min 38s Wall time: 1min 40s
In [80]:
%%time
# Evaluate the decision tree on both splits. The large train/test RMSE gap
# recorded below (0.81 vs 2.67) indicates heavy overfitting.
y_pred_test=tree.predict(pca_x_test)
y_pred_train=tree.predict(pca_x_train)
print("訓練集rmse:",rmse(train_y,y_pred_train))
print("測試集rmse:",rmse(test_y,y_pred_test))
訓練集rmse: 0.805142479875888 測試集rmse: 2.6702036461919856 CPU times: user 254 ms, sys: 123 ms, total: 377 ms Wall time: 380 ms
In [81]:
# Scatter true vs. predicted rents for the decision tree; points close to the
# diagonal indicate accurate predictions.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 10), dpi=100)
ax.scatter(test_y, y_pred_test)
ax.set_xlabel("真實值")
ax.set_ylabel("預測值")
plt.show()
構建支持向量機
In [ ]:
# %%time
# # 1.參數搜索----數據量大 svm太耗時,調參幾乎不可能
# svr=SVR()
# params={
# "gamma":[0.001,0.01,0.1,0.5,1,5],
# "C":[0.001,0.1,0.5,1,5]
# }
# model5=GridSearchCV(svr,param_grid=params,cv=5,n_jobs=-1,verbose=10)
# # verbose:日誌冗長度,int:冗長度,0:不輸出訓練過程,1:偶爾輸出,>1:對每個子模型都輸出。
# model5.fit(pca_x_train,train_y)
# model5.best_params_
In [ ]:
# %%time
# # 隨意選一組參數 --- 耗時太長 放棄該模型
# svr=SVR(gamma=0.1,C=0.5)
# svr.fit(pca_x_train,train_y)
# y_pred=svr.predict(pca_x_test)
# print(rmse(test_y,y_pred))
構建xgboost模型
In [82]:
%%time
import xgboost as xgb
xgbr = xgb.XGBRegressor(objective='reg:linear', learning_rate=0.1, gamma=0.05, max_depth=45,
min_child_weight=0.5, subsample=0.6, reg_alpha=0.5, reg_lambda=0.8, colsample_bytree=0.5, n_jobs=-1)
xgbr.fit(pca_x_train, train_y)
y_pred = xgbr.predict(pca_x_test)
print(rmse(test_y,y_pred))
/Users/sherwin/anaconda3/lib/python3.6/site-packages/xgboost/core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version if getattr(data, 'base', None) is not None and \
[12:23:28] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror. 2.1601162492127104 CPU times: user 28min 30s, sys: 24.2 s, total: 28min 54s Wall time: 29min 29s
In [83]:
%%time
# Evaluate the xgboost model on both splits.
y_pred_test=xgbr.predict(pca_x_test)
y_pred_train=xgbr.predict(pca_x_train)
print("訓練集rmse:",rmse(train_y,y_pred_train))
print("測試集rmse:",rmse(test_y,y_pred_test))
訓練集rmse: 0.9609658477710833 測試集rmse: 2.1601162492127104 CPU times: user 10 s, sys: 427 ms, total: 10.4 s Wall time: 10.6 s
In [84]:
# Scatter true vs. predicted rents for the xgboost model.
import matplotlib.pyplot as plt
plt.figure(figsize=(10,10),dpi=100)
plt.scatter(test_y,y_pred_test)
plt.xlabel("真實值")
plt.ylabel("預測值")
plt.show()
Stacking融合
構建Stacking模型需要的數據
In [86]:
%%time
# 獲取每個子模型的預測結果作爲特徵
# 訓練特徵
train_features=[]
train_features.append(ridge.predict(pca_x_train)) # 將每個模型預測值保存起來
train_features.append(lasso.predict(pca_x_train))
# train_features.append(svr.predict(pca_x_train)) # 這個太慢了 不要了
train_features.append(rf.predict(pca_x_train))
train_features.append(tree.predict(pca_x_train))
train_features.append(xgbr.predict(pca_x_train))
# 測試特徵
test_features=[]
test_features.append(ridge.predict(pca_x_test))
test_features.append(lasso.predict(pca_x_test))
# test_features.append(svr.predict(pca_x_test))
test_features.append(rf.predict(pca_x_test))
test_features.append(tree.predict(pca_x_test))
test_features.append(xgbr.predict(pca_x_test))
# 提交結果特徵
TEST_FEATURES=[]
TEST_FEATURES.append(ridge.predict(PCA_X_TEST))
TEST_FEATURES.append(lasso.predict(PCA_X_TEST))
# TEST_FEATURES.append(svr.predict(PCA_X_TEST))
TEST_FEATURES.append(rf.predict(PCA_X_TEST))
TEST_FEATURES.append(tree.predict(PCA_X_TEST))
TEST_FEATURES.append(xgbr.predict(PCA_X_TEST))
CPU times: user 42.1 s, sys: 1.49 s, total: 43.6 s Wall time: 20.3 s
In [87]:
train_features
Out[87]:
[array([2.04715431, 2.05232901, 2.04572967, ..., 2.04659472, 2.04508413, 2.05562638]), array([2.05200758, 2.05200758, 2.05200758, ..., 2.05200758, 2.05200758, 2.05200758]), array([1.67325566, 1.94499122, 1.85460452, ..., 1.92275812, 1.76267895, 2.22438597]), array([1.59023952, 1.84714777, 1.85130219, ..., 1.96150612, 1.77317884, 2.23207518]), array([1.6343094, 1.9145248, 1.8356705, ..., 1.9381661, 1.7626299, 2.2465973], dtype=float32)]
In [88]:
test_features
Out[88]:
[array([2.04925512, 2.04865288, 2.04878586, ..., 2.07295592, 2.05666692, 2.0560697 ]), array([2.05200758, 2.05200758, 2.05200758, ..., 2.05200758, 2.05200758, 2.05200758]), array([1.93842148, 1.71689679, 1.71233925, ..., 3.7684956 , 2.1988801 , 2.15518207]), array([1.93762954, 1.71991266, 1.59023952, ..., 3.92681962, 2.1296814 , 2.08786427]), array([1.9394264, 1.6995616, 1.8815998, ..., 3.7348156, 2.2026072, 2.1582646], dtype=float32)]
In [89]:
# Assemble the per-model prediction vectors as columns: one row per sample,
# one column per base model (np.column_stack == np.vstack(...).T for 1-D inputs).
mx_train = np.column_stack(train_features)
mx_test = np.column_stack(test_features)
MX_TEST = np.column_stack(TEST_FEATURES)
MX_TEST.shape
Out[89]:
(46000, 5)
Stacking模型訓練
In [90]:
%%time
# Stacking level 1: ridge regression over the base-model predictions.
# fit_intercept=False makes the meta-model a pure weighted blend.
stack_model=Ridge(fit_intercept=False)
params={
    "alpha":np.logspace(-2,3,20)
}
model=GridSearchCV(stack_model,param_grid=params,cv=5,n_jobs=-1)
model.fit(mx_train,train_y)
print(model.best_params_)
{'alpha': 0.06158482110660264} CPU times: user 580 ms, sys: 439 ms, total: 1.02 s Wall time: 3.47 s
In [91]:
%%time
# Fit the final stacking model and score both splits.
# NOTE(review): alpha=0.379269 does not match the grid-search result printed
# above (alpha ≈ 0.0616) — confirm which value was intended.
stack_model=Ridge(alpha=0.379269,fit_intercept=False)
stack_model.fit(mx_train,train_y)
y_pred=stack_model.predict(mx_test)
y_pred_train=stack_model.predict(mx_train)
print("訓練集rmse:",rmse(train_y,y_pred_train))
print("測試集rmse:",rmse(test_y,y_pred))
訓練集rmse: 0.7337935133190991 測試集rmse: 2.3272631885188044 CPU times: user 30.8 ms, sys: 9.28 ms, total: 40.1 ms Wall time: 13.2 ms
In [92]:
# Blend weights the stacking ridge assigned to each base model
# (order: ridge, lasso, rf, tree, xgbr).
stack_model.coef_
Out[92]:
array([-0.1330147 , 0.13235901, -0.15773228, 0.6991465 , 0.45928745])
提交結果輸出
In [96]:
# Predict log-rent for the submission set and invert the log1p transform.
Y_PRED_TEST = stack_model.predict(MX_TEST)
Y_PRED_TEST = np.exp(Y_PRED_TEST) - 1
print(Y_PRED_TEST)
# Build the submission frame with a 1-based id column.
# (The original cell also rebound the module-level name `data` — the training
# DataFrame — to an unused range(); that dead, clobbering assignment is removed.)
Y_PRED = pd.DataFrame(data=Y_PRED_TEST, columns=["月租金"])
Y_PRED["id"] = range(1, Y_PRED.shape[0] + 1)
Y_PRED.head()
[6.2493489 5.12626054 8.64297508 ... 3.59608672 1.05481017 4.8740706 ]
Out[96]:
月租金 | id | |
---|---|---|
0 | 6.249349 | 1 |
1 | 5.126261 | 2 |
2 | 8.642975 | 3 |
3 | 8.885262 | 4 |
4 | 4.482541 | 5 |
In [97]:
# Persist the stacked predictions for submission.
# NOTE(review): the DataFrame index is written as an extra unnamed column;
# consider index=False if the grader expects only 月租金 and id.
Y_PRED.to_csv("./data/Y_PRED_STACK.csv")
模型保存
In [98]:
# Persist the fitted stacking model.
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; import joblib directly instead.
import joblib

joblib.dump(stack_model, "./data/stack_model.kpl")
Out[98]:
['./data/stack_model.kpl']