文章目錄
該比賽爲DC練習賽,要求使用邏輯迴歸的方法,從給定的影響員工離職的因素和員工是否離職的記錄,建立一個邏輯迴歸模型預測有可能離職的員工,‘Attrition’爲預測目標。
本文僅是練習記錄,最後還是沒達到90%,歡迎探討。
1、數據探索
from scipy import stats
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# import pyecharts.options as opts
# from pyecharts.charts import Line,Bar
from scipy.stats import chi2_contingency
# Load the competition training and test sets from the working directory.
train=pd.read_csv('pfm_train.csv')
test=pd.read_csv('pfm_test.csv')
# Preview the first five rows (notebook display).
train.head(5)
Age | Attrition | BusinessTravel | Department | DistanceFromHome | Education | EducationField | EmployeeNumber | EnvironmentSatisfaction | Gender | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 37 | 0 | Travel_Rarely | Research & Development | 1 | 4 | Life Sciences | 77 | 1 | Male | ... | 3 | 80 | 1 | 7 | 2 | 4 | 7 | 5 | 0 | 7 |
1 | 54 | 0 | Travel_Frequently | Research & Development | 1 | 4 | Life Sciences | 1245 | 4 | Female | ... | 1 | 80 | 1 | 33 | 2 | 1 | 5 | 4 | 1 | 4 |
2 | 34 | 1 | Travel_Frequently | Research & Development | 7 | 3 | Life Sciences | 147 | 1 | Male | ... | 4 | 80 | 0 | 9 | 3 | 3 | 9 | 7 | 0 | 6 |
3 | 39 | 0 | Travel_Rarely | Research & Development | 1 | 1 | Life Sciences | 1026 | 4 | Female | ... | 3 | 80 | 1 | 21 | 3 | 3 | 21 | 6 | 11 | 8 |
4 | 28 | 1 | Travel_Frequently | Research & Development | 1 | 3 | Medical | 1111 | 1 | Male | ... | 1 | 80 | 2 | 1 | 2 | 3 | 1 | 0 | 0 | 0 |
5 rows × 31 columns
# 'Attrition' is the prediction target.
# Show the dataset shape, the class counts and the mean of the target
# (i.e. the share of rows labelled 1).
print(train.shape)
print(train.Attrition.value_counts())
print(train.Attrition.mean())
(1100, 31)
0 922
1 178
Name: Attrition, dtype: int64
0.1618181818181818
1.1 各特徵總量及百分比
# Plot the target rate against each continuous feature.
def countinuous_get_fig(data,cols,target):
    """Draw one line chart per column showing the mean of ``target`` per feature value.

    Args:
        data: DataFrame holding the feature columns and the target column.
        cols: Iterable of column names; each gets its own figure.
        target: Name of the target column whose group mean is plotted.
    """
    for col in cols:
        # Mean of the target within each distinct value of the feature.
        rate = data.groupby([col])[target].mean()
        fig, ax = plt.subplots()
        # The line needs a label: legend() without any labelled artist
        # produces an empty legend.
        ax.plot(rate.index, rate, c='r', label='mean %s' % target)
        ax.set_title(col)
        ax.legend()
# Plot group sizes and the target rate for each categorical feature.
def get_fig(data,cols,target):
    """Draw, per column, a bar chart of group sizes overlaid with the target rate.

    The rate line is rescaled by the largest group size so that it shares the
    y-axis with the bars; the text above each point shows the rate in percent.

    Args:
        data: DataFrame holding the feature columns and the target column.
        cols: Iterable of column names; each gets its own figure.
        target: Name of the target column.
    """
    for col in cols:
        grouped = data.groupby([col])[target]
        counts = grouped.count()
        rates = grouped.mean()
        labels = rates.index
        fig, ax = plt.subplots()
        ax.bar(labels, counts, color='c', alpha=0.7, label='count')
        # Rescale the rate so it can be drawn on the count axis.
        tallest = max(counts)
        scaled_rates = rates * tallest
        for x, n in zip(labels, counts):
            ax.text(x, n+0.05, '%.0f' % n, ha='center', va='bottom', fontsize=10)
        # Spell the style out with keywords. The former 'go-' format string
        # asked for a green solid line while the keywords asked for red dashed.
        ax.plot(labels, scaled_rates, color='r', marker='o', linestyle='--',
                label='%s rate (%%)' % target)
        for x, y in zip(labels, scaled_rates):
            # Undo the rescaling to print the rate as a percentage.
            ax.text(x, y+0.05, '%.2f' % (y/tallest*100), ha='center', va='bottom', fontsize=10)
        plt.xticks(rotation=45)
        ax.set_title(col)
        # Both artists carry labels, so the legend is no longer empty.
        ax.legend()
# get_fig(train,['BusinessTravel','Department'],'Attrition')
1.2 heatmap
# Correlation heatmap over the numeric columns only: the frame also holds
# string columns (e.g. BusinessTravel), which cannot be correlated and which
# newer pandas releases reject instead of silently skipping.
corr = train.select_dtypes(include='number').corr()
plt.figure(figsize=(16,10))
sns.heatmap(corr,annot=True)
存在共線性特徵:‘MonthlyIncome’ 與 ‘JobLevel’,保留其中與預測值相關性較大的一個,刪除 ‘MonthlyIncome’。另外發現有特徵只存在單一取值,下面繼續確認一下;如果確認屬實,需要刪除 [‘Over18’, ‘StandardHours’]。
# Detect features that hold only a single distinct value.
def singel_value_feature(data,cols):
    """Return the names in ``cols`` whose column in ``data`` has exactly one unique value."""
    return [name for name in cols if len(data[name].unique()) == 1]
singel_value_feature(train,train.columns)
['Over18', 'StandardHours']
2、特徵選擇&處理
2.1 特徵分類
首先對特徵進行分類,連續或分類的(設定返回len大於10的特徵,供確認),經查看,將’NumCompaniesWorked’作爲分類變量。
# Split the columns by cardinality: 11 or more distinct values counts as
# continuous, anything below that as categorical.
categorical={}
countinuous={}
for name in train.columns:
    n_unique=len(train[name].unique())
    bucket=countinuous if n_unique>=11 else categorical
    bucket[name]=n_unique
# Dicts keep insertion order, so the name lists follow the column order.
list_counti=list(countinuous)
list_cate=list(categorical)
print(countinuous)
print(categorical)
{'Age': 43, 'DistanceFromHome': 29, 'EmployeeNumber': 1100, 'MonthlyIncome': 1028, 'PercentSalaryHike': 15, 'TotalWorkingYears': 40, 'YearsAtCompany': 35, 'YearsInCurrentRole': 19, 'YearsSinceLastPromotion': 16, 'YearsWithCurrManager': 18}
{'Attrition': 2, 'BusinessTravel': 3, 'Department': 3, 'Education': 5, 'EducationField': 6, 'EnvironmentSatisfaction': 4, 'Gender': 2, 'JobInvolvement': 4, 'JobLevel': 5, 'JobRole': 9, 'JobSatisfaction': 4, 'MaritalStatus': 3, 'NumCompaniesWorked': 10, 'Over18': 1, 'OverTime': 2, 'PerformanceRating': 2, 'RelationshipSatisfaction': 4, 'StandardHours': 1, 'StockOptionLevel': 4, 'TrainingTimesLastYear': 7, 'WorkLifeBalance': 4}
# Drop EmployeeNumber and MonthlyIncome from the continuous features.
# They are removed by name rather than by list position, so the result no
# longer depends on the column order of the input file.
for name in ('EmployeeNumber','MonthlyIncome'):
    if name in list_counti:
        list_counti.remove(name)
len(list_counti)
8
# Drop the two single-valued columns and the target from the categorical features.
for name in ('Over18','StandardHours','Attrition'):
    list_cate.remove(name)
len(list_cate)
18
# for i in list_counti:
# sns.distplot(train[i])
# plt.show()
# countinuous_get_fig(train,list_counti,'Attrition')
# train[train.PercentSalaryHike==24]
# get_fig(train,list_cate,'Attrition')
2.2 連續特徵選擇
# Two-sample t-test per feature.
def ttest_simple(cols,target,data):
    """Split ``cols`` by whether their mean differs between the two target classes.

    For every column, Levene's test decides which t-test is used: the
    equal-variance test when the Levene p-value is above 0.05, otherwise the
    unequal-variance variant (``equal_var=False``). Columns whose t-test
    p-value is above 0.05 are reported as unrelated, all others as related.

    Args:
        cols: Column names to test.
        target: Name of the target column; rows are split on 1 versus 0.
        data: DataFrame holding the columns and the target.

    Returns:
        ``(related, unrelated)``: two dicts mapping column name to p-value.
    """
    positive = data[data[target]==1]
    negative = data[data[target]==0]
    related, unrelated = {}, {}
    for col in cols:
        pos_values, neg_values = positive[col], negative[col]
        same_variance = stats.levene(pos_values, neg_values).pvalue > 0.05
        p_value = stats.ttest_ind(pos_values, neg_values, equal_var=same_variance).pvalue
        bucket = unrelated if p_value > 0.05 else related
        bucket[col] = p_value
    return related,unrelated
連續特徵的T檢驗,其實這裏意義不大,因爲這些屬性並不是正態分佈;通過觀察圖像,也沒有必要做 Box-Cox 轉換。得出的結果只做參考。
# Run the t-test screening on the continuous features against 'Attrition'.
related,unrelated=ttest_simple(list_counti,'Attrition',train)
# related
2.3 分類特徵選擇
# Chi-square test of independence per feature.
def x2_simple(cols,target,data):
    """Split ``cols`` by whether they are associated with ``target``.

    A contingency table of feature value versus target value is built for
    every column and passed to ``chi2_contingency``. Columns with a p-value of
    at most 0.05 are reported as related, all others as unrelated.

    Args:
        cols: Column names to test.
        target: Name of the target column.
        data: DataFrame holding the columns and the target.

    Returns:
        ``(related, unrelated)``: two dicts mapping column name to p-value.
    """
    related={}
    unrelated={}
    for col in cols:
        # fill_value=0 matters: a (value, target) pair that never occurs must
        # count as 0 observations. Left as NaN, it turns the whole test result
        # into NaN and the feature is filed under "unrelated".
        table=data.groupby([col,target])[target].count().unstack(fill_value=0)
        p_value=chi2_contingency(table)[1]
        if p_value<=0.05:
            related[col]=p_value
        else:
            unrelated[col]=p_value
    return related,unrelated
related_1,unrelated_1=x2_simple(list_cate,'Attrition',train)
# Keep only the categorical features the chi-square test reported as related.
list_cate_x2=list(related_1)
len(list_cate_x2)
13
2.4 異常值處理
# Keep the features selected above plus the target 'Attrition'.
list_columns=list_counti+list_cate_x2+['Attrition']
train_choose=train[list_columns]
# Standardise the continuous features (z-score).
def get_normal(data,num_columns):
    """Return a new frame with ``num_columns`` standardised and moved to the end.

    Each listed column has its mean subtracted and is divided by its standard
    deviation. The remaining columns are kept unchanged and come first in the
    result; ``data`` itself is not modified.

    Args:
        data: Input DataFrame.
        num_columns: List of numeric column names to standardise.

    Returns:
        DataFrame with the untouched columns followed by the standardised ones.
    """
    numeric = data[num_columns]
    standardised = (numeric - numeric.mean()) / numeric.std()
    # Untouched columns first, standardised columns appended after them.
    return pd.concat([data.drop(columns=num_columns), standardised], axis=1)
# Standardise the continuous features of the selected training columns.
train_normal=get_normal(train_choose,list_counti)
# train_normal.head()
# One-hot encode the selected categorical features.
train_encode= pd.get_dummies(train_normal,columns=list_cate_x2)
# train_encode.head()
# Outlier handling: drop the rows that IsolationForest flags as anomalies.
from sklearn.ensemble import IsolationForest
clf = IsolationForest(max_samples=100,behaviour="new",contamination='auto')
# Fit on the encoded training frame and predict on the same rows.
y_pred = clf.fit(train_encode).predict(train_encode)
# Collect the row positions predicted as -1.
# NOTE(review): drop() below treats these positions as index labels, which
# assumes `train` still has its default 0..n-1 index - confirm.
check=[pos for pos,flag in enumerate(y_pred) if flag==-1]
train.drop(check,inplace=True)
train.shape
(1003, 31)
2.5 連續特徵處理
# The 'Attrition' column is kept because the feature-screening function
# used further down needs it.
list_columns=list_counti+list_cate_x2+['Attrition']
# Stack train and test so both go through the same preprocessing.
data_concat=pd.concat([train,test],sort=False)[list_columns]
Y_train=train['Attrition']
# Standardise the continuous features (statistics come from train and test together).
data_normal=get_normal(data_concat,list_counti)
data_normal.shape
(1353, 22)
# data_bin_encode = pd.get_dummies(data_normal,columns=list_cate)
# X_train=data_bin_encode[:1003]
# X_test=data_bin_encode[1003:]
2.5.1 分箱測試
事實上不如標準化
# Age_bins=[17,24,26,33,51,56,60]
# DistanceFromHome_bins=[1,21,25,29]
# TotalWorkingYears_bins=[0,2,12,26,30,35,40]
# YearsAtCompany_bins=[0,2,11,25,30,34,40]
# YearsInCurrentRole_bins=[0,1,4,6,10,15.5,20]
# YearsSinceLastPromotion_bins=[0,3,5,9,16]
# YearsWithCurrManager_bins=[0,1,8.5,12.5,18]
# data_concat['Age']=pd.cut(data_concat['Age'],Age_bins)
# data_concat['DistanceFromHome']=pd.cut(data_concat['DistanceFromHome'],DistanceFromHome_bins)
# data_concat['TotalWorkingYears']=pd.cut(data_concat['TotalWorkingYears'],TotalWorkingYears_bins)
# data_concat['YearsAtCompany']=pd.cut(data_concat['YearsAtCompany'],YearsAtCompany_bins)
# data_concat['YearsInCurrentRole']=pd.cut(data_concat['YearsInCurrentRole'],YearsInCurrentRole_bins)
# data_concat['YearsSinceLastPromotion']=pd.cut(data_concat['YearsSinceLastPromotion'],YearsSinceLastPromotion_bins)
# data_concat['YearsWithCurrManager']=pd.cut(data_concat['YearsWithCurrManager'],YearsWithCurrManager_bins)
# data_concat_bin=data_concat.astype('object')
# data_bin_encode = pd.get_dummies(data_concat_bin)
# X_train=data_bin_encode[:1026]
# X_test=data_bin_encode[1026:]
# X_train.shape
(1026, 69)
2.6 拼接特徵,然後再篩選
#暴力兩兩拼接
def _combineFeatures(df,columns):
combine_columns = []
for i in range(len(columns) - 1):
for j in range(i+1,len(columns)):
temp_col = columns[i] + "_" + columns[j]
combine_columns.append(temp_col)
df[temp_col] = df[columns[i]].astype("str") + "_" + df[columns[j]].astype("str")
return df,combine_columns
# Build every pairwise combination of the selected categorical features.
combine_data,combine_columns = _combineFeatures(data_normal,list_cate_x2)
# Compare the combined features together with the original categorical ones
# combine_columns=combine_columns+list_cate_x2
# Run the chi-square test on the combined features (they are sorted afterwards)
related_combin,unrelated_combin=x2_simple(combine_columns,'Attrition',combine_data)
/home/leeruohua/anaconda3/lib/python3.7/site-packages/scipy/stats/contingency.py:243: RuntimeWarning: invalid value encountered in less
if np.any(observed < 0):
# Order the related combined features by ascending p-value and keep the names.
combin_sort=sorted(related_combin.items(),key=lambda item:item[1])
combin_x2=[name for name,_p_value in combin_sort]
# del combin_x2[5]
combin_x2
len(combin_x2)
19
3、建立模型
# Assemble the modelling data.
# nums is how many of the top combined features to include; they added
# little, so it is set to 0.
nums=0
last_columns=list_counti+list_cate_x2+combin_x2[:nums]
last_data=combine_data[last_columns]
# One-hot encode the categorical features (and any selected combined ones).
list_encode=combin_x2[:nums]+list_cate_x2
last_data_encode= pd.get_dummies(last_data,columns=list_encode)
# The training rows come first in the concatenated frame. Split at the actual
# number of training rows instead of a hard-coded 1003, which is only right
# when the outlier step happens to keep exactly that many rows.
n_train=len(Y_train)
X_train=last_data_encode[:n_train]
X_test=last_data_encode[n_train:]
X_train.shape
(1003, 69)
3.1、LR
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1,solver='newton-cg')
from sklearn.model_selection import GridSearchCV
# Candidate values for C: 1, 11, ..., 191.
parameters = {"C": range(1,200,10)}
# Search over the logistic regression itself and keep the search object in
# `clf`. Previously the estimator passed in was `clf` (the IsolationForest
# from the outlier step) and the search result overwrote `lr`, while the
# prints below read best_score_/best_params_ from `clf`.
clf = GridSearchCV(estimator=lr, param_grid=parameters)
clf.fit(X_train,Y_train)
print(" 最優分數: %.4lf" %clf.best_score_)
print(" 最優參數:", clf.best_params_)
/home/leeruohua/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_split.py:2053: FutureWarning: You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.
warnings.warn(CV_WARNING, FutureWarning)
最優分數: 0.8853
最優參數: {'C': 1}
#‘StratifiedShuffleSplit’出錯,因爲數據均勻,直接取後面一部分數據做測試
# from sklearn.model_selection import StratifiedShuffleSplit
# sss = StratifiedShuffleSplit(n_splits=2,test_size=0.3)
# for train_index,test_index in sss.split(X_train,Y_train):
# trainx,testx = X_train[train_index],X_train[train_index]
# trainy,testy = Y_train[test_index],Y_train[test_index]
# Draw the learning curve.
max_m=list(range(30,800,30))
def learning_line(model,max_m,value,target,holdout_start=800):
    """Plot training and hold-out accuracy as the training set grows.

    For every ``m`` in ``max_m`` the model is refitted on the first ``m`` rows
    and scored both on those rows and on the fixed hold-out rows from
    ``holdout_start`` onwards. Both score lists are printed and plotted
    (training in red, hold-out in green).

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``score(X, y)``.
        max_m: Training-set sizes to evaluate.
        value: Feature rows, sliceable by position.
        target: Labels aligned with ``value``.
        holdout_start: Position at which the hold-out rows begin. Defaults to
            800, the value that used to be hard-coded.
    """
    train_score=[]
    CV_score=[]
    # The hold-out slice does not change between iterations.
    holdout_value=value[holdout_start:]
    holdout_target=target[holdout_start:]
    for m in max_m:
        model.fit(value[:m], target[:m])
        train_score.append(model.score(value[:m], target[:m]))
        CV_score.append(model.score(holdout_value, holdout_target))
    print(train_score,CV_score,sep='\n')
    plt.plot(max_m,train_score,c='r')
    plt.plot(max_m,CV_score,c='g')
learning_line(lr,max_m,X_train,Y_train)
[1.0, 0.95, 0.9444444444444444, 0.95, 0.94, 0.9333333333333333, 0.919047619047619, 0.9125, 0.9037037037037037, 0.9033333333333333, 0.906060606060606, 0.9111111111111111, 0.9102564102564102, 0.9142857142857143, 0.9066666666666666, 0.9041666666666667, 0.907843137254902, 0.9148148148148149, 0.9105263157894737, 0.905, 0.8984126984126984, 0.8984848484848484, 0.8971014492753623, 0.9013888888888889, 0.908, 0.908974358974359]
[0.8423645320197044, 0.8571428571428571, 0.8866995073891626, 0.8916256157635468, 0.8620689655172413, 0.8669950738916257, 0.8916256157635468, 0.8916256157635468, 0.8916256157635468, 0.8719211822660099, 0.896551724137931, 0.8866995073891626, 0.9014778325123153, 0.9014778325123153, 0.9014778325123153, 0.896551724137931, 0.9014778325123153, 0.896551724137931, 0.9014778325123153, 0.8916256157635468, 0.8916256157635468, 0.8916256157635468, 0.8866995073891626, 0.8866995073891626, 0.9014778325123153, 0.896551724137931]
# Manual K-fold style validation.
list_nums=[0,100,200,300,400,500,600,700]
def k(model,list_nums,value,target):
    """Refit ``model`` once per start offset and print its score on the held-out window.

    For every offset in ``list_nums`` the 300 rows starting at that position
    are held out, the model is fitted on all remaining rows, and the score on
    the held-out rows is printed.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``score(X, y)``.
        list_nums: Start positions of the held-out windows.
        value: Feature rows (pandas object, sliceable by position).
        target: Labels aligned with ``value``.
    """
    for start in list_nums:
        stop=start+300
        held_out=slice(start,stop)
        fit_value=pd.concat([value[:start],value[stop:]])
        fit_target=pd.concat([target[:start],target[stop:]])
        model.fit(fit_value,fit_target)
        print(model.score(value[held_out],target[held_out]))
k(lr,list_nums,X_train,Y_train)
0.9
0.8533333333333334
0.8766666666666667
0.87
0.8766666666666667
0.8566666666666667
0.85
0.88
from sklearn.metrics import roc_auc_score,precision_recall_curve,classification_report,roc_curve
# Evaluate on the hold-out rows (position 800 onwards).
# `pred` takes the second probability column, `pred_labels` the predicted classes.
pred = lr.predict_proba(X_train[800:])[:,1]
pred_labels = lr.predict(X_train[800:])
# ROC AUC
print(roc_auc_score(Y_train[800:],pred))
# Classification report
print(classification_report(Y_train[800:],pred_labels))
0.8073721759809751
precision recall f1-score support
0 0.91 0.97 0.94 174
1 0.71 0.41 0.52 29
micro avg 0.89 0.89 0.89 203
macro avg 0.81 0.69 0.73 203
weighted avg 0.88 0.89 0.88 203
# Precision-recall curve on the hold-out rows.
precision,recall,_ = precision_recall_curve(Y_train[800:],pred)
plt.plot(recall,precision)
# ROC curve on the same rows.
fpr,tpr,_ = roc_curve(Y_train[800:],pred)
plt.plot(fpr,tpr)
predict=lr.predict(X_test).astype(int)
# Write the labels as integers; savetxt's default format would emit them in
# scientific float notation (e.g. 1.000000000000000000e+00).
np.savetxt("result001.csv", predict, fmt='%d', delimiter=',')
3.1.1 嘗試修改LR模型閾值
原打算通過更改閾值,修改模型的,但是特徵處理之後,意義不大。
# Turn predicted probabilities into labels at a given threshold.
def proba_test(answer,proba):
    """Label each row 0 when its first entry is at least ``proba``, otherwise 1.

    Args:
        answer: Iterable of rows; only the first element of each row is read.
        proba: Threshold compared against that first element.

    Returns:
        List of 0/1 labels, one per row of ``answer``.
    """
    return [0 if row[0]>=proba else 1 for row in answer]
# 計算正確率
def test_score(train_test_y,train_y):
TP,TN,FP,FN=0,0,0,0
for m,n in zip(train_test_y,train_y):
if m==1 and n==1:
TP+=1
if m==0 and n==0:
TN+=1
if m==1 and n==0:
FP+=1
if m==0 and n==1:
FN+=1
return (TP,TN,FP,FN)
answer = lr.predict_proba(X_train[800:])
proba=np.linspace(0,1,200)
holdout_truth=Y_train[800:]
# Number of correct predictions (TP + TN) on the hold-out rows per threshold.
y_label=[]
for p in proba:
    TP,TN,FP,FN=test_score(proba_test(answer,p),holdout_truth)
    y_label.append(TP+TN)
plt.plot(proba,y_label)
# Report the best count, its position and the matching threshold.
best_count=max(y_label)
max0=y_label.index(best_count)
print(best_count,max0,proba[max0])
print(y_label)
182 100 0.5025125628140703
[174, 174, 174, 174, 174, 174, 174, 174, 174, 174, 174, 175, 175, 175, 175, 175, 175, 176, 176, 176, 176, 177, 177, 177, 177, 177, 177, 177, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 179, 179, 179, 180, 180, 180, 180, 179, 179, 180, 180, 180, 179, 179, 180, 180, 180, 180, 181, 181, 181, 180, 180, 180, 180, 180, 181, 181, 181, 181, 181, 180, 181, 182, 182, 182, 182, 182, 182, 182, 182, 181, 181, 181, 181, 180, 179, 179, 179, 178, 177, 176, 175, 175, 175, 174, 172, 172, 172, 172, 171, 171, 170, 169, 169, 169, 169, 168, 167, 166, 164, 164, 164, 165, 164, 165, 166, 165, 165, 165, 165, 165, 163, 163, 163, 161, 161, 159, 159, 159, 156, 156, 154, 152, 152, 152, 152, 152, 152, 152, 151, 151, 151, 147, 146, 146, 142, 142, 142, 139, 139, 139, 136, 132, 130, 128, 126, 123, 121, 118, 114, 111, 109, 103, 97, 90, 87, 82, 75, 68, 56, 44, 29]
3.1.2 嘗試下采樣,堆疊LR
直接提交結果進行測試,效果不如意
# Rows were dropped during outlier removal, so rebuild a 0..n-1 index.
x_train=X_train.reset_index(drop=True)
y_train=Y_train.reset_index(drop=True)
# Index labels of the negative-class rows, taken from the labels themselves
# instead of looping over a hard-coded 1003 rows.
check=y_train.index[y_train==0].tolist()
# check
import random
# Under-sampling ensemble: fit 21 models, each with 500 randomly chosen
# negative rows removed, and add up their predictions per test row.
# The vote vector is sized from X_test rather than a hard-coded 350.
predict_0=np.zeros(len(X_test))
for _ in range(21):
    c_random=random.sample(check,500)
    x_train_0=x_train.drop(c_random)
    y_train_0=y_train.drop(c_random)
    lr.fit(x_train_0,y_train_0)
    predict_0+=lr.predict(X_test).astype(int)
predict_0
array([ 0., 0., 0., 0., 0., 9., 0., 0., 0., 0., 0., 17., 21.,
0., 21., 0., 0., 0., 4., 0., 0., 0., 4., 0., 18., 10.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 9., 0., 0., 0.,
0., 0., 5., 0., 0., 0., 21., 21., 0., 0., 0., 21., 0.,
0., 0., 21., 5., 15., 0., 0., 0., 21., 0., 0., 0., 0.,
20., 0., 21., 4., 0., 0., 0., 0., 7., 17., 21., 0., 0.,
0., 0., 0., 10., 0., 0., 21., 1., 0., 0., 0., 0., 18.,
0., 0., 1., 0., 0., 1., 0., 7., 0., 0., 0., 0., 21.,
0., 0., 0., 1., 0., 0., 0., 0., 20., 0., 0., 0., 21.,
21., 0., 0., 11., 2., 0., 0., 21., 0., 0., 0., 12., 21.,
21., 1., 0., 0., 0., 0., 0., 0., 0., 21., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 21., 1., 0., 0., 1., 0., 0.,
0., 19., 0., 0., 0., 0., 0., 21., 18., 0., 14., 1., 0.,
0., 21., 0., 0., 0., 0., 0., 0., 21., 0., 0., 0., 0.,
0., 18., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 21.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 3., 0., 21., 0.,
21., 0., 21., 0., 0., 0., 21., 1., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 19., 0., 0., 0., 21., 5., 0., 0., 21.,
0., 18., 0., 0., 21., 0., 0., 0., 0., 0., 0., 0., 3.,
21., 0., 11., 0., 0., 4., 21., 0., 0., 0., 0., 21., 18.,
0., 14., 0., 0., 0., 0., 0., 0., 21., 0., 0., 0., 21.,
0., 0., 21., 0., 0., 0., 0., 0., 15., 0., 0., 0., 5.,
6., 0., 0., 0., 0., 0., 0., 0., 10., 21., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 13., 0., 0.,
0., 0., 0., 21., 0., 0., 0., 0., 21., 0., 0., 0., 0.,
0., 0., 0., 2., 0., 18., 3., 21., 0., 2., 0., 0., 0.,
0., 21., 19., 7., 0., 0., 0., 10., 0., 0., 21., 0.])
def jude(x,threshold=17):
    """Return 1 when ``x`` is strictly greater than ``threshold``, otherwise 0.

    Args:
        x: Value to compare (the script passes the summed model votes).
        threshold: Cut-off that must be exceeded. Defaults to 17, the value
            that used to be hard-coded.
    """
    return 1 if x>threshold else 0
# Turn the vote counts into 0/1 labels, then count the predicted positives.
result_score=[jude(votes) for votes in predict_0]
s=sum(result_score)
s
53
# Write the labels as integers; savetxt's default format would emit them in
# scientific float notation.
np.savetxt("result.csv",result_score, fmt='%d', delimiter=',')
3.2 XGBoost
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
x_train=X_train
y_train=Y_train
x_test=X_test
# The one-hot column names are not accepted as feature names, so replace them
# with integers. The count comes from the frame itself rather than a
# hard-coded 69.
col=list(range(x_train.shape[1]))
x_train.columns=col
x_test.columns=col
# x_test.head()
# NOTE: this rebinds the name `xgb` from the module to the classifier
# instance; the module is no longer reachable under that name afterwards.
xgb= xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
max_delta_step=0, max_depth=2, min_child_weight=1, missing=None,
n_estimators=100, n_jobs=1, nthread=None,
objective='binary:logistic', random_state=0, reg_alpha=0,
reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
subsample=1, verbosity=1)
# `curve` is not defined anywhere in this file; `learning_line` is the
# learning-curve helper that takes these same four arguments.
learning_line(xgb,max_m,x_train,y_train)
[0.9666666666666667, 1.0, 0.9888888888888889, 0.9833333333333333, 0.9666666666666667, 0.9555555555555556, 0.9571428571428572, 0.95, 0.9444444444444444, 0.9433333333333334, 0.9484848484848485, 0.9444444444444444, 0.9384615384615385, 0.9238095238095239, 0.92, 0.9208333333333333, 0.9254901960784314, 0.9222222222222223, 0.9245614035087719, 0.9116666666666666, 0.9047619047619048, 0.9, 0.9028985507246376, 0.9027777777777778, 0.9093333333333333, 0.908974358974359]
[0.8421052631578947, 0.8315789473684211, 0.8421052631578947, 0.856140350877193, 0.856140350877193, 0.856140350877193, 0.8666666666666667, 0.8596491228070176, 0.8631578947368421, 0.8771929824561403, 0.8631578947368421, 0.8771929824561403, 0.8701754385964913, 0.8631578947368421, 0.8666666666666667, 0.887719298245614, 0.8807017543859649, 0.8771929824561403, 0.8842105263157894, 0.8842105263157894, 0.8912280701754386, 0.8842105263157894, 0.8807017543859649, 0.887719298245614, 0.8842105263157894, 0.887719298245614]
# Exhaustive search over 10 x 10 x 10 = 1000 parameter combinations.
parameters = {"min_child_weight": range(1,11),"reg_lambda": range(0,10),"max_depth": range(0,10)}
clf = GridSearchCV(estimator=xgb, param_grid=parameters)
clf.fit(x_train,y_train)
# Report the best cross-validated score and the parameters that produced it.
print(" 最優分數: %.4lf" %clf.best_score_)
print(" 最優參數:", clf.best_params_)
/home/leeruohua/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_split.py:2053: FutureWarning: You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.
warnings.warn(CV_WARNING, FutureWarning)
最優分數: 0.8824
最優參數: {'max_depth': 2, 'min_child_weight': 1, 'reg_lambda': 1}
3.3 GBC
‘max_depth=3’時,增加數據可能得到更好的結果
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(max_depth=3)
# predict=gbc.predict(X_test).astype(int)
# np.savetxt("result002.csv", predict, delimiter=',')
# `curve` is not defined anywhere in this file; `learning_line` is the
# learning-curve helper that takes these same four arguments.
learning_line(gbc,max_m,X_train,Y_train)
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9962962962962963, 0.9966666666666667, 0.9939393939393939, 0.9916666666666667, 0.9923076923076923, 0.9928571428571429, 0.9888888888888889, 0.9833333333333333, 0.984313725490196, 0.9796296296296296, 0.9789473684210527, 0.975, 0.9714285714285714, 0.9666666666666667, 0.9666666666666667, 0.9666666666666667, 0.9653333333333334, 0.9653846153846154]
[0.8070175438596491, 0.8385964912280702, 0.8421052631578947, 0.8385964912280702, 0.8280701754385965, 0.8526315789473684, 0.856140350877193, 0.8526315789473684, 0.8596491228070176, 0.8491228070175438, 0.8631578947368421, 0.8456140350877193, 0.856140350877193, 0.8771929824561403, 0.8771929824561403, 0.8666666666666667, 0.8596491228070176, 0.856140350877193, 0.8701754385964913, 0.8736842105263158, 0.8631578947368421, 0.8771929824561403, 0.8807017543859649, 0.8736842105263158, 0.8771929824561403, 0.8947368421052632]
3.4 SGD
from sklearn.linear_model import SGDClassifier
# SGD classifier with the "log" loss, limited to 10 iterations.
# NOTE(review): newer scikit-learn releases spell this loss "log_loss" -
# confirm against the installed version before running.
sgd= SGDClassifier(loss="log",max_iter = 10)
# NOTE(review): the call below refers to `curve`, which is not defined in the
# visible file - confirm the intended helper before enabling it.
# curve(sgd,max_m,X_train,Y_train)