再刷泰坦尼克 特徵工程

 

將訓練集和測試集組合處理

#忽略警告提示
import warnings
warnings.filterwarnings('ignore')
#數據處理
import pandas as pd
import numpy as np
import random
#可視化
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
# Directory that holds the Kaggle Titanic CSV files.
path = 'C:/Users/Titanic/'

# Parse both splits inside a context manager so the file handles are closed
# as soon as reading finishes (bare open() calls leave them open).
with open(path + 'train.csv') as p1, open(path + 'test.csv') as p2:
    train = pd.read_csv(p1)
    test = pd.read_csv(p2)

# Stack the training and test rows into a single frame so that every
# feature-engineering step is applied to both splits.  DataFrame.append was
# removed in pandas 2.0; pd.concat with ignore_index=True produces the same
# fresh 0..n-1 row index.
combined = pd.concat([train, test], ignore_index=True)
combined.shape
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
(1309, 12)
  • 1
  • 2

提取乘客頭銜

def _title_from_name(full_name):
    """Return the text after the first comma, up to the next period, stripped."""
    after_surname = full_name.split(',')[1]
    return after_surname.split('.')[0].strip()


# Raw honorific exactly as it appears in the passenger's name.
combined['Title'] = combined['Name'].map(_title_from_name)

# Collapse the raw honorifics into six coarse categories.
Title_Dictionary = {
    # -> Officer
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Dr": "Officer",
    "Rev": "Officer",
    # -> Royalty
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Dona": "Royalty",
    "Sir": "Royalty",
    "Lady": "Royalty",
    "the Countess": "Royalty",
    # -> Mrs
    "Mrs": "Mrs",
    "Mme": "Mrs",
    "Ms": "Mrs",
    # -> Miss
    "Miss": "Miss",
    "Mlle": "Miss",
    # -> Mr / Master
    "Mr": "Mr",
    "Master": "Master",
}
# Titles missing from the dictionary become NaN after the lookup.
combined['Title'] = combined['Title'].map(Title_Dictionary)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22

處理缺失值

處理年齡

從可視化分析可知,年齡分佈存在羣體差異,簡單地用平均值或者衆數填充並不合適。

# Group passengers by sex, cabin class and title and take the per-group
# medians, separately for the training rows (the first 891) and the test
# rows (everything after them).
# numeric_only=True restricts the median to numeric columns; on pandas >= 2.0
# the default no longer drops text columns (Name, Ticket, Cabin, ...) and the
# call would raise a TypeError.
grouped_train = combined.head(891).groupby(['Sex', 'Pclass', 'Title'])
grouped_median_train = grouped_train.median(numeric_only=True)
grouped_test = combined.iloc[891:].groupby(['Sex', 'Pclass', 'Title'])
grouped_median_test = grouped_test.median(numeric_only=True)
  • 1
  • 2
  • 3
  • 4
  • 5
grouped_median_train
  • 1
      Age Fare Parch PassengerId SibSp Survived
Sex Pclass Title            
female 1 Miss 30.0 88.25000 0.0 369.0 0.0 1.0
Mrs 40.0 79.20000 0.0 499.0 1.0 1.0
Officer 49.0 25.92920 0.0 797.0 0.0 1.0
Royalty 40.5 63.05000 0.0 658.5 0.5 1.0
2 Miss 24.0 13.00000 0.0 437.5 0.0 1.0
Mrs 31.5 26.00000 0.0 439.5 1.0 1.0
3 Miss 18.0 8.75625 0.0 372.0 0.0 0.5
Mrs 31.0 15.97500 1.0 405.5 1.0 0.5
male 1 Master 4.0 120.00000 2.0 446.0 1.0 1.0
Mr 40.0 42.40000 0.0 463.0 0.0 0.0
Officer 51.0 35.50000 0.0 648.0 0.0 0.0
Royalty 40.0 27.72080 0.0 600.0 0.0 0.0
2 Master 1.0 26.00000 1.0 408.0 1.0 1.0
Mr 31.0 13.00000 0.0 440.0 0.0 0.0
Officer 46.5 13.00000 0.0 358.5 0.0 0.0
3 Master 4.0 28.51250 1.0 270.5 3.5 0.0
Mr 26.0 7.89580 0.0 472.0 0.0 0.0
grouped_median_test
  • 1
      Age Fare Parch PassengerId SibSp Survived
Sex Pclass Title            
female 1 Miss 32.0 158.20835 0.0 1074.0 0.0 NaN
Mrs 48.0 63.35830 0.0 1076.0 1.0 NaN
Royalty 39.0 108.90000 0.0 1306.0 0.0 NaN
2 Miss 19.5 24.50000 1.0 1121.0 1.0 NaN
Mrs 29.0 26.00000 0.0 1123.5 0.0 NaN
3 Miss 22.0 7.87920 0.0 1090.5 0.0 NaN
Mrs 28.0 14.28125 0.5 1048.0 1.0 NaN
male 1 Master 9.5 198.43750 2.0 1022.0 1.0 NaN
Mr 42.0 50.24790 0.0 1102.0 0.0 NaN
Officer 53.0 81.85830 0.0 1094.0 1.0 NaN
2 Master 5.0 27.75000 1.5 1033.5 0.5 NaN
Mr 28.0 13.00000 0.0 1156.0 0.0 NaN
Officer 35.5 19.50000 0.5 1048.5 0.5 NaN
3 Master 7.0 15.24580 1.0 1173.0 1.0 NaN
Mr 25.0 7.85420 0.0 1101.0 0.0 NaN

因此,我們可以按乘客的性別、艙位等級和頭銜分組,用各組年齡的中位數來填充缺失的年齡。

def fillAges(row, grouped_median):
    """Return the median age of the passenger's (Sex, Pclass, Title) group.

    Parameters
    ----------
    row : mapping-like
        One passenger record; must provide the keys 'Sex', 'Pclass' and
        'Title' (a DataFrame row or a plain dict both work).
    grouped_median : pandas.DataFrame
        Per-group medians indexed by (Sex, Pclass, Title) and containing an
        'Age' column, e.g. the result of
        ``df.groupby(['Sex', 'Pclass', 'Title']).median(numeric_only=True)``.

    Returns
    -------
    float
        The group's median age, or NaN when the passenger's group does not
        occur in ``grouped_median``.  A hard-coded if/elif chain would raise
        KeyError for groups it lists but the table lacks, and return None for
        groups it does not list; a single lookup handles every group
        uniformly.
    """
    group_key = (row['Sex'], row['Pclass'], row['Title'])
    try:
        return grouped_median.loc[group_key, 'Age']
    except KeyError:
        # Group absent from the table (or Title is missing): nothing to impute.
        return float('nan')

# Fill missing ages with the median age of the passenger's
# (Sex, Pclass, Title) group: training medians for the first 891 rows, test
# medians for the remaining rows.
# The values are written through combined.loc.  Assigning to
# combined.head(891).Age or combined.iloc[891:].Age only sets a column on a
# temporary slice object, which is not guaranteed to update `combined`.
_in_train = np.arange(len(combined)) < 891
_age_missing = combined['Age'].isna().values
for _rows, _medians in ((_in_train, grouped_median_train),
                        (~_in_train, grouped_median_test)):
    _todo = _age_missing & _rows
    if _todo.any():  # skip the row-wise apply when there is nothing to fill
        combined.loc[_todo, 'Age'] = (
            combined.loc[_todo]
            .apply(fillAges, axis=1, args=(_medians,))
            .astype(float)
        )
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
combined.info()
  • 1
# Fill missing fares with the mean fare of the same split: training rows
# (the first 891) get the training mean, the remaining rows get the test mean.
# Written through combined.loc because fillna(inplace=True) on
# combined.head(...) / combined.iloc[...] operates on a temporary slice and is
# not guaranteed to update `combined`.
_in_train = np.arange(len(combined)) < 891
_fare_missing = combined['Fare'].isna().values
combined.loc[_fare_missing & _in_train, 'Fare'] = combined['Fare'].iloc[:891].mean()
combined.loc[_fare_missing & ~_in_train, 'Fare'] = combined['Fare'].iloc[891:].mean()
  • 1
  • 2

填充缺失Embarked爲登船地點最多的S

# Fill missing embarkation ports with 'S'.  Both splits receive the same
# value, so the whole column is filled at once and assigned back;
# fillna(inplace=True) on combined.head(...) / combined.iloc[...] acts on a
# temporary slice and is not guaranteed to update `combined`.
combined['Embarked'] = combined['Embarked'].fillna('S')
  • 1
  • 2

填充缺失的Cabin

# Mark unknown cabins with 'U', then keep only the first character of each
# cabin string.  The filled column is assigned back explicitly: the chained
# combined.Cabin.fillna(..., inplace=True) form does not write back to
# `combined` under pandas copy-on-write.
combined['Cabin'] = combined['Cabin'].fillna('U').map(lambda c: c[0])
  • 1
  • 2
combined.info()
  • 1
# One-hot encode the title categories and append them as Title_* columns.
titleDf = pd.get_dummies(combined.Title, prefix='Title')
combined = pd.concat((combined, titleDf), axis='columns')
  • 1
  • 2
  • 3

Parch&SibSp

# Family size = parents/children + siblings/spouses + the passenger,
# plus three 0/1 indicator columns for single (1), small (2-4) and
# large (5 or more) families.
_family_size = combined['Parch'] + combined['SibSp'] + 1
familyDf = pd.DataFrame({
    'FamilySize': _family_size,
    'Family_Single': (_family_size == 1).astype('int64'),
    'Family_Small': _family_size.between(2, 4).astype('int64'),
    'Family_Large': (_family_size >= 5).astype('int64'),
})
combined = pd.concat((combined, familyDf), axis='columns')
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7

Embarked

# One-hot encode the embarkation port and append the Embarked_* columns.
embarkedDf = pd.get_dummies(combined.Embarked, prefix='Embarked')
combined = pd.concat((combined, embarkedDf), axis='columns')
  • 1
  • 2

Sex

# Numeric encoding of sex: male -> 1, female -> 0.
sex_mapDict = dict(male=1, female=0)
# Series.map looks every value of the column up in the dictionary.
combined['Sex'] = combined.Sex.map(sex_mapDict)
  • 1
  • 2
  • 3
  • 4

Cabin

# One-hot encode the cabin column and append the Cabin_* columns.
cabinDf = pd.get_dummies(combined.Cabin, prefix='Cabin')
combined = pd.concat((combined, cabinDf), axis='columns')
  • 1
  • 2

Pclass

# One-hot encode the passenger class and append the Pclass_* columns.
pclassDf = pd.get_dummies(combined.Pclass, prefix='Pclass')
combined = pd.concat((combined, pclassDf), axis='columns')
  • 1
  • 2

Ticket

# Extract the prefix of the ticket number; tickets without a prefix
# (purely numeric ones) are mapped to 'XXX'.
def cleanTicket(ticket):
    """Return the leading non-numeric token of a ticket string, or 'XXX'.

    Periods and slashes are removed first, so 'STON/O2. 3101282' yields
    'STONO2'.  'XXX' is returned when the first whitespace-separated token
    consists only of digits, and also when nothing is left after cleaning
    (empty or punctuation-only input), which would otherwise raise IndexError.

    Parameters
    ----------
    ticket : str
        Raw ticket value.

    Returns
    -------
    str
        The cleaned prefix, or 'XXX' when there is none.
    """
    tokens = ticket.replace('.', '').replace('/', '').split()
    if not tokens or tokens[0].isdigit():
        return 'XXX'
    return tokens[0]

# Replace each ticket by its cleaned prefix, then one-hot encode the prefixes
# and append the Ticket_* columns.
combined['Ticket'] = combined.Ticket.map(cleanTicket)
ticketsDf = pd.get_dummies(combined.Ticket, prefix='Ticket')
combined = pd.concat((combined, ticketsDf), axis='columns')
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
combined.head(3)
  • 1
  Age Cabin Embarked Fare Name Parch PassengerId Pclass Sex SibSp Ticket_SOTONO2 Ticket_SOTONOQ Ticket_SP Ticket_STONO Ticket_STONO2 Ticket_STONOQ Ticket_SWPP Ticket_WC Ticket_WEP Ticket_XXX
0 22.0 U S 7.2500 Braund, Mr. Owen Harris 0 1 3 1 1 0 0 0 0 0 0 0 0 0 0
1 38.0 C C 71.2833 Cumings, Mrs. John Bradley (Florence Briggs Th… 0 2 1 0 1 0 0 0 0 0 0 0 0 0 0
2 26.0 U S 7.9250 Heikkinen, Miss. Laina 0 3 3 0 0 0 0 0 0 1 0 0 0 0 0

3 rows × 75 columns

# Drop the remaining columns that are not used as model features.
combined.drop(
    columns=['PassengerId', 'Cabin', 'Embarked', 'Name', 'Pclass', 'Ticket', 'Title'],
    inplace=True,
)
  • 1
  • 2
combined.head(3)
  • 1
  Age Fare Parch Sex SibSp Survived Title_Master Title_Miss Title_Mr Title_Mrs Ticket_SOTONO2 Ticket_SOTONOQ Ticket_SP Ticket_STONO Ticket_STONO2 Ticket_STONOQ Ticket_SWPP Ticket_WC Ticket_WEP Ticket_XXX
0 22.0 7.2500 0 1 1 0.0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
1 38.0 71.2833 0 0 1 1.0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
2 26.0 7.9250 0 0 0 1.0 0 1 0 0 0 0 0 0 1 0 0 0 0 0

3 rows × 68 columns

建立模型和預測

  • 1.將數據集拆分爲訓練集和測試集
  • 2.使用訓練集建立預測模型
  • 3.使用訓練集評估模型
  • 4.使用模型得到測試集預測結果
# Split the combined frame back into its parts: the first 891 rows are the
# training data (features and the Survived label), the rest are test features.
train_X = combined.iloc[:891].drop(columns=['Survived'])
target_Y = combined['Survived'].iloc[:891]
test_X = combined.iloc[891:].drop(columns=['Survived'])
print(
    '訓練集特徵:', train_X.shape,
    '訓練集標籤:', target_Y.shape,
    '測試集特徵:', test_X.shape,
)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
訓練集特徵: (891, 67) 訓練集標籤: (891,) 測試集特徵: (418, 67)
  • 1
  • 2
# Import the modelling utilities.
# StratifiedKFold, GridSearchCV and cross_val_score live in
# sklearn.model_selection; the sklearn.cross_validation and
# sklearn.grid_search modules were removed in scikit-learn 0.20.
# GradientBoostingClassifier is imported from its public location,
# sklearn.ensemble.
# NOTE: model_selection.StratifiedKFold is constructed with n_splits=... and
# receives the labels through split(X, y), unlike the old
# cross_validation.StratifiedKFold(y, n_folds=...).
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
# Scoring helper.
def compute_score(clf, X, y, scoring='accuracy'):
    """Return the mean of the 5-fold cross-validation scores of `clf` on (X, y).

    `scoring` is forwarded unchanged to cross_val_score.
    """
    # cv=5: the data is split into five folds.
    fold_scores = cross_val_score(clf, X, y, cv=5, scoring=scoring)
    return np.mean(fold_scores)
  • 1
  • 2
  • 3
  • 4

特徵選擇

一個好的特徵選擇可以: 
* 1.減少數據之間的冗餘 
* 2.加速訓練過程 
* 3.防止過擬合

train_X.info()
  • 1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 67 columns):
Age               891 non-null float64
Fare              891 non-null float64
Parch             891 non-null int64
Sex               891 non-null int64
SibSp             891 non-null int64
Title_Master      891 non-null uint8
Title_Miss        891 non-null uint8
Title_Mr          891 non-null uint8
Title_Mrs         891 non-null uint8
Title_Officer     891 non-null uint8
Title_Royalty     891 non-null uint8
FamilySize        891 non-null int64
Family_Single     891 non-null int64
Family_Small      891 non-null int64
Family_Large      891 non-null int64
Embarked_C        891 non-null uint8
Embarked_Q        891 non-null uint8
Embarked_S        891 non-null uint8
Cabin_A           891 non-null uint8
Cabin_B           891 non-null uint8
Cabin_C           891 non-null uint8
Cabin_D           891 non-null uint8
Cabin_E           891 non-null uint8
Cabin_F           891 non-null uint8
Cabin_G           891 non-null uint8
Cabin_T           891 non-null uint8
Cabin_U           891 non-null uint8
Pclass_1          891 non-null uint8
Pclass_2          891 non-null uint8
Pclass_3          891 non-null uint8
Ticket_A          891 non-null uint8
Ticket_A4         891 non-null uint8
Ticket_A5         891 non-null uint8
Ticket_AQ3        891 non-null uint8
Ticket_AQ4        891 non-null uint8
Ticket_AS         891 non-null uint8
Ticket_C          891 non-null uint8
Ticket_CA         891 non-null uint8
Ticket_CASOTON    891 non-null uint8
Ticket_FC         891 non-null uint8
Ticket_FCC        891 non-null uint8
Ticket_Fa         891 non-null uint8
Ticket_LINE       891 non-null uint8
Ticket_LP         891 non-null uint8
Ticket_PC         891 non-null uint8
Ticket_PP         891 non-null uint8
Ticket_PPP        891 non-null uint8
Ticket_SC         891 non-null uint8
Ticket_SCA3       891 non-null uint8
Ticket_SCA4       891 non-null uint8
Ticket_SCAH       891 non-null uint8
Ticket_SCOW       891 non-null uint8
Ticket_SCPARIS    891 non-null uint8
Ticket_SCParis    891 non-null uint8
Ticket_SOC        891 non-null uint8
Ticket_SOP        891 non-null uint8
Ticket_SOPP       891 non-null uint8
Ticket_SOTONO2    891 non-null uint8
Ticket_SOTONOQ    891 non-null uint8
Ticket_SP         891 non-null uint8
Ticket_STONO      891 non-null uint8
Ticket_STONO2     891 non-null uint8
Ticket_STONOQ     891 non-null uint8
Ticket_SWPP       891 non-null uint8
Ticket_WC         891 non-null uint8
Ticket_WEP        891 non-null uint8
Ticket_XXX        891 non-null uint8
dtypes: float64(2), int64(7), uint8(58)
memory usage: 113.2 KB
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
# Use a random forest to estimate feature importances
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
# n_estimators: number of trees to build; max_features: cap on the number of
# features considered ('sqrt' here).  The forest is built and fitted on the
# training data in one expression.
clf = RandomForestClassifier(n_estimators=50, max_features='sqrt').fit(train_X, target_Y)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
# Table of feature importances from the fitted forest, sorted ascending and
# indexed by feature name (a larger value means a more important feature).
features = (
    pd.DataFrame({
        'feature': train_X.columns,
        'importance': clf.feature_importances_,
    })
    .sort_values(by='importance', ascending=True)
    .set_index('feature')
)
  • 1
  • 2
  • 3
  • 4
  • 5
features.plot(kind='barh',figsize=(20,20))
  • 1

這裏寫圖片描述

# Select a subset of features using the already-fitted forest `clf`
# (prefit=True: the selector does not refit the estimator).
model=SelectFromModel(clf,prefit=True)
# Reduce the training features to the selected columns.
train_reduced=model.transform(train_X)
# Display the shape of the reduced training matrix.
train_reduced.shape
  • 1
  • 2
  • 3
  • 4
(891, 13)
  • 1
  • 2
# Apply the same feature selection to the test features.
test_reduced=model.transform(test_X)
# Display the shape of the reduced test matrix.
test_reduced.shape
  • 1
  • 2
(418, 13)
  • 1
  • 2

現在我們得到了13個特徵

爲了得到最佳的預測模型,需要對模型參數進行調整

# Set to True to run the exhaustive hyper-parameter search; otherwise the
# fixed parameter set in the else-branch is used.
run_gs = False

if run_gs:
    parameter_grid = {
        'max_depth': [4, 6, 8],
        'n_estimators': [50, 10],
        # 'auto' is intentionally absent: for classifiers it was an alias of
        # 'sqrt' (so it only duplicated grid points) and recent scikit-learn
        # releases reject it.
        'max_features': ['sqrt', 'log2'],
        'min_samples_split': [2, 3, 10],
        'min_samples_leaf': [1, 3, 10],
        'bootstrap': [True, False],
    }
    forest = RandomForestClassifier()
    # An integer cv with a classifier makes GridSearchCV use stratified
    # k-fold splitting, i.e. the same 5 stratified folds as an explicit
    # StratifiedKFold object, without depending on that class's constructor
    # signature (which differs between scikit-learn versions).
    cross_validation = 5
    # Search the grid for the best parameters with GridSearchCV.
    grid_search = GridSearchCV(
        forest,
        scoring='accuracy',
        param_grid=parameter_grid,
        cv=cross_validation,
    )
    grid_search.fit(train_X, target_Y)
    model = grid_search
    parameters = grid_search.best_params_

    print('Best score:{}'.format(grid_search.best_score_))
    print('Best parameters:{}'.format(grid_search.best_params_))
else:
    # Fixed hyper-parameters used when the grid search is skipped.
    parameters = {
        'bootstrap': False,
        'min_samples_leaf': 3,
        'n_estimators': 50,
        'min_samples_split': 10,
        'max_features': 'sqrt',
        'max_depth': 6,
    }

    model = RandomForestClassifier(**parameters)
    model.fit(train_X, target_Y)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
compute_score(model, train_X, target_Y, scoring='accuracy')
  • 1
0.8271904225390074
  • 1
  • 2

輸出結果

# Predict survival for the test passengers (as integers) and write the
# PassengerId / Survived pairs to pred.csv without the row index.
output = model.predict(test_X).astype(int)
outputDf = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': output,
})
outputDf.to_csv(path + 'pred.csv', index=False)
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章