【Kaggle入門】Titanic: Machine Learning from Disaster----模型優化嘗試(三)


這個系列博客純粹爲了記錄一下自己學習kaggle的相關內容,也是跟着別人一步步學習。


雖然這篇是模型優化嘗試(三),但其實並不是用到的第三種方法。在上一篇博客【Kaggle入門】Titanic: Machine Learning from Disaster----模型優化嘗試(二)之後,我嘗試了很多種新增或組合的特徵,最後發現單一的新增或者組合某種特徵,基本上不可能對結果的提升有幫助,最後還是跟着前輩的思路組合了多種特徵,纔對結果起了積極的作用。

import pandas as pd
import numpy as np
from pandas import Series,DataFrame
# Load the training CSV from the local data/ directory into a DataFrame.
data_train = pd.read_csv(filepath_or_buffer="data/train.csv")

接下來就是對數據的處理,新增或者組合原來的特徵。

# Title keywords searched for in a passenger name, in priority order.
# 'Mrs' must come before 'Mr' because the lookup is a plain substring match.
_TITLE_KEYWORDS = ['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                   'Dr', 'Ms', 'Mlle', 'Col', 'Capt', 'Mme', 'Countess',
                   'Don', 'Jonkheer']

# Rare titles folded onto the four common ones ('Dr' and '' depend on sex
# and are handled separately in _normalize_title).
_TITLE_GROUPS = {
    'Mr': 'Mr', 'Don': 'Mr', 'Major': 'Mr', 'Capt': 'Mr',
    'Jonkheer': 'Mr', 'Rev': 'Mr', 'Col': 'Mr',
    'Master': 'Master',
    'Countess': 'Mrs', 'Mme': 'Mrs', 'Mrs': 'Mrs',
    'Mlle': 'Miss', 'Ms': 'Miss', 'Miss': 'Miss',
}


def _find_title(name):
    """Return the first keyword of _TITLE_KEYWORDS contained in *name*, or ''."""
    if not isinstance(name, str):
        return ''
    for keyword in _TITLE_KEYWORDS:
        if keyword in name:
            return keyword
    return ''


def _normalize_title(title, sex):
    """Map a raw title (plus the passenger's sex) onto Mr / Mrs / Miss / Master."""
    # The comparison is case-insensitive: comparing against 'Male' never
    # matched a lowercase 'male' value and sent every male 'Dr' to 'Mrs'.
    is_male = str(sex).strip().lower() == 'male'
    if title == 'Dr':
        return 'Mr' if is_male else 'Mrs'
    if title == '':
        return 'Master' if is_male else 'Miss'
    return _TITLE_GROUPS.get(title, title)


def _standardize(series):
    """Return *series* scaled to zero mean and unit population std (ddof=0)."""
    values = series.astype(float)
    spread = values.std(ddof=0)
    # A constant (or all-NaN) column has no usable spread; leave it unscaled
    # instead of dividing by zero.
    if not spread > 0:
        spread = 1.0
    return (values - values.mean()) / spread


def clean_data(df):
    """Engineer the model features for a Titanic passenger table.

    Args:
        df: DataFrame with the columns Name, Sex, Age, SibSp, Parch, Ticket,
            Fare, Cabin, Embarked and Pclass. It is modified in place: missing
            Fare/Embarked values are filled, Cabin is replaced by 'Yes'/'No',
            and the helper columns Title, child, mother, Family, AgeFill,
            Age_scaled and Fare_scaled are added.

    Returns:
        A new DataFrame holding the remaining columns of ``df`` plus one-hot
        columns for Cabin, Embarked, Sex, Pclass and Title. The raw columns
        Pclass, Name, Sex, Ticket, Cabin, Embarked, Age, AgeFill and Title
        are dropped.

    Note:
        The per-title mean ages and the scaling statistics are computed from
        the frame passed in, so separate calls (e.g. train and test) are
        filled and scaled independently of each other.
    """
    df['Fare'] = df['Fare'].fillna(0)

    # Derive a Title column from the name, then collapse rare titles.
    raw_titles = df['Name'].map(_find_title)
    df['Title'] = [_normalize_title(title, sex)
                   for title, sex in zip(raw_titles, df['Sex'])]

    # Missing Embarked values are filled with 'S'.
    df['Embarked'] = df['Embarked'].fillna('S')

    # Cabin is reduced to a known / unknown flag.
    df['Cabin'] = np.where(df['Cabin'].notnull(), 'Yes', 'No')

    # child: 1 for passengers aged 12 or younger (unknown age counts as 0).
    df['child'] = (df['Age'] <= 12).astype(int)

    # mother: 1 for a 'Mrs' with Parch greater than 1.
    df['mother'] = ((df['Title'] == 'Mrs') & (df['Parch'] > 1)).astype(int)

    # Family: product of SibSp and Parch.
    df['Family'] = df['SibSp'] * df['Parch']

    # Missing ages are replaced by the mean age of passengers sharing the title.
    mean_age_by_title = df.groupby('Title')['Age'].transform('mean')
    df['AgeFill'] = df['Age'].fillna(mean_age_by_title)

    df['Age_scaled'] = _standardize(df['AgeFill'])
    df['Fare_scaled'] = _standardize(df['Fare'])

    categorical_columns = ['Cabin', 'Embarked', 'Sex', 'Pclass', 'Title']
    dummy_frames = [pd.get_dummies(df[column], prefix=column)
                    for column in categorical_columns]

    data_df = pd.concat([df] + dummy_frames, axis=1)
    data_df = data_df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin',
                            'Embarked', 'Age', 'AgeFill', 'Title'], axis=1)

    return data_df
from sklearn import linear_model

# --- Train a logistic-regression model on the engineered training features ---
train_data = clean_data(data_train)
train_data.info()

# Columns fed to the model. The engineered column is named 'mother'
# (lowercase); the previous alternative 'Mother' never matched it, so the
# feature was silently left out.
feature_regex = 'Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|mother|child|Family|Title_.*'
train_df = train_data.filter(regex=feature_regex)
feature_columns = [column for column in train_df.columns if column != 'Survived']

# DataFrame.as_matrix() no longer exists in current pandas; build the matrix
# via .values with the label explicitly placed in column 0.
train_np = train_df[['Survived'] + feature_columns].astype(float).values

y = train_np[:, 0]
X = train_np[:, 1:]

# The L1 penalty needs a solver that supports it; 'liblinear' does, whereas
# the newer default solver rejects penalty='l1'.
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', solver='liblinear', tol=1e-6)
clf.fit(X, y)

# --- Apply the same feature pipeline to the test set and write the submission ---
data_test = pd.read_csv('data/test.csv')
test_data = clean_data(data_test)
test_data.info()
# Select exactly the training feature columns, in the same order; a dummy
# column that is absent from the test set is filled with 0.
test_df = test_data.reindex(columns=feature_columns, fill_value=0)
predictions = clf.predict(test_df.astype(float).values)
result = pd.DataFrame({'PassengerId': data_test['PassengerId'].values, 'Survived': predictions.astype(np.int32)})
result.to_csv('titanic_predictions.csv', index=False)

將結果提交到kaggle,得分0.77990。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章