Titanic Survival Prediction

Dataset download link:

https://github.com/fayduan/Kaggle_Titanic/blob/master/train.csv

This write-up follows the blog post at https://www.cnblogs.com/cxfly/p/8505851.html, updating some of the deprecated methods it uses along the way.

import pandas as pd

titanic = pd.read_csv('/usr/local/app/train.csv')

# fill missing ages with the median age
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())

print(titanic['Sex'].unique())
# encode Sex numerically: male -> 0, female -> 1
titanic.loc[titanic['Sex'] == 'male', 'Sex'] = 0
titanic.loc[titanic['Sex'] == 'female', 'Sex'] = 1

print(titanic['Embarked'].unique())

# fill missing embarkation ports with the most common value 'S', then encode numerically
titanic['Embarked'] = titanic['Embarked'].fillna('S')
titanic.loc[titanic['Embarked'] == 'S', 'Embarked'] = 0
titanic.loc[titanic['Embarked'] == 'C', 'Embarked'] = 1
titanic.loc[titanic['Embarked'] == 'Q', 'Embarked'] = 2
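
For reference, the same encoding can be written more compactly with pandas' Series.map, which swaps each category for its code in one call (a minimal alternative sketch of my own, meant to replace the .loc lines above on the freshly loaded DataFrame):

# equivalent one-call encoding with map() instead of repeated .loc assignments
titanic['Sex'] = titanic['Sex'].map({'male': 0, 'female': 1})
titanic['Embarked'] = titanic['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2})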
=======================================================================

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold  # cross-validation: split the training data into folds and average the results

predictors = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']  # features used by the model
print(titanic[predictors])
alg = LinearRegression()

# random_state only takes effect with shuffle=True in current scikit-learn; plain
# splits also keep the concatenated fold predictions aligned with the original row order
kf = KFold(n_splits=3)
predictions = []
for train, test in kf.split(titanic[predictors]):
    train_predictors = titanic[predictors].iloc[train, :]  # feature rows for this fold's training indices
    train_target = titanic['Survived'].iloc[train]         # matching label values
    alg.fit(train_predictors, train_target)                # fit the linear model: X = training features, y = labels
    test_prediction = alg.predict(titanic[predictors].iloc[test, :])  # predict on the held-out rows
    predictions.append(test_prediction)

======================================================================

import numpy as np
predictions = np.concatenate(predictions, axis=0)
# threshold the regression output at 0.5 to get a 0/1 class label
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0
accuracy = sum(predictions == titanic['Survived']) / len(predictions)  # fraction of rows predicted correctly
print(accuracy)
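
The same number can be obtained with scikit-learn's built-in metric (a one-line equivalent, assuming the predictions array computed above):

from sklearn.metrics import accuracy_score
print(accuracy_score(titanic['Survived'], predictions))  # same accuracy, computed by sklearn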

=======================================================================

# Ensemble algorithms: build multiple classification trees
# 1. Start by constructing a single classifier as a baseline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
alg = LogisticRegression(random_state=1)
scores = cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=3)
print(scores.mean())
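
To actually build multiple classification trees, as the comment above suggests, a random forest is the natural next step. A minimal sketch (RandomForestClassifier and its parameters here are my illustration, not code from the referenced post):

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# 50 trees; min_samples_split / min_samples_leaf rein in overfitting on this small dataset
alg = RandomForestClassifier(random_state=1, n_estimators=50,
                             min_samples_split=4, min_samples_leaf=2)
scores = cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=3)
print(scores.mean())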

========================================================================

# Feature extraction
## Extract as many candidate features as possible,
# then compare how each one affects the result.
# Feature extraction is a crucial part of data mining:
# the features used so far already existed in the data, but real projects
# often have no ready-made features, so we have to construct our own.
# 1. Combine several columns into one feature
titanic['Familysize'] = titanic['SibSp'] + titanic['Parch']  # family size = siblings/spouses + parents/children aboard
titanic['NameLength'] = titanic['Name'].apply(lambda x: len(x))  # length of the passenger's name

import re

# extract the honorific (Mr, Mrs, Miss, ...) from the Name column
def get_title(name):
    title_search = re.search(r'([A-Za-z]+)\.', name)  # raw string avoids the invalid-escape warning
    if title_search:
        return title_search.group(1)
    return ""

titles = titanic['Name'].apply(get_title)
# convert each title to a numeric code
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Col": 7, "Major": 8, "Mlle": 9, "Countess": 10, "Ms": 11, "Lady": 12, "Jonkheer": 13, "Don": 14, "Mme": 15, "Capt": 16, "Sir": 17}
for k, v in title_mapping.items():
    titles[titles == k] = v

titanic['titles'] = titles
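
Before using the new column it is worth checking that the mapping covered every title in the data (a small sanity check I added):

# every entry should now be numeric; any remaining string is an unmapped title
print(titles.value_counts())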
========================================================================

# Feature selection
# Feature-importance analysis: measure how each feature affects the final result.
# One approach: replace a feature's values with noise (leaving the other columns
# unchanged) and measure the error rate again; the difference between the two
# error rates reflects how important that feature is.
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif  # feature_selection scores each feature's importance
import matplotlib.pyplot as plt

predictors = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked','Familysize','NameLength','titles']
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic['Survived'])
scores = -np.log10(selector.pvalues_)  # convert p-values so that higher means more important

# plot each feature's score
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
plt.show()
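
SelectKBest with f_classif scores each feature with a univariate F-test; the noise-replacement procedure sketched in the comments above is essentially permutation importance. A minimal version of that idea using scikit-learn's permutation_importance (my addition, not part of the referenced post):

from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

# fit a model, then shuffle one column at a time and record how much the score drops
model = RandomForestClassifier(random_state=1)
model.fit(titanic[predictors].astype(float), titanic['Survived'])
result = permutation_importance(model, titanic[predictors].astype(float),
                                titanic['Survived'], n_repeats=10, random_state=1)
for name, importance in zip(predictors, result.importances_mean):
    print(name, importance)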

# Ensemble of classifiers
# A common competition trick: combine several algorithms and average their
# predictions to reduce overfitting.
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np
# GradientBoostingClassifier is another tree ensemble: it combines many weak
# classifiers into one strong classifier by building the trees sequentially

algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3),
     ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked','Familysize','NameLength','titles']
    ],
    [LogisticRegression(random_state=1),
     ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked','Familysize','NameLength','titles']
    ]
]


kf = KFold(n_splits=3)
predictions = []
for train, test in kf.split(titanic[predictors]):
    train_target = titanic['Survived'].iloc[train]
    full_test_predictions = []
    for alg, predictor_cols in algorithms:  # each algorithm carries its own feature list
        alg.fit(titanic[predictor_cols].iloc[train, :].astype(float), train_target)
        # predict_proba returns class probabilities; column 1 is P(Survived = 1)
        test_prediction = alg.predict_proba(titanic[predictor_cols].iloc[test, :].astype(float))[:, 1]
        full_test_predictions.append(test_prediction)
    # average the two models' probabilities, then threshold at 0.5
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
    test_predictions[test_predictions > .5] = 1
    test_predictions[test_predictions <= .5] = 0
    predictions.append(test_predictions)

predictions = np.concatenate(predictions, axis=0)
accuracy = sum(predictions == titanic['Survived']) / len(predictions)
print(accuracy)

 
