%matplotlib inline
import numpy as np
import pandas as pd
import re
train = pd.read_csv("./data/train.csv",header = 0,dtype={'Age':np.float64})
test = pd.read_csv("./data/test.csv",header = 0,dtype={"Age":np.float64})
full_data = [train,test]
print(train.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None
Feature Engineering
1 Pclass
There are no missing values in this feature and it is already numerical, so let's check its impact on our training set.
# Group by Pclass and compute the mean survival rate
print(train[['Pclass','Survived']].groupby(['Pclass'],as_index=False).mean())
Pclass Survived
0 1 0.629630
1 2 0.472826
2 3 0.242363
2 Sex
print(train[["Sex","Survived"]].groupby(['Sex'],as_index=False).mean())
Sex Survived
0 female 0.742038
1 male 0.188908
3 SibSp and Parch
With the number of siblings/spouses and the number of children/parents aboard, we can create a new feature called FamilySize.
train.head()
| | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
print(train[['FamilySize','Survived']].groupby(['FamilySize'],as_index=False).mean())
FamilySize Survived
0 1 0.303538
1 2 0.552795
2 3 0.578431
3 4 0.724138
4 5 0.200000
5 6 0.136364
6 7 0.333333
7 8 0.000000
8 11 0.000000
for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize']==1,'IsAlone'] = 1
print(train[['IsAlone','Survived']].groupby(['IsAlone'],as_index=False).mean())
# This result shows that passengers from small families had a higher survival rate than those travelling alone.
IsAlone Survived
0 0 0.505650
1 1 0.303538
4 Embarked
The Embarked feature has a couple of missing values; we fill them with the most frequent value ('S').
result = train.groupby('Embarked')['Embarked'].count()
result
Embarked
C 168
Q 77
S 644
Name: Embarked, dtype: int64
result['S']
644
result[result.max()==result].index
Index(['S'], dtype='object', name='Embarked')
result[result.max()==result].index[0]
'S'
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna(result[result.max()==result].index[0])
print(train[['Embarked','Survived']].groupby(['Embarked'],as_index=False).mean())
Embarked Survived
0 C 0.553571
1 Q 0.389610
2 S 0.339009
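As an aside, result[result.max()==result].index[0] is simply the most frequent value (the mode) of the Embarked column, so the same fill can be written more directly with Series.mode(). A minimal equivalent sketch of the loop above, shown only as an alternative:
most_common = train['Embarked'].mode()[0]   # 'S'
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna(most_common)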
5 Fare
Fare also has a missing value (in the test set); we replace it with the median and then bin the fares into 4 quantile-based ranges.
train['Fare'].median()
14.4542
for dataset in full_data:
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())
train['CategoricalFare'] = pd.qcut(train['Fare'],4)
print(train[['CategoricalFare','Survived']].groupby(['CategoricalFare'],as_index=False).mean())
CategoricalFare Survived
0 (-0.001, 7.91] 0.197309
1 (7.91, 14.454] 0.303571
2 (14.454, 31.0] 0.454955
3 (31.0, 512.329] 0.581081
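The quartile edges that qcut produces here are the same thresholds (roughly 7.91, 14.454 and 31.0) that get hard-coded in the Data Cleaning section below. If you want them explicitly, retbins=True returns them; a small sketch:
# retrieve the quartile edges computed by qcut (approximately [0.0, 7.91, 14.454, 31.0, 512.329])
_, fare_bins = pd.qcut(train['Fare'], 4, retbins=True)
print(fare_bins)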
6 Age
We have plenty of missing values in this feature. We fill them with random integers drawn between (mean - std) and (mean + std), then bin the ages into 5 ranges.
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()  # number of missing Age values
    age_null_random_list = np.random.randint(int(age_avg - age_std), int(age_avg + age_std), size=age_null_count)
    # assign with .loc to avoid pandas' SettingWithCopyWarning from chained indexing
    dataset.loc[dataset['Age'].isnull(), 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)
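Note that random imputation injects noise and gives slightly different results on every run. A common deterministic alternative (not what this notebook does) would be to fill missing ages with the median; a minimal sketch:
# Hypothetical alternative: median imputation instead of random draws
for dataset in full_data:
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())
    dataset['Age'] = dataset['Age'].astype(int)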
train['CategoricalAge'] = pd.cut(train['Age'],5)  # split into 5 equal-width bins
print(train[['CategoricalAge','Survived']].groupby('CategoricalAge',as_index=False).mean())
CategoricalAge Survived
0 (-0.08, 16.0] 0.525862
1 (16.0, 32.0] 0.363029
2 (32.0, 48.0] 0.353659
3 (48.0, 64.0] 0.434783
4 (64.0, 80.0] 0.090909
# Compare cut and qcut: cut makes equal-width bins, while qcut makes equal-frequency (quantile) bins
train['CategoricalAge'] = pd.qcut(train['Age'],5)
print(train[['CategoricalAge','Survived']].groupby('CategoricalAge',as_index=False).mean())
CategoricalAge Survived
0 (-0.001, 19.0] 0.455882
1 (19.0, 25.0] 0.329545
2 (25.0, 31.0] 0.385093
3 (31.0, 40.0] 0.382199
4 (40.0, 80.0] 0.352201
7 Name
Inside this feature we can find each passenger's title.
title: a word (such as Mr, Mrs or Dr) placed before a person's name to indicate status, profession or marital state.
def get_title(name):
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # if the title exists, extract and return it
    if title_search:
        return title_search.group(1)
    return ""
for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(get_title)
# A cross-tabulation (crosstab) is a special pivot table used to compute group frequencies.
print(pd.crosstab(train['Title'],train['Sex']))
Sex female male
Title
Capt 0 1
Col 0 2
Countess 1 0
Don 0 1
Dr 1 6
Jonkheer 0 1
Lady 1 0
Major 0 2
Master 0 40
Miss 182 0
Mlle 2 0
Mme 1 0
Mr 0 517
Mrs 125 0
Ms 1 0
Rev 0 6
Sir 0 1
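As an aside, the same extraction can be done without a Python-level loop by using pandas' vectorized string methods; a minimal sketch equivalent to the get_title/apply combination above:
for dataset in full_data:
    dataset['Title'] = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False).fillna("")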
Now we have the titles. Let's group the rare ones together and check the impact of Title on the survival rate.
for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess', 'Capt', 'Col',
                                                 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    print(set(dataset['Title']))
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    print(set(dataset['Title']))
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    print(set(dataset['Title']))
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
print(train[['Title', 'Survived']].groupby(['Title'], as_index=False).mean())
{'Rare', 'Mlle', 'Miss', 'Mme', 'Master', 'Mr', 'Mrs', 'Ms'}
{'Rare', 'Miss', 'Mme', 'Master', 'Mr', 'Mrs', 'Ms'}
{'Rare', 'Miss', 'Mme', 'Master', 'Mr', 'Mrs'}
{'Rare', 'Miss', 'Master', 'Mr', 'Mrs', 'Ms'}
{'Rare', 'Miss', 'Master', 'Mr', 'Mrs', 'Ms'}
{'Rare', 'Miss', 'Master', 'Mr', 'Mrs'}
Title Survived
0 Master 0.575000
1 Miss 0.702703
2 Mr 0.156673
3 Mrs 0.793651
4 Rare 0.347826
Data Cleaning
Great! Now let's clean our data and map the features to numerical values.
print(set(train['Sex']))
print(set(test['Sex']))
{'male', 'female'}
{'male', 'female'}
for dataset in full_data:
    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map({'female':0,'male':1}).astype(int)
    # Mapping titles (each title gets its own code; note "Miss" maps to 2 so it is not merged with "Mr")
    title_mapping = {"Mr":1,"Miss":2,"Mrs":3,"Master":4,"Rare":5}
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)
    # Mapping Embarked
    dataset['Embarked'] = dataset['Embarked'].map({'S':0,'C':1,'Q':2}).astype(int)
    # Mapping Fare (thresholds are the CategoricalFare quartile edges found above)
    dataset.loc[dataset['Fare'] <= 7.91,'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454),'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31),'Fare'] = 2
    dataset.loc[dataset['Fare'] > 31,'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
    # Mapping Age (thresholds are the CategoricalAge equal-width bin edges found above)
    dataset.loc[dataset['Age'] <= 16,'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32),'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48),'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64),'Age'] = 3
    dataset.loc[dataset['Age'] > 64,'Age'] = 4
# Feature Selection
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp','Parch', 'FamilySize']
print("has columns ",train.columns)
print("drop elements ",drop_elements)
train = train.drop(drop_elements,axis=1)
train = train.drop(['CategoricalAge','CategoricalFare'],axis=1)
test = test.drop(drop_elements,axis=1)
print(train.head(10))
train = train.values
test = test.values
has columns Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'FamilySize', 'IsAlone',
'CategoricalFare', 'CategoricalAge', 'Title'],
dtype='object')
drop elements ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'FamilySize']
Survived Pclass Sex Age Fare Embarked IsAlone Title
0 0 3 1 1 0 0 0 1
1 1 1 0 2 3 1 0 3
2 1 3 0 1 1 0 1 1
3 1 1 0 2 3 0 0 3
4 0 3 1 2 1 0 1 1
5 0 3 1 2 1 2 1 1
6 0 1 1 3 3 0 1 1
7 0 3 1 0 2 0 0 4
8 1 3 0 1 1 0 0 3
9 1 2 0 0 2 1 0 3
Good! Now we have a clean dataset and are ready to predict. Let's find out which classifier works best on this dataset.
Classifier Comparison
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score,log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis,QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
classifiers = [
KNeighborsClassifier(3),
SVC(probability=True),
DecisionTreeClassifier(),
RandomForestClassifier(),
AdaBoostClassifier(),
GradientBoostingClassifier(),
GaussianNB(),
LinearDiscriminantAnalysis(),
QuadraticDiscriminantAnalysis(),
LogisticRegression()
]
log_cols = ['Classifier','Accuracy']
log = pd.DataFrame(columns=log_cols)
sss = StratifiedShuffleSplit(n_splits=10,test_size=0.1,random_state=0)
X = train[0::,1::]
y = train[0::,0]
acc_dict = {}
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    for clf in classifiers:
        name = clf.__class__.__name__
        clf.fit(X_train, y_train)
        train_predictions = clf.predict(X_test)
        acc = accuracy_score(y_test, train_predictions)
        if name in acc_dict:
            acc_dict[name] += acc
        else:
            acc_dict[name] = acc

# average the accuracies over the 10 splits and log them
for clf in acc_dict:
    acc_dict[clf] = acc_dict[clf] / 10.0
    log_entry = pd.DataFrame([[clf, acc_dict[clf]]], columns=log_cols)
    log = log.append(log_entry)
plt.xlabel('Accuracy')
plt.title('Classifier Accuracy')
sns.set_color_codes("muted")
sns.barplot(x='Accuracy', y='Classifier', data=log, color="b")
<matplotlib.axes._subplots.AxesSubplot at 0x14e11cbba58>
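For reference, roughly the same comparison can be written more compactly with sklearn's cross_val_score (plain 10-fold cross-validation instead of the stratified shuffle splits above); a sketch:
from sklearn.model_selection import cross_val_score

for clf in classifiers:
    scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
    print(clf.__class__.__name__, scores.mean())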
Prediction
candidate_classifier = SVC()
candidate_classifier.fit(train[0::, 1::], train[0::, 0])
result = candidate_classifier.predict(test)
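To turn these predictions into a Kaggle submission file, they can be paired with the PassengerId column from the original test CSV (re-read here because that column was dropped earlier); a sketch, assuming the standard submission format:
passenger_ids = pd.read_csv("./data/test.csv")['PassengerId']
submission = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': result.astype(int)})
submission.to_csv('submission.csv', index=False)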