數據清洗完整代碼
import re
#加載正則表達式庫
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
#Feature engineering: load the raw data and merge train/test so that every
#transformation below is applied to both sets at once.
train_df_org=pd.read_csv('train.csv')
test_df_org=pd.read_csv('test.csv')
test_df_org['Survived']=0  # placeholder so the test frame has the same columns
# DataFrame.append was deprecated and removed in pandas 2.0;
# pd.concat([...], sort=True) reproduces append(..., sort=True) exactly.
combined_train_test=pd.concat([train_df_org,test_df_org],sort=True)
PassengerId=test_df_org['PassengerId']
#---Embarked---
# Missing embarkation ports -> most frequent port
combined_train_test['Embarked'].fillna(combined_train_test['Embarked'].mode().iloc[0], inplace=True)
# Factorize Embarked so the later feature analysis works on integer codes
combined_train_test['Embarked'] = pd.factorize(combined_train_test['Embarked'])[0]
# One-hot encode with pd.get_dummies
emb_dummies_df = pd.get_dummies(combined_train_test['Embarked'], prefix=combined_train_test[['Embarked']].columns[0])
combined_train_test = pd.concat([combined_train_test, emb_dummies_df], axis=1)
#---Sex---
# Sex: factorize then one-hot encode, mirroring the Embarked treatment
print('# Sex_Dummy...')
combined_train_test['Sex']=pd.factorize(combined_train_test['Sex'])[0]
sex_dummies_df=pd.get_dummies(combined_train_test['Sex'],prefix=combined_train_test[['Sex']].columns[0])
combined_train_test=pd.concat([combined_train_test,sex_dummies_df],axis=1)
# print(combined_train_test.head())
# print(combined_train_test.head())
#---Name---
#Extract the honorific (Title) between the comma and the period of each name,
#e.g. "Braund, Mr. Owen Harris" -> "Mr".
#Raw string fixes the invalid "\." escape warning, and compiling once outside
#the lambda avoids recompiling the pattern for every row.
title_re=re.compile(r", (.*?)\.")
combined_train_test['Title']=combined_train_test['Name'].map(lambda x:title_re.findall(x)[0])
#Normalize the many raw titles into six canonical groups
print('# New_Name_Title...')
title_Dict={}
title_Dict.update(dict.fromkeys(['Capt','Col','Major','Dr','Rev'],'Officer'))
title_Dict.update(dict.fromkeys(['Don','Sir','the Countess','Dona','Lady'],'Royalty'))
title_Dict.update(dict.fromkeys(['Mme','Ms','Mrs'],'Mrs'))
title_Dict.update(dict.fromkeys(['Mlle','Miss'],'Miss'))
title_Dict.update(dict.fromkeys(['Mr'],'Mr'))
title_Dict.update(dict.fromkeys(['Master','Jonkheer'],'Master'))
combined_train_test['Title']=combined_train_test['Title'].map(title_Dict)
# print(combined_train_test)
# print(combined_train_test.groupby(['Title','Survived'])['Survived'].count())
#Title factorizing, then one-hot encoding of the integer codes
print('#Title_factorizing...')
combined_train_test['Title']=pd.factorize(combined_train_test['Title'])[0]
title_dummies_df=pd.get_dummies(combined_train_test['Title'],prefix=combined_train_test[['Title']].columns[0])
combined_train_test=pd.concat([combined_train_test,title_dummies_df],axis=1)
# print(combined_train_test.groupby(['Title','Survived'])['Survived'].count())
#New feature: Name_length, the length of the full name string
print('# New_Name_length...')
combined_train_test['Name_length']=combined_train_test['Name'].apply(len)
#---Fare---
#Missing Fare: fill with the mean fare of the passenger's Pclass.
#(The original filled from groupby(...).transform(np.mean) over *all* columns;
# selecting the Fare column first is the intended behaviour and avoids the
# deprecated callable form of transform.)
print('# Loss Value Processing : Fare...')
combined_train_test['Fare']=combined_train_test['Fare'].fillna(combined_train_test.groupby('Pclass')['Fare'].transform('mean'))
#Shared tickets carry the total fare once per passenger: split it evenly
#across everyone on the same ticket.
print('#Duplicate Value Processing:Fare...')
combined_train_test['Group_Ticket']=combined_train_test['Fare'].groupby(by=combined_train_test['Ticket']).transform('count')
combined_train_test['Fare']=combined_train_test['Fare']/combined_train_test['Group_Ticket']
combined_train_test.drop(['Group_Ticket'],axis=1,inplace=True)
# print(combined_train_test.info)
#New feature: Fare_bin, five quantile bins of the per-person fare
print('#Box:Fare...')
combined_train_test['Fare_bin']=pd.qcut(combined_train_test['Fare'],5)
# print(combined_train_test.groupby(['Fare_bin','Survived'])['Survived'].count())
combined_train_test['Fare_bin_id']=pd.factorize(combined_train_test['Fare_bin'])[0]
#Bug fix: .rename(columns=...) must apply to the get_dummies *result* (a
#DataFrame), not to the input Series, so the dummy columns come out as
#Fare_0..Fare_4 as intended.
fare_bin_dummies_df=pd.get_dummies(combined_train_test['Fare_bin_id']).rename(columns=lambda x:'Fare_'+str(x))
combined_train_test=pd.concat([combined_train_test,fare_bin_dummies_df],axis=1)
combined_train_test.drop(['Fare_bin'],axis=1,inplace=True)
#---Pclass field--- build the Pclass Fare Category feature
def pclass_fare_category(df, pclass1_mean_fare, pclass2_mean_fare, pclass3_mean_fare):
    """Label one passenger row as '<PclassN>_Low' or '<PclassN>_High'.

    The row's Fare is compared against the mean fare supplied for its
    Pclass; fares at or below the mean are 'Low', above are 'High'.
    Rows whose Pclass is not 1, 2 or 3 get None (same as the original
    falling off the end of the if/elif chain).
    """
    labels = {
        1: (pclass1_mean_fare, 'Pclass1_Low', 'Pclass1_High'),
        2: (pclass2_mean_fare, 'Pclass2_Low', 'Pclass2_High'),
        3: (pclass3_mean_fare, 'Pclass3_Low', 'Pclass3_High'),
    }
    entry = labels.get(df['Pclass'])
    if entry is None:
        return None
    mean_fare, low_label, high_label = entry
    return low_label if df['Fare'] <= mean_fare else high_label
# Mean fare of each passenger class, used as the Low/High threshold
# inside pclass_fare_category (computed once so apply() does not redo it).
Pclass1_mean_fare=combined_train_test['Fare'].groupby(by=combined_train_test['Pclass']).mean().get([1]).values[0]
Pclass2_mean_fare=combined_train_test['Fare'].groupby(by=combined_train_test['Pclass']).mean().get([2]).values[0]
Pclass3_mean_fare=combined_train_test['Fare'].groupby(by=combined_train_test['Pclass']).mean().get([3]).values[0]
# Row-wise label such as 'Pclass1_Low' combining class and fare level
combined_train_test['Pclass_Fare_Category']=combined_train_test.apply(pclass_fare_category,args=(Pclass1_mean_fare,Pclass2_mean_fare,Pclass3_mean_fare),axis=1)
print('# Pclass_Fare_Category...')
# print(combined_train_test.groupby(['Pclass_Fare_Category','Survived'])['Survived'].count())
pclass_level=LabelEncoder() # encode the six category labels as integers
pclass_level.fit(np.array(['Pclass1_Low','Pclass1_High','Pclass2_Low','Pclass2_High','Pclass3_Low','Pclass3_High']))
combined_train_test['Pclass_Fare_Category']=pclass_level.transform(combined_train_test['Pclass_Fare_Category'])
# one-hot encode the integer codes as Pclass_0..Pclass_5
pclass_dummies_df=pd.get_dummies(combined_train_test['Pclass_Fare_Category']).rename(columns=lambda x:'Pclass_'+str(x))
combined_train_test=pd.concat([combined_train_test,pclass_dummies_df],axis=1)
# print(combined_train_test)
# print(combined_train_test)
#新增字段Family_Size
def family_size_category(family_size):
    """Bucket a family size into 'Single' (<=1), 'Small_Family' (2-4)
    or 'Large_Family' (>4)."""
    if family_size > 4:
        return 'Large_Family'
    if family_size > 1:
        return 'Small_Family'
    return 'Single'
#Parch and SibSp handling: derive family-size features
print('# New_Family_Size...')
# parents/children + siblings/spouses + the passenger themselves
combined_train_test['Family_Size']=combined_train_test['Parch']+combined_train_test['SibSp']+1
combined_train_test['Family_Size_Category']=combined_train_test['Family_Size'].map(family_size_category)
le_family=LabelEncoder()  # encode the three category labels as integers
le_family.fit(np.array(['Single','Small_Family','Large_Family']))
combined_train_test['Family_Size_Category']=le_family.transform(combined_train_test['Family_Size_Category'])
family_size_dummies_df=pd.get_dummies(combined_train_test['Family_Size_Category'],prefix=combined_train_test[['Family_Size_Category']].columns[0])
combined_train_test=pd.concat([combined_train_test,family_size_dummies_df],axis=1)
# print(combined_train_test)
# print(combined_train_test)
#Missing-value handling: predict missing Age with a RandomForestRegressor
#trained on the rows whose Age is known.
print('# Loss Value Processing:Age...')
# predictor columns (all numeric after the factorizing above); Age comes first
age_df=combined_train_test[['Age','Embarked','Sex','Title','Name_length','Family_Size','Family_Size_Category','Fare','Fare_bin_id','Pclass']]
age_df_notnull=age_df.loc[(combined_train_test['Age'].notnull())]
age_df_isnull=age_df.loc[(combined_train_test['Age'].isnull())]
X=age_df_notnull.values[:,1:]  # features: everything except Age
Y=age_df_notnull.values[:,0]   # target: Age
#Age handling: fit the regressor, then fill the gaps with its predictions
RFR=RandomForestRegressor(n_estimators=1000,n_jobs=1 ) #1000 decision trees
RFR.fit(X,Y)
predictAges=RFR.predict(age_df_isnull.values[:,1:]) #predict from the non-Age columns of rows whose Age is missing
combined_train_test.loc[combined_train_test['Age'].isnull(),['Age']]=predictAges
# print(age_df_isnull)
# print(combined_train_test)
#Ticket handling: keep the ticket's alphabetic prefix as a categorical feature
print('# factorize:Ticket...')
combined_train_test['Ticket_Letter']=combined_train_test['Ticket'].str.split().str[0]
# purely numeric tickets all collapse into the single category '0'
combined_train_test['Ticket_Letter']=combined_train_test['Ticket_Letter'].apply(lambda x:'0' if x.isnumeric() else x)
combined_train_test['Ticket_Letter']=pd.factorize(combined_train_test['Ticket_Letter'])[0]
print(combined_train_test['Ticket_Letter'])
print(combined_train_test)
#Cabin handling: cabin numbers are mostly missing, so reduce the column to
#has-cabin (1) vs missing (0)
print('# Loss Value Processing:Cabin...')
combined_train_test.loc[combined_train_test.Cabin.isnull(),'Cabin']='0'
combined_train_test['Cabin']=combined_train_test['Cabin'].apply(lambda x:0 if x=='0' else 1)
# print(combined_train_test['Cabin'])
#Standardize (zero mean, unit variance) the continuous Age/Fare/Name_length columns
print('# Normalization:Age&fare...')
scale_age_fare=preprocessing.StandardScaler().fit(combined_train_test[['Age','Fare','Name_length']])
combined_train_test[['Age','Fare','Name_length']]=scale_age_fare.transform(combined_train_test[['Age','Fare','Name_length']])
#save the cleaned, combined data
print("-"*40)
# combined_train_test.info()
#save the cleaned, combined data
print("-"*40)
print('# save to csv...')
combined_train_test.to_csv('3_combined_train_test.csv',index=False)
sklearn分析建模庫
Multiclass classification多類分類器
一個分類任務需要對多於兩個類的數據進行分類。多類分類假設每一個樣本有且僅有一個標籤
Multilabel classification 多標籤分類器
給每一個樣本分配一系列標籤,可以被認爲是預測不相互排斥的數據點的屬性
Multioutput regression 多輸出迴歸
爲每個樣本分配一組目標值,這可以認爲是預測每一個樣本的多個屬性。
示例:船艙等級/性別與生存率的關係
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
#raw data
train_data=pd.read_csv('train.csv')
test_data=pd.read_csv('test.csv')
#---relation between Sex and survival---
# pie chart of the overall sex distribution
train_data['Sex'].value_counts().plot.pie(autopct='%1.2f%%')
# plt.savefig('images/2_FeatureRelationSex1.png')
print('# 性別與是否生存的關係')
print(train_data.groupby(['Sex','Survived'])['Survived'].count())
# survival rate per sex as a bar chart
train_data[['Sex','Survived']].groupby(['Sex']).mean().plot.bar()
# plt.savefig('images/2_FeatureRelationSex.png')
train_data.groupby(['Sex','Survived'])['Survived'].describe().reset_index().to_csv('2_FeatureRelationSex.csv')
plt.show()
#---relation between Pclass and survival---
train_data['Pclass'].value_counts().plot.pie(autopct='%1.2f%%')
# plt.savefig('images/2_FeatureRelationPclass1.png')
print('# 船艙等級和生存與否的關係')
print(train_data.groupby(['Pclass','Survived'])['Pclass'].count())
train_data[['Pclass','Survived']].groupby(['Pclass']).mean().plot.bar()
# plt.savefig('images/2_FeatureRelationPclass.png')
plt.show()
plt.close()
結果:
#性別與是否生存的關係
Sex Survived
female 0 81
1 233
male 0 468
1 109
Name: Survived, dtype: int64
#船艙等級和生存與否的關係
Pclass Survived
1 0 80
1 136
2 0 97
1 87
3 0 372
1 119
Name: Pclass, dtype: int64
示例:sklearn決策樹分析
DecisionTreeClassifier有兩個向量輸入:
X,大小爲[n_sample,n_feature],存放訓練樣本
Y,值爲整形,大小爲[n_sample],存放訓練樣本的分類標籤
tree.data:前面都爲X,最後一列爲Y
import time
from sklearn import metrics #evaluation helpers
from sklearn import tree #decision trees
# sklearn.externals.joblib was removed in scikit-learn 0.23; joblib now ships
# as a standalone package installed alongside scikit-learn.
import joblib #save/restore models
import numpy as np
raw_data="tree.data"
#load the CSV file as a numpy matrix
dataset=np.loadtxt(raw_data,delimiter=',') #comma separated
# separate the data from the target attributes (full arrays, kept for reference)
x=dataset[:,0:8]
y=dataset[:,8]
#training set: first 500 rows
X_train=dataset[0:500,0:8]
y_train=dataset[0:500,8]
#test set: the remaining rows
X_test=dataset[500:,0:8]
y_test=dataset[500:,8]
print('\n調用scikit的tree.DecisionTreeClassifier()')
#a leaf must keep at least 2 samples, so nodes with fewer are not split further
model=tree.DecisionTreeClassifier(min_samples_leaf=2)
start_time=time.time()
model.fit(X_train,y_train)
print("training took %fs!" % (time.time()-start_time))
joblib.dump(value=model,filename='Decisiontree.model') #persist the trained model
expected=y_test #ground truth
predicted=model.predict(X_test) #predictions
print(metrics.confusion_matrix(expected,predicted)) #confusion matrix
print(metrics.classification_report(expected,predicted))
結果:
調用scikit的tree.DecisionTreeClassifier()
training took 0.007018s!
[[149 33]
[ 36 50]]
precision recall f1-score support
0.0 0.81 0.82 0.81 182
1.0 0.60 0.58 0.59 86
accuracy 0.74 268
macro avg 0.70 0.70 0.70 268
weighted avg 0.74 0.74 0.74 268