9.18 Study Notes (Feature Engineering)

import re
# load the regular-expression library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Feature-engineering processing
train_df_org=pd.read_csv('train.csv')
test_df_org=pd.read_csv('test.csv')
test_df_org['Survived']=0
combined_train_test=pd.concat([train_df_org,test_df_org],sort=True)    # DataFrame.append is removed in pandas 2.0; pd.concat gives the same result
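A quick shape check (not part of the original notes) confirms that the 891 training rows and 418 test rows stack into the 1309-row combined frame used below:

# expected: (891, 12) (418, 12) (1309, 12); the test rows keep their own 0-417 index
print(train_df_org.shape, test_df_org.shape, combined_train_test.shape)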

def pclass_fare_category(df,pclass1_mean_fare,pclass2_mean_fare,pclass3_mean_fare):
    if df['Pclass']==1:
        if df['Fare']<=pclass1_mean_fare:
            return 'Pclass1_Low'
        else:
            return 'Pclass1_High'
    elif df['Pclass']==2:
        if df['Fare']<=pclass2_mean_fare:
            return 'Pclass2_Low'
        else:
            return 'Pclass2_High'
    elif df['Pclass']==3:
        if df['Fare']<=pclass3_mean_fare:
            return 'Pclass3_Low'
        else:
            return 'Pclass3_High'

pclass_mean_fare=combined_train_test.groupby('Pclass')['Fare'].mean()
Pclass1_mean_fare=pclass_mean_fare.loc[1]
Pclass2_mean_fare=pclass_mean_fare.loc[2]
Pclass3_mean_fare=pclass_mean_fare.loc[3]
combined_train_test['Pclass_Fare_Category']=combined_train_test.apply(pclass_fare_category,args=(Pclass1_mean_fare,Pclass2_mean_fare,Pclass3_mean_fare),axis=1)

pclass_level=LabelEncoder()    # the LabelEncoder approach
pclass_level.fit(np.array(['Pclass1_Low','Pclass1_High','Pclass2_Low','Pclass2_High','Pclass3_Low','Pclass3_High']))
combined_train_test['Pclass_Fare_Category']=pclass_level.transform(combined_train_test['Pclass_Fare_Category'])
pclass_dummies_df=pd.get_dummies(combined_train_test['Pclass_Fare_Category']).rename(columns=lambda x:'Pclass_'+str(x))
# produces six columns, each taking the value 1 or 0
combined_train_test=pd.concat([combined_train_test,pclass_dummies_df],axis=1)
print(combined_train_test)

Result:
Age Cabin Embarked … Pclass_3 Pclass_4 Pclass_5
0 22.0 NaN S … 0 0 1
1 38.0 C85 C … 0 0 0
2 26.0 NaN S … 0 0 1
3 35.0 C123 S … 0 0 0
4 35.0 NaN S … 0 0 1
5 NaN NaN Q … 0 0 1
… … … … … … … …
413 NaN NaN S … 0 0 1
414 39.0 C105 C … 0 0 0
415 38.5 NaN S … 0 0 1
416 NaN NaN S … 0 0 1
417 NaN NaN C … 0 1 0

[1309 rows x 19 columns]
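The dummy columns Pclass_0 to Pclass_5 follow the integer codes assigned by LabelEncoder, which sorts the labels alphabetically. One way to inspect the mapping (not in the original notes):

# show which integer code each Pclass_Fare_Category label received
print(dict(zip(pclass_level.classes_,pclass_level.transform(pclass_level.classes_))))
# e.g. 'Pclass1_High' -> 0, 'Pclass1_Low' -> 1, ..., 'Pclass3_Low' -> 5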

Parch and SibSp fields
New field: Family_Size
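The mapping below relies on a helper family_size_category that is not defined anywhere in these notes; a minimal sketch with assumed thresholds (1 = Single, 2 to 4 = Small_Family, 5 or more = Large_Family):

def family_size_category(family_size):
    # thresholds are an assumption; the original helper is not shown in these notes
    if family_size<=1:
        return 'Single'
    elif family_size<=4:
        return 'Small_Family'
    else:
        return 'Large_Family'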

# Parch and SibSp field processing
print('# New_Family_Size...')
combined_train_test['Family_Size']=combined_train_test['Parch']+combined_train_test['SibSp']+1
combined_train_test['Family_Size_Category']=combined_train_test['Family_Size'].map(family_size_category)
le_family=LabelEncoder()
le_family.fit(np.array(['Single','Small_Family','Large_Family']))
combined_train_test['Family_Size_Category']=le_family.transform(combined_train_test['Family_Size_Category'])
family_size_dummies_df=pd.get_dummies(combined_train_test['Family_Size_Category'],prefix=combined_train_test[['Family_Size_Category']].columns[0])
combined_train_test=pd.concat([combined_train_test,family_size_dummies_df],axis=1)
print(combined_train_test)

Result:
#New_Family_Size…
Age Cabin … Family_Size_Category_1 Family_Size_Category_2
0 22.0 NaN … 0 1
1 38.0 C85 … 0 1
2 26.0 NaN … 1 0
3 35.0 C123 … 0 1
4 35.0 NaN … 1 0

… … … … … … … …
413 NaN NaN … 1 0
414 39.0 C105 … 1 0
415 38.5 NaN … 1 0
416 NaN NaN … 1 0
417 NaN NaN … 0 1

[1309 rows x 24 columns]

Age field processing
# Missing-value handling: fill missing ages with a random-forest model (RandomForestRegressor)

print('# Missing Value Processing: Age...')
age_df=combined_train_test[['Age','Embarked','Sex','Title','Name_length','Family_Size','Family_Size_Category','Fare','Fare_bin_id','Pclass']]
age_df_notnull=age_df.loc[(combined_train_test['Age'].notnull())]    # rows where Age is known
age_df_isnull=age_df.loc[(combined_train_test['Age'].isnull())]      # rows where Age is missing
X=age_df_notnull.values[:,1:]    # features: every column from the second onward
Y=age_df_notnull.values[:,0]     # target: the first column (Age)
# fit a random forest on the rows with a known Age
RFR=RandomForestRegressor(n_estimators=1000,n_jobs=1)    # 1000 decision trees
RFR.fit(X,Y)
predictAges=RFR.predict(age_df_isnull.values[:,1:])    # predict Age for the rows where it is missing, from the remaining features
combined_train_test.loc[combined_train_test['Age'].isnull(),['Age']]=predictAges
print(age_df_isnull)
print(combined_train_test)

Result:
Age Embarked Sex … Fare Fare_bin_id Pclass
5 NaN 2 0 … 8.458300 2 3
17 NaN 0 0 … 13.000000 3 2
19 NaN 1 1 … 7.225000 4 3
26 NaN 1 0 … 7.225000 4 3
28 NaN 2 1 … 7.879200 0 3
… … … … … … … …
408 NaN 2 1 … 7.720800 0 3
410 NaN 2 1 … 7.750000 0 3
413 NaN 0 0 … 8.050000 2 3
416 NaN 0 0 … 8.050000 2 3
417 NaN 1 0 … 7.452767 0 3

[263 rows x 10 columns]

Age Cabin … Family_Size_Category_1 Family_Size_Category_2
0 22.000000 NaN … 0 1
1 38.000000 C85 … 0 1
2 26.000000 NaN … 1 0
3 35.000000 C123 … 0 1
4 35.000000 NaN … 1 0
… … … … … …
414 39.000000 C105 … 1 0
415 38.500000 NaN … 1 0
416 33.110230 NaN … 1 0
417 8.313700 NaN … 0 1

[1309 rows x 43 columns]
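A quick follow-up check (not in the original notes) confirms that no missing ages remain after the fill:

# every Age should now be filled by the random-forest predictions
print(combined_train_test['Age'].isnull().sum())    # expected: 0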

Ticket field processing

# Ticket field processing
print('# factorize:Ticket...')
combined_train_test['Ticket_Letter']=combined_train_test['Ticket'].str.split().str[0]
combined_train_test['Ticket_Letter']=combined_train_test['Ticket_Letter'].apply(lambda x:'0' if x.isnumeric() else x)
combined_train_test['Ticket_Letter']=pd.factorize(combined_train_test['Ticket_Letter'])[0]
print(combined_train_test['Ticket_Letter'])
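For reference, the two steps above keep the leading ticket prefix (or '0' for purely numeric tickets) and then turn each distinct prefix into an integer code. A small illustration of pd.factorize on a few sample ticket strings:

# illustration: numeric tickets collapse to '0', prefixes get integer codes
sample=pd.Series(['A/5 21171','113803','PC 17599','373450'])
prefixes=sample.str.split().str[0].apply(lambda x:'0' if x.isnumeric() else x)
print(pd.factorize(prefixes)[0])    # -> [0 1 2 1]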

Cabin field processing

# Cabin field processing
print('# Missing Value Processing: Cabin...')
combined_train_test.loc[combined_train_test.Cabin.isnull(),'Cabin']='0'    # fill missing cabins with '0'
combined_train_test['Cabin']=combined_train_test['Cabin'].apply(lambda x:0 if x=='0' else 1)    # 0 if no cabin was recorded, 1 otherwise
print(combined_train_test['Cabin'])

Standardizing the Age, Fare and Name_length fields

# standardize the Age, Fare and Name_length values (zero mean, unit variance)
print('# Normalization:Age&fare...')
scale_age_fare=preprocessing.StandardScaler().fit(combined_train_test[['Age','Fare','Name_length']])
combined_train_test[['Age','Fare','Name_length']]=scale_age_fare.transform(combined_train_test[['Age','Fare','Name_length']])
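A quick check (not in the original notes): after StandardScaler, each of the three columns should have a mean close to 0 and a standard deviation close to 1:

# each scaled column should now have mean ~0 and std ~1
print(combined_train_test[['Age','Fare','Name_length']].describe().loc[['mean','std']])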
