9.17學習筆記(重複值處理、數據清洗)

pandas的duplicated()判斷重複值記錄
pandas的drop_duplicates()刪除重複的數據記錄,可指定依據特定列或全部列來判斷重複
numpy中unique()返回所有不同的值,且按照從小到大的順序
set(),python自帶內置函數,也能返回唯一元素的集合

示例:重複值處理

import pandas as pd
# Four sample rows: the first two are identical, and so are the last two.
data1, data2 = ['a', 1], ['a', 1]
data3, data4 = ['b', 2], ['b', 2]
rows = [data1, data2, data3, data4]
data = pd.DataFrame(rows, columns=['col1', 'col2'])
print(data)

# Detect duplicates: each row is flagged True when an identical row
# appeared earlier in the frame.
isduplicated = data.duplicated()
print(isduplicated)

# Remove duplicates, comparing on all columns, on 'col1' only, and on
# both columns named explicitly.
new_1 = data.drop_duplicates()
new_2 = data.drop_duplicates(subset=['col1'])
new_3 = data.drop_duplicates(subset=['col1', 'col2'])
for deduplicated in (new_1, new_2, new_3):
    print(deduplicated)

結果:
col1 col2
0 a 1
1 a 1
2 b 2
3 b 2
0 False
1 True
2 False
3 True
dtype: bool
col1 col2
0 a 1
2 b 2
col1 col2
0 a 1
2 b 2
col1 col2
0 a 1
2 b 2

示例:數據清洗

import re
#加載正則表達式庫
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

#特徵工程處理
# Load the two CSV files from the current working directory.
train_df_org = pd.read_csv('train.csv')
test_df_org = pd.read_csv('test.csv')
# Give every test row a placeholder 'Survived' value of 0
# (presumably so both frames share the same columns -- confirm).
test_df_org['Survived'] = 0
# Stack the two frames row-wise with columns sorted by name.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, like append, keeps each frame's original row index.
combined_train_test = pd.concat([train_df_org, test_df_org], sort=True)

#---Pclass字段---建立Pclass Fare Category
def pclass_fare_category(df,pclass1_mean_fare,pclass2_mean_fare,pclass3_mean_fare):
    """Label one record by its passenger class and fare level.

    Args:
        df: A single record supporting ``df['Pclass']`` and ``df['Fare']``
            lookups (for example a DataFrame row or a dict).
        pclass1_mean_fare: Fare threshold used when ``Pclass`` equals 1.
        pclass2_mean_fare: Fare threshold used when ``Pclass`` equals 2.
        pclass3_mean_fare: Fare threshold used when ``Pclass`` equals 3.

    Returns:
        ``'Pclass<k>_Low'`` when the fare is less than or equal to the
        threshold for class ``k``, ``'Pclass<k>_High'`` otherwise, and
        ``None`` when ``Pclass`` is not 1, 2 or 3.
    """
    # One entry per class: (class value, threshold, low label, high label).
    categories = (
        (1, pclass1_mean_fare, 'Pclass1_Low', 'Pclass1_High'),
        (2, pclass2_mean_fare, 'Pclass2_Low', 'Pclass2_High'),
        (3, pclass3_mean_fare, 'Pclass3_Low', 'Pclass3_High'),
    )
    for pclass, mean_fare, low_label, high_label in categories:
        if df['Pclass'] == pclass:
            # 'Fare' is only read once the class has matched.
            return low_label if df['Fare'] <= mean_fare else high_label
    return None

# Mean fare per passenger class, computed once and indexed by Pclass value.
mean_fare_by_pclass = combined_train_test.groupby('Pclass')['Fare'].mean()
Pclass1_mean_fare = mean_fare_by_pclass.loc[1]  # mean fare of the Pclass == 1 rows
Pclass2_mean_fare = mean_fare_by_pclass.loc[2]
Pclass3_mean_fare = mean_fare_by_pclass.loc[3]

# Label every row (axis=1) with its class/fare-level category, using the
# per-class mean fares as the Low/High thresholds.
combined_train_test['Pclass_Fare_Category'] = combined_train_test.apply(
    pclass_fare_category,
    args=(Pclass1_mean_fare, Pclass2_mean_fare, Pclass3_mean_fare),
    axis=1,
)
print('# Pclass_Fare_Category...')
# Row count for each (category, Survived) combination.
print(combined_train_test.groupby(['Pclass_Fare_Category','Survived'])['Survived'].count())

結果:
# Pclass_Fare_Category...
Pclass_Fare_Category Survived
Pclass1_High 0 49
1 48
Pclass1_Low 0 138
1 88
Pclass2_High 0 68
1 43
Pclass2_Low 0 122
1 44
Pclass3_High 0 174
1 42
Pclass3_Low 0 416
1 77
Name: Survived, dtype: int64

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章