高階函數處理

import pandas as pd
import numpy as np
import os

os.getcwd()

'D:\\Jupyter\\notebook\\Python數據清洗實戰\\數據清洗之數據轉換'

os.chdir('D:\\Jupyter\\notebook\\Python數據清洗實戰\\數據')

# Read every column as str so ids, dates and gender codes keep their exact
# text (no numeric conversion), which the string handling below relies on.
df = pd.read_csv('sam_tianchi_mum_baby.csv', dtype=str, encoding='utf-8')

df.head(5)

def f(x):
    """Translate a gender code into its Chinese display label.

    Args:
        x: Gender code; ``0`` maps to female and ``1`` to male. Strings,
            ints and integral floats (e.g. ``1.0``) are accepted.

    Returns:
        '女' for code 0, '男' for code 1, and '未知' for anything else,
        including missing values.
    """
    # Integral floats stringify as '1.0'; collapse them to ints so they
    # match the plain codes below.
    if isinstance(x, float) and x.is_integer():
        x = int(x)
    # Compare the whole code instead of testing for a substring, so values
    # such as '10' or '1.0' are not mistaken for code 0.
    return {'0': '女', '1': '男'}.get(str(x).strip(), '未知')

# apply calls an arbitrary Python function on each value, so it can do far
# more than a simple lookup; here it labels each gender code using f.
df['性別'] = df['gender'].apply(f)

df.head(5)

	user_id	birthday	gender	性別
0	2757	20130311	1	男
1	415971	20121111	0	女
2	1372572	20120130	1	男
3	10339332	20110910	0	女
4	10642245	20130213	0	女

# Inspect the rows whose gender code is '2' (the code labelled as unknown).
df[df['gender'] == '2'].head(5)

	user_id	birthday	gender	性別
46	49167150	20130818	2	未知
47	49983255	20140206	2	未知
51	52529655	20130611	2	未知
58	57711375	20130420	2	未知
106	99665637	20130926	2	未知

del df['性別']

# map is mainly used for lookups: each gender code is replaced by its label
# from the dict (codes missing from the dict would become NaN).
df['性別'] = df['gender'].map({'0': '女性', '1':'男性', '2': '未知'})

df.head(5)

	user_id	birthday	gender	性別
0	2757	20130311	1	男性
1	415971	20121111	0	女性
2	1372572	20120130	1	男性
3	10339332	20110910	0	女性
4	10642245	20130213	0	女性

del df['性別']

# map also accepts a user-defined function instead of a dict.
df['性別'] = df['gender'].map(f)

df.head(5)

	user_id	birthday	gender	性別
0	2757	20130311	1	男
1	415971	20121111	0	女
2	1372572	20120130	1	男
3	10339332	20110910	0	女
4	10642245	20130213	0	女

# Data masking (de-identification)
# A lambda works here. The masked id is rebuilt by position (first character,
# '**', then everything from index 3 on) rather than with str.replace, which
# would also mask any later repeat of the same two characters and would insert
# '**' around every character when the id is a single character.
df['user_id'].apply(lambda x: str(x)[:1] + '**' + str(x)[3:]).head(5)

0        2**7
1      4**971
2     1**2572
3    1**39332
4    1**42245
Name: user_id, dtype: object

# Keep only the first four characters of birthday (the year part of the
# YYYYMMDD-style values shown above), hiding the month and day.
df['birthday'].apply(lambda x: x[0:4]).head(5)

0    2013
1    2012
2    2012
3    2011
4    2013
Name: birthday, dtype: object

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

數據清洗之高階函數處理