import pandas as pd
# Example frame for the duplicate-handling demos below: the last three rows
# are identical ("two", 4), and both k1 labels repeat several times.
df = pd.DataFrame(
    {
        "k1": ["one", "two"] * 3 + ["two"] * 2,
        "k2": [1, 1, 2, 3, 3, 4, 4, 4],
    }
)
df
|   | k1  | k2 |
|---|-----|----|
| 0 | one | 1 |
| 1 | two | 1 |
| 2 | one | 2 |
| 3 | two | 3 |
| 4 | one | 3 |
| 5 | two | 4 |
| 6 | two | 4 |
| 7 | two | 4 |
查找重複數據
# Flag every row that repeats an earlier row across all columns; the first
# occurrence stays False, so only rows 6 and 7 come back True here.
df.duplicated()
0 False
1 False
2 False
3 False
4 False
5 False
6 True
7 True
dtype: bool
# keep=False flags every member of a duplicated group, including its first
# occurrence, so rows 5, 6 and 7 are all True here.
df.duplicated(keep=False)
0 False
1 False
2 False
3 False
4 False
5 True
6 True
7 True
dtype: bool
# Compare on column 'k1' only: each label is False at its first appearance
# (rows 0 and 1) and True on every later repeat.
df.duplicated(subset='k1')
0 False
1 False
2 True
3 True
4 True
5 True
6 True
7 True
dtype: bool
刪除重複數據
# Show the frame without full-row repeats: the first occurrence of each row is
# kept, so rows 6 and 7 are dropped here.
df.drop_duplicates()
|   | k1  | k2 |
|---|-----|----|
| 0 | one | 1 |
| 1 | two | 1 |
| 2 | one | 2 |
| 3 | two | 3 |
| 4 | one | 3 |
| 5 | two | 4 |
# Deduplicate on column 'k1' alone, naming the column selection explicitly
# via the `subset` keyword.
df.drop_duplicates(subset='k1')