pandas

其它

dir(變量名)# 查詢該變量能使用的函數
pd.set_option('display.max_rows',10)
pd.set_option('display.max_columns',10)
pd.__version__ # 查看包的版本

!type f:\test\News\DataAnalyst.csv # 查看文件內容、格式
!type f:\test\demo.json
!dir # 查看目錄下，文件名稱
list(open('demo.csv')) # 打開查看文件

Series

Series最重要的一個功能是：它在算術運算中會自動對齊不同索引的數據

屬性

.values
.index
.name
.index.name

創建

pd.Series(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False)

s = {'a':'A','c':'d','b':'B'}
pd.Series(s,index=[1,3,'a','c'])
pd.Series(np.random.randint(10,50,5,dtype=int))
pd.Series(['a','ab','v'],index=list('abc'),name='demo')

修改刪除

s1['a']=
s1.iloc[0]=
s1.loc['a']=

s1.append() #返回新的Series
s1['new_index']= # 在s1上修改（添加）

del s1['index']

s1.to_dict() # 轉換爲dict（字典）
pd.isnull(s1)
pd.notnull(s1)

過濾

s1[s1<100]

描述統計

數值型
- 分位數

s1.describe()
s1.describe(percentiles=[0.1,0.25,0.5])# 選擇分位數
s1.quantile(0.5)# 分位數
s1.quantile(0.25)

分類統計

s.value_counts()# 每一類有多少項

```
pd.value_counts(df['col'])
```
```
s.pct_change()
```

DataFrame

```
df = pd.DataFrame()
```
返回df的每一行：
```
for i in df.iterrows():
    print(i)
```

IO

df = pd.read_clipboard()
df.to_clipboard()
df.to_csv('demo.csv',index=False)
df.to_json()
df.to_html('demo.html')
df.to_excel('demo.xlsx')

TimeSeries

pd.date_range(start=None, end=None, periods=None, freq=None, tz=None, normalize=False, name=None, closed=None, **kwargs)

基礎

索引對象

index對象是不可修改的
- ~~df.index[1]=~~
- ```
df.index is index
df.index = pd.Index(np.arange(3))
```
- DatetimeIndex，存儲納秒級時間戳，用numpy的datetime64類型表示
- PeriodIndex，針對period（時間間隔）數據的特殊index

重命名

df.index.map(str.upper)
df.rename(index=str.upper,columns=str.lower)
df.rename(index={'ind1':'new_ind1'},
          columns={'col1':'new_col1','col2':'new_col2'}
         )
df.rename(index=‘自定義函數’)

重新索引reindex

Series 根據新索引重排

se.reindex(['index_1','index_3','index_2'])

如果某個索引值不存在，就引入缺失值

se.reindex(['index_a','index_b','index_c'],fill_value=0)
# 缺失部分填充爲0

插值處理
```
se.reindex(range(6),method='ffill')
```

DataFrame

df.reindex(index=[], #用作索引的新序列
           method = 'bfill', #插值填充方式，插值只能按行應用，即軸0
           columns=[],
           fill_value = 0, #在重新索引的過程中，需要引入缺失值時使用的替代值
           limit = 3, #前向或後向填充時的最大填充量
           level =  ,# 在MultiIndex的指定級別上匹配簡單索引，否則選取其子集
           
          )

df1.reindex(columns=df2.columns,fill_value=0)

多重索引

s = pd.Series(np.random.rand(6),
              index=[['1','1','1','2','2','2'],
                     ['a','b','v','a','b','c']])

s.unstack() # 變爲DataFrame 最內層index變爲columns

丟棄drop（）

```
se.drop('index_n')
```

df.drop(['index_1','index_2'])#刪除行
df.drop(['col1','col2'],axis=1)#刪除列

索引、選取和過濾

se['a']
se[1]
se[2:4] #取值範圍[2:4)
se['a':'c'] #取值範圍[a:c]
se[['a','b','c']]
se[[1,3]]
se[se<2]

# 選取列
df['col1']
df[['col1','col3']]

# 選取行（通過切片或布爾型數組選取行）
df[:2] #前2行
df[df['col2']]

#布爾型DataFrame索引
df[df<3]

索引的重複性

```
se.index.is_unique
```

算數運算

填充值
- 當一個對象中某個軸標籤在另一個對象中找不到時，填充一個特殊值
- ```
df1.add(df2,fill_value=0)
df.reindex(columns=df2.columns,fill_value=0)
```
  - add 加法
    
    sub 減法
    
    div 除法
    
    mul 乘法
廣播運算broadcasting

add	加法
sub	減法
div	除法
mul	乘法

apply

df.apply(lambda x:x.max() - x.min())
df.apply(lambda x:x.max() - x.min(), axis =1) 
#axis : {0 or 'index', 1 or 'columns'}

```
df.apply(pd.value_counts).fillna(0)
```

applymap元素級應用

```
df.applymap(lambda x:'%.2f' % x)
```

排序

sort_index()
sort_values()

pd.value_counts(se.values,
                sort=True,#按值排序
                ascending=True,#按頻率計數，升序排
                bins=3, #pd.cut()分箱，只適用於數字數據
                dropna=True #不包括NaN的數量
               )

排名

se.rank(method='first')#按出現順序排名
se.rank(method='average')#分配平均排名 
se.rank(ascending=False,method='max')#使用整組的最大排名

df.rank(axis=1)
# axis : {0 or 'index', 1 or 'columns'}

描述統計

df.describe()
df.describe(percentiles=[0.5,0.75])
df.quantile([0.5,0.75])

df.mean(skipna=False)# skipna：是否排除缺失值，默認爲True
df.count()#非NaN的值個數
df.idxmax(),df.idxmin()#獲取極值的索引值
se.argmin(),se.argmax()#獲取極值的索引位置（整數），僅Series可用
df.sum()
df.mean()
df.median()
df.mad()# 根據平均值計算平均絕對離差
df.var()# 方差
df.std()# 標準差

df.skew()#偏度，三階矩
df.kurt()#峯度，四階矩
df.cumsum()#累加
df.cummin(),df.cummax()#累計極值
df.cumprod()#累積
df.diff()#一階差分（時間序列中）
df.pct_change()# 計算百分數變化

去重

se.unique().value_counts(ascending=)#去重+排序

缺失值處理

診斷

df.isnull().values.any()
df.isnull().any()
df['col'].isnull()
df['col'].isnull().values.any()

統計

df.isnull().sum()
df.isnull().sum().sum()

取捨

df.dropna()
df.dropna(how='any')
df.dropna(thresh=2)#只保留至少有2個非缺失值的行

df.dropna(how='all',axis=1)#捨棄整列爲NaN的列

填充

method

df.fillna(method='ffill')
df.fillna(method='bfill',limit=3)

填充統計數據

df['col'].fillna(df['col'].mean())
df['col'].fillna(df.groupby('')[].transform('mean'))
df.fillna({'col1':1,'col3':3})

內插補齊法
```
df.interpolate()
```
計算缺失值佔比

df.isnull().sum() / len(df)

排序-去重

先排序
```
df.sort_values(by='col_1')
```
- 因爲排序會將NaN放在最後，確保之後的去重操作不會將其他列爲NaN的值保留
再去重
```
df.drop_duplicates(subset=['col_2','col_3','col_5'])
```
1. 因爲先做了col_1的排序，所以去重後的保留行中，col_1列的元素不會爲NaN
2. DataFrame.drop_duplicates()的用法
  - 去重DataFrame.drop_duplicates(subset=None, keep='first', inplace=False)
    - subset：對應的值是列名，將這些列對應值相等的行進行去重，默認所有列subset=['col1','col2']
    - keep：刪除重複項並保留第一次出現的項'first','last',False，False：一行都不留
    - inplace：是否直接在原數據上修改

map replace

df1 = pd.DataFrame({'城市':['BJ','SH','GZ'],
                    '人口':[1000,2000,1500]
                   },
                   index=list('abc')
                  )

gdp_map={'BJ':1000,
         'SH':2000,
         'GZ':1500
        }

df1['GDP'] = df1['城市'].map(gdp_map)

根據字典給DataFrame添加新列

df['new_col']=df['字典提到的col'].map(字典)

用Series構建新column（推薦用map）

df['new_col'] = pd.Series([],index=)
# 需注意Series中值的順序，index要與df的一一對應

s.replace('要替換的值','替換爲')
s.replace(1,np.nan)
s.replace([1,2,3],[11,22,33])

數據抽樣sampling

.sample()
- 隨機取樣
  - DataFrame.sample(n=None, frac=None, replace=False, weights=None, random_state=None, axis=None)
```
s.sample(n = 3)# 選3個隨機元素
df.sample(frac = 0.1) # 隨機取10%的元素
df.sample(n = 3)# 隨機取3行
```
  - 時間採樣Series.resample()【詳解筆記時間序列部分】

map

```
df.index.map(str.upper) # 不替換
```

list1=[1,2,3,4]
[str(x) for x in list1]
list(map(str,list1))

def test_map(x):
    return x+'_ABC'

df.index.map(test_map)
df.rename(index=test_map)

關聯

pd.merge

pd.merge
```
pd.merge(left,right,on=,how='inner')
```

pd.merge(left,right,left_on=,right_on=,suffixes=('_x', '_y'))

pd.concat

Series

pd.concat([s1,s2]) # 上下堆疊
pd.concat([s1,s2],axis=1)# 引入column的計算，變爲DataFrame

DataFrame
```
pd.concat([df1,df2]) # 上下堆疊
```

np.concatenate()

上下堆砌
```
np.concatenate([arr1,arr2])
```
左右連接
```
np.concatenate([arr1,arr2],axis=1)
```

combine

s1.combine_first(s2) # s2填充s1的NaN

df1.combine_first(df2)

apply預處理

df['col1'] = df['col1'].apply(str.upper)

def foo(line):
    a = line.strip().split(' ')
    return pd.Series([a[1],a[3],a[5]])

df_new = df['col'].apply(foo)
df_new.rename(columns={0:'A',1:'B',2:"C"})

df.combine_first(df_new)

分箱

pd.cut()

pd.cut(x, bins,
       right=True,
       labels=None,
       retbins=False,
       precision=3,
       include_lowest=False,
       duplicates='raise')

分組

.groupby

df.groupby()
df.groupby().mean()
list(df.groupby())
dict(list(df.groupby()))['']

for a, b in df.groupby():
    print(a),print(b)

df.groupby().groups
df.groupby().get_group('')
df.groupby().get_group('').mean()

gs = df.groupby(['col1','col3'])
for (name1,name2), b in gs:
    print(name1)
    print(b)

透視表

pd.pivot_table(df,index=,
               columns=,
               values=,
               aggfunc=
              )

By儒冠多誤身 2019/04/21

【Python數據處理專題】-pandas庫

文章目錄

其它

Series

屬性

創建

修改刪除

過濾

描述統計

DataFrame

IO

TimeSeries

基礎

索引對象

重新索引reindex

多重索引

丟棄drop（）

索引、選取和過濾

索引的重複性

算數運算

apply

applymap元素級應用

排序

排名

描述統計

相關性，協方差

去重

缺失值處理

排序-去重

map replace

數據抽樣sampling

map

關聯

pd.merge

pd.concat

np.concatenate()

combine

apply預處理

分箱

分組

透視表