pandas-study

quickstart

pandas is a Python library for statistics and data analysis.

Object Creation

import numpy as np
import pandas as pd
s=pd.Series([1,3,5,np.nan,6,8])
s
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
type(s)
pandas.core.series.Series
dates=pd.date_range('20130101', periods=6)
dates
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
df=pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df
A B C D
2013-01-01 -0.690574 0.801871 -0.557254 -0.824757
2013-01-02 -0.467807 -0.105658 -2.581402 -1.116137
2013-01-03 0.007332 0.683399 -1.934149 1.210484
2013-01-04 1.077696 -1.095942 -1.276125 -0.303867
2013-01-05 -1.721047 -0.197204 1.910654 -0.606741
2013-01-06 0.914047 1.832835 -1.114695 -0.594739
df2=pd.DataFrame({'A':1., 'B':pd.Timestamp('20130102'), 
                  'C':pd.Series(1, index=list(range(4)), dtype='float32'),
                 'D':np.array([3]*4, dtype='int32'),
                 'E':pd.Categorical(["test", "train", "test", "train"]),
                 'F':'foo'})
df2
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
[3]*4
[3, 3, 3, 3]
df2.dtypes
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

Viewing Data

df2.B
0   2013-01-02
1   2013-01-02
2   2013-01-02
3   2013-01-02
Name: B, dtype: datetime64[ns]
df.head()
A B C D
2013-01-01 -0.690574 0.801871 -0.557254 -0.824757
2013-01-02 -0.467807 -0.105658 -2.581402 -1.116137
2013-01-03 0.007332 0.683399 -1.934149 1.210484
2013-01-04 1.077696 -1.095942 -1.276125 -0.303867
2013-01-05 -1.721047 -0.197204 1.910654 -0.606741
df.tail(2)
A B C D
2013-01-05 -1.721047 -0.197204 1.910654 -0.606741
2013-01-06 0.914047 1.832835 -1.114695 -0.594739
print(df.index)
print(df.columns)
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
Index(['A', 'B', 'C', 'D'], dtype='object')
df.to_numpy()
---------------------------------------------------------------------------

AttributeError                            Traceback (most recent call last)

<ipython-input-16-d1ec329ba876> in <module>
----> 1 df.to_numpy()


D:\Programs\anaconda3\lib\site-packages\pandas\core\generic.py in __getattr__(self, name)
   4374             if self._info_axis._can_hold_identifiers_and_holds_name(name):
   4375                 return self[name]
-> 4376             return object.__getattribute__(self, name)
   4377 
   4378     def __setattr__(self, name, value):


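AttributeError: 'DataFrame' object has no attribute 'to_numpy'

Note: DataFrame.to_numpy() was added in pandas 0.24.0; the pandas version installed here is older, so the call fails. On older versions, use df.values instead.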
df.describe()
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean -0.146726 0.319884 -0.925495 -0.372626
std 1.051240 1.010594 1.555474 0.821260
min -1.721047 -1.095942 -2.581402 -1.116137
25% -0.634882 -0.174318 -1.769643 -0.770253
50% -0.230237 0.288871 -1.195410 -0.600740
75% 0.687368 0.772253 -0.696614 -0.376585
max 1.077696 1.832835 1.910654 1.210484
df.T
2013-01-01 00:00:00 2013-01-02 00:00:00 2013-01-03 00:00:00 2013-01-04 00:00:00 2013-01-05 00:00:00 2013-01-06 00:00:00
A -0.690574 -0.467807 0.007332 1.077696 -1.721047 0.914047
B 0.801871 -0.105658 0.683399 -1.095942 -0.197204 1.832835
C -0.557254 -2.581402 -1.934149 -1.276125 1.910654 -1.114695
D -0.824757 -1.116137 1.210484 -0.303867 -0.606741 -0.594739
df.sort_index(axis=0, ascending=False)
A B C D
2013-01-06 0.914047 1.832835 -1.114695 -0.594739
2013-01-05 -1.721047 -0.197204 1.910654 -0.606741
2013-01-04 1.077696 -1.095942 -1.276125 -0.303867
2013-01-03 0.007332 0.683399 -1.934149 1.210484
2013-01-02 -0.467807 -0.105658 -2.581402 -1.116137
2013-01-01 -0.690574 0.801871 -0.557254 -0.824757
df.sort_index(axis=1, ascending=False)
D C B A
2013-01-01 -0.824757 -0.557254 0.801871 -0.690574
2013-01-02 -1.116137 -2.581402 -0.105658 -0.467807
2013-01-03 1.210484 -1.934149 0.683399 0.007332
2013-01-04 -0.303867 -1.276125 -1.095942 1.077696
2013-01-05 -0.606741 1.910654 -0.197204 -1.721047
2013-01-06 -0.594739 -1.114695 1.832835 0.914047
df.sort_values(by='B')
A B C D
2013-01-04 1.077696 -1.095942 -1.276125 -0.303867
2013-01-05 -1.721047 -0.197204 1.910654 -0.606741
2013-01-02 -0.467807 -0.105658 -2.581402 -1.116137
2013-01-03 0.007332 0.683399 -1.934149 1.210484
2013-01-01 -0.690574 0.801871 -0.557254 -0.824757
2013-01-06 0.914047 1.832835 -1.114695 -0.594739

Selection

Selection by Label

df['A']
2013-01-01   -0.690574
2013-01-02   -0.467807
2013-01-03    0.007332
2013-01-04    1.077696
2013-01-05   -1.721047
2013-01-06    0.914047
Freq: D, Name: A, dtype: float64
df
A B C D
2013-01-01 -0.690574 0.801871 -0.557254 -0.824757
2013-01-02 -0.467807 -0.105658 -2.581402 -1.116137
2013-01-03 0.007332 0.683399 -1.934149 1.210484
2013-01-04 1.077696 -1.095942 -1.276125 -0.303867
2013-01-05 -1.721047 -0.197204 1.910654 -0.606741
2013-01-06 0.914047 1.832835 -1.114695 -0.594739
df[0:3]
A B C D
2013-01-01 -0.690574 0.801871 -0.557254 -0.824757
2013-01-02 -0.467807 -0.105658 -2.581402 -1.116137
2013-01-03 0.007332 0.683399 -1.934149 1.210484
df['20130102':'20130104']
A B C D
2013-01-02 -0.467807 -0.105658 -2.581402 -1.116137
2013-01-03 0.007332 0.683399 -1.934149 1.210484
2013-01-04 1.077696 -1.095942 -1.276125 -0.303867
df.loc[dates[0]]
A   -0.690574
B    0.801871
C   -0.557254
D   -0.824757
Name: 2013-01-01 00:00:00, dtype: float64
df.loc[:, ['A', 'B']]
A B
2013-01-01 -0.690574 0.801871
2013-01-02 -0.467807 -0.105658
2013-01-03 0.007332 0.683399
2013-01-04 1.077696 -1.095942
2013-01-05 -1.721047 -0.197204
2013-01-06 0.914047 1.832835
df.loc['20130102':'20130104', ['A', 'B']]
A B
2013-01-02 -0.467807 -0.105658
2013-01-03 0.007332 0.683399
2013-01-04 1.077696 -1.095942
df.loc['20130102':'20130104']
A B C D
2013-01-02 -0.467807 -0.105658 -2.581402 -1.116137
2013-01-03 0.007332 0.683399 -1.934149 1.210484
2013-01-04 1.077696 -1.095942 -1.276125 -0.303867
df.loc['20130102':'20130103'] # endpoints included
A B C D
2013-01-02 -0.467807 -0.105658 -2.581402 -1.116137
2013-01-03 0.007332 0.683399 -1.934149 1.210484
df.loc['20130102':'20130102']
A B C D
2013-01-02 -0.467807 -0.105658 -2.581402 -1.116137
df.loc['20130102', ['A', 'B']]
A   -0.467807
B   -0.105658
Name: 2013-01-02 00:00:00, dtype: float64
df.loc['20130102',  'B'] # getting scalar value
-0.1056575005716924
df.at[dates[0],'B']
0.8018707533374761
dates[0]
Timestamp('2013-01-01 00:00:00', freq='D')

Selection by Position

df.iloc[3]
A    1.077696
B   -1.095942
C   -1.276125
D   -0.303867
Name: 2013-01-04 00:00:00, dtype: float64
df.iloc[3:5]
A B C D
2013-01-04 1.077696 -1.095942 -1.276125 -0.303867
2013-01-05 -1.721047 -0.197204 1.910654 -0.606741
df.iloc[3:5, 0:2]
A B
2013-01-04 1.077696 -1.095942
2013-01-05 -1.721047 -0.197204
df.iloc[3:5, :]
A B C D
2013-01-04 1.077696 -1.095942 -1.276125 -0.303867
2013-01-05 -1.721047 -0.197204 1.910654 -0.606741
df.iloc[:, 0:2]
A B
2013-01-01 -0.690574 0.801871
2013-01-02 -0.467807 -0.105658
2013-01-03 0.007332 0.683399
2013-01-04 1.077696 -1.095942
2013-01-05 -1.721047 -0.197204
2013-01-06 0.914047 1.832835
df.iloc[3, 0:2]
A    1.077696
B   -1.095942
Name: 2013-01-04 00:00:00, dtype: float64
df.iloc[1, 1]
-0.1056575005716924
df.iat[1, 1]
-0.1056575005716924

Selection by Boolean Indexing

df[df.A>0]
A B C D
2013-01-03 0.007332 0.683399 -1.934149 1.210484
2013-01-04 1.077696 -1.095942 -1.276125 -0.303867
2013-01-06 0.914047 1.832835 -1.114695 -0.594739
df[df>0]
A B C D
2013-01-01 NaN 0.801871 NaN NaN
2013-01-02 NaN NaN NaN NaN
2013-01-03 0.007332 0.683399 NaN 1.210484
2013-01-04 1.077696 NaN NaN NaN
2013-01-05 NaN NaN 1.910654 NaN
2013-01-06 0.914047 1.832835 NaN NaN
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three'] #setting value
df2
A B C D E
2013-01-01 -0.690574 0.801871 -0.557254 -0.824757 one
2013-01-02 -0.467807 -0.105658 -2.581402 -1.116137 one
2013-01-03 0.007332 0.683399 -1.934149 1.210484 two
2013-01-04 1.077696 -1.095942 -1.276125 -0.303867 three
2013-01-05 -1.721047 -0.197204 1.910654 -0.606741 four
2013-01-06 0.914047 1.832835 -1.114695 -0.594739 three
df2[df2['E'].isin(['two', 'four'])]
A B C D E
2013-01-03 0.007332 0.683399 -1.934149 1.210484 two
2013-01-05 -1.721047 -0.197204 1.910654 -0.606741 four

Setting Value

s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130102', periods=6))
s1
2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64
df['F']=s1
df
A B C D F
2013-01-01 -0.690574 0.801871 -0.557254 -0.824757 NaN
2013-01-02 -0.467807 -0.105658 -2.581402 -1.116137 1.0
2013-01-03 0.007332 0.683399 -1.934149 1.210484 2.0
2013-01-04 1.077696 -1.095942 -1.276125 -0.303867 3.0
2013-01-05 -1.721047 -0.197204 1.910654 -0.606741 4.0
2013-01-06 0.914047 1.832835 -1.114695 -0.594739 5.0
df.at[dates[0], 'A'] = 0
df.at[dates[0], 'A']
0.0
df.iat[0, 1]=0
df.loc[:, 'D'] = np.array([5]*len(df))
df
A B C D F
2013-01-01 0.000000 0.000000 -0.557254 5 NaN
2013-01-02 -0.467807 -0.105658 -2.581402 5 1.0
2013-01-03 0.007332 0.683399 -1.934149 5 2.0
2013-01-04 1.077696 -1.095942 -1.276125 5 3.0
2013-01-05 -1.721047 -0.197204 1.910654 5 4.0
2013-01-06 0.914047 1.832835 -1.114695 5 5.0
df2 = df.copy()
df2[df2>0] = -df2
df2
A B C D F
2013-01-01 0.000000 0.000000 -0.557254 -5 NaN
2013-01-02 -0.467807 -0.105658 -2.581402 -5 -1.0
2013-01-03 -0.007332 -0.683399 -1.934149 -5 -2.0
2013-01-04 -1.077696 -1.095942 -1.276125 -5 -3.0
2013-01-05 -1.721047 -0.197204 -1.910654 -5 -4.0
2013-01-06 -0.914047 -1.832835 -1.114695 -5 -5.0

Missing Data

df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[0]:dates[1], 'E'] = 1
df1 # np.nan represents missing data
A B C D F E
2013-01-01 0.000000 0.000000 -0.557254 5 NaN 1.0
2013-01-02 -0.467807 -0.105658 -2.581402 5 1.0 1.0
2013-01-03 0.007332 0.683399 -1.934149 5 2.0 NaN
2013-01-04 1.077696 -1.095942 -1.276125 5 3.0 NaN
df1.dropna(how='any')
A B C D F E
2013-01-02 -0.467807 -0.105658 -2.581402 5 1.0 1.0
df1.fillna(value=5)
A B C D F E
2013-01-01 0.000000 0.000000 -0.557254 5 5.0 1.0
2013-01-02 -0.467807 -0.105658 -2.581402 5 1.0 1.0
2013-01-03 0.007332 0.683399 -1.934149 5 2.0 5.0
2013-01-04 1.077696 -1.095942 -1.276125 5 3.0 5.0
pd.isna(df1)
A B C D F E
2013-01-01 False False False False True False
2013-01-02 False False False False False False
2013-01-03 False False False False False True
2013-01-04 False False False False False True

Operations

  • Stats
  • Apply Functions
  • Histogramming
  • String Methods

Merge

  • Concat
  • Join
  • Append

Grouping

Reshaping

  • Stack
  • Pivot Tables

Time Series

Categoricals

Data In/Out

  • CSV
  • HDF5
  • Excel

Plotting

 ts = pd.Series(np.random.randn(1000),index=pd.date_range('1/1/2000', periods=1000))
ts
2000-01-01    0.249514
2000-01-02    0.779492
2000-01-03    2.535827
2000-01-04    1.034724
2000-01-05   -2.398410
2000-01-06    0.203969
2000-01-07   -1.632751
2000-01-08    1.385398
2000-01-09   -0.695489
2000-01-10   -1.497335
2000-01-11   -1.062965
2000-01-12   -0.672158
2000-01-13   -0.361664
2000-01-14    1.177963
2000-01-15   -0.210622
2000-01-16   -1.963907
2000-01-17    0.576599
2000-01-18   -1.628227
2000-01-19    0.210709
2000-01-20   -0.722918
2000-01-21    2.112262
2000-01-22   -0.090371
2000-01-23    0.991765
2000-01-24   -0.081367
2000-01-25    0.778599
2000-01-26    0.794808
2000-01-27    0.203199
2000-01-28    0.549142
2000-01-29   -1.042209
2000-01-30   -0.488402
                ...   
2002-08-28   -0.956878
2002-08-29   -1.513276
2002-08-30   -0.015315
2002-08-31   -0.450373
2002-09-01   -1.302343
2002-09-02   -0.975905
2002-09-03   -1.167191
2002-09-04    0.264979
2002-09-05   -0.825148
2002-09-06    0.446931
2002-09-07   -1.151106
2002-09-08   -1.553937
2002-09-09   -0.433001
2002-09-10   -0.082801
2002-09-11    1.041824
2002-09-12   -0.192759
2002-09-13   -0.033006
2002-09-14   -0.241379
2002-09-15    0.219681
2002-09-16   -0.534304
2002-09-17   -1.836852
2002-09-18    0.782766
2002-09-19   -1.221114
2002-09-20   -1.021442
2002-09-21    0.732057
2002-09-22    1.461166
2002-09-23   -0.014286
2002-09-24    0.689332
2002-09-25    0.557755
2002-09-26    0.301062
Freq: D, Length: 1000, dtype: float64
ts.cumsum()
2000-01-01     0.249514
2000-01-02     1.029007
2000-01-03     3.564834
2000-01-04     4.599558
2000-01-05     2.201148
2000-01-06     2.405117
2000-01-07     0.772366
2000-01-08     2.157764
2000-01-09     1.462275
2000-01-10    -0.035060
2000-01-11    -1.098025
2000-01-12    -1.770183
2000-01-13    -2.131847
2000-01-14    -0.953884
2000-01-15    -1.164507
2000-01-16    -3.128414
2000-01-17    -2.551815
2000-01-18    -4.180042
2000-01-19    -3.969332
2000-01-20    -4.692251
2000-01-21    -2.579988
2000-01-22    -2.670360
2000-01-23    -1.678595
2000-01-24    -1.759962
2000-01-25    -0.981362
2000-01-26    -0.186554
2000-01-27     0.016645
2000-01-28     0.565787
2000-01-29    -0.476423
2000-01-30    -0.964825
                ...    
2002-08-28    27.620051
2002-08-29    26.106774
2002-08-30    26.091460
2002-08-31    25.641087
2002-09-01    24.338744
2002-09-02    23.362839
2002-09-03    22.195648
2002-09-04    22.460626
2002-09-05    21.635478
2002-09-06    22.082409
2002-09-07    20.931304
2002-09-08    19.377367
2002-09-09    18.944366
2002-09-10    18.861565
2002-09-11    19.903388
2002-09-12    19.710629
2002-09-13    19.677623
2002-09-14    19.436245
2002-09-15    19.655925
2002-09-16    19.121621
2002-09-17    17.284769
2002-09-18    18.067535
2002-09-19    16.846421
2002-09-20    15.824979
2002-09-21    16.557036
2002-09-22    18.018202
2002-09-23    18.003916
2002-09-24    18.693247
2002-09-25    19.251003
2002-09-26    19.552065
Freq: D, Length: 1000, dtype: float64
ts.cumsum().plot()
<matplotlib.axes._subplots.AxesSubplot at 0xc3601d0>

png

output_75_1.png

df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
                 columns=['A', 'B', 'C', 'D'])
df=df.cumsum()
df
A B C D
2000-01-01 0.068281 -0.132664 0.070582 0.493786
2000-01-02 -0.143809 0.982013 0.674319 0.840579
2000-01-03 -1.632460 1.969912 -0.610960 1.083595
2000-01-04 -1.809409 2.393567 -0.644043 0.776470
2000-01-05 -1.410989 1.926901 -0.374365 -0.349545
2000-01-06 -0.571236 0.510310 0.263010 -1.158115
2000-01-07 -0.949301 -0.332644 1.274226 -1.594909
2000-01-08 -1.464345 -0.301202 1.385892 -0.874542
2000-01-09 -1.541978 -0.760144 1.082006 -0.964494
2000-01-10 -1.688916 -0.011463 0.586871 -0.772981
2000-01-11 -2.738662 -1.064460 0.205806 -1.624307
2000-01-12 -1.816450 -0.919008 3.146092 -3.034167
2000-01-13 -0.369013 -0.792585 4.034958 -3.381528
2000-01-14 -1.030219 -0.882393 3.360184 -4.622791
2000-01-15 -0.267468 -1.644123 4.425037 -3.199782
2000-01-16 -1.274040 -1.317257 3.946021 -1.854756
2000-01-17 -1.633838 -1.380520 4.982910 -2.332076
2000-01-18 -1.740065 -1.695852 5.197173 -0.738558
2000-01-19 -1.435079 0.092813 3.733614 -1.590195
2000-01-20 -2.276656 0.341896 4.010572 -1.492866
2000-01-21 -3.193353 0.695627 3.131444 -0.234961
2000-01-22 -2.720893 2.388687 3.521364 -0.156640
2000-01-23 -4.709745 2.472230 2.789145 -1.229235
2000-01-24 -5.011993 2.665329 2.248143 -0.773138
2000-01-25 -8.642091 2.537319 3.510750 0.303818
2000-01-26 -9.362809 4.882305 3.944840 -0.177612
2000-01-27 -9.162106 4.018060 2.944868 -1.076716
2000-01-28 -8.882097 4.458919 2.539262 -2.143664
2000-01-29 -8.311994 2.264650 1.962080 -2.440440
2000-01-30 -6.661903 1.881069 1.626243 -2.246792
... ... ... ... ...
2002-08-28 -50.814278 10.453969 2.536033 21.415121
2002-08-29 -51.896724 11.679757 3.722690 22.311230
2002-08-30 -51.168872 10.544244 4.704257 22.395454
2002-08-31 -50.216407 9.360483 4.824638 22.622415
2002-09-01 -49.155902 10.002493 3.789504 22.808149
2002-09-02 -51.839076 10.490752 4.323344 21.314234
2002-09-03 -51.996993 10.218370 5.026497 20.520225
2002-09-04 -53.369587 10.647158 3.875207 21.262938
2002-09-05 -54.236496 10.180358 5.459626 20.664416
2002-09-06 -54.348958 11.098400 5.820596 21.098987
2002-09-07 -56.714004 11.167728 4.879682 20.725511
2002-09-08 -57.418896 11.304261 5.456125 21.188847
2002-09-09 -55.980077 12.488385 5.529494 20.326200
2002-09-10 -56.214281 14.106066 4.355109 19.350696
2002-09-11 -56.541622 14.926102 3.006789 21.219695
2002-09-12 -57.434431 13.767285 3.615759 20.840787
2002-09-13 -59.228884 13.133139 2.129772 20.877084
2002-09-14 -59.429832 13.282503 0.289519 22.303381
2002-09-15 -59.082571 15.174494 0.053155 22.048665
2002-09-16 -58.386137 14.649073 0.713117 20.763790
2002-09-17 -58.570674 15.404131 -0.173061 21.791286
2002-09-18 -57.916990 15.121028 1.747123 22.860719
2002-09-19 -57.483817 13.923232 3.844805 23.486515
2002-09-20 -58.522180 14.579001 3.927660 22.452312
2002-09-21 -60.020574 14.422935 4.434261 21.947218
2002-09-22 -61.440244 15.268679 3.568874 22.143399
2002-09-23 -61.086400 12.421193 3.859252 21.580740
2002-09-24 -62.016026 12.336201 3.591135 22.457797
2002-09-25 -63.534143 12.569431 4.402746 21.675755
2002-09-26 -64.290337 12.206089 3.779266 22.571632

1000 rows × 4 columns

import matplotlib.pyplot as plt
plt.figure()
<Figure size 432x288 with 0 Axes>




<Figure size 432x288 with 0 Axes>
df.plot()
<matplotlib.axes._subplots.AxesSubplot at 0xc111c88>

png

plt.legend(loc='best')
No handles with labels found to put in legend.





<matplotlib.legend.Legend at 0xbd8db00>

png


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章