最全的pandas基础运算（上）

原創

2019-07-06 19:42

请关注微信公众号：机器知心媛，每天进步一点点
import pandas as pd
import numpy as np
# Build a Series from a plain list; np.nan stands in for a missing value.
s = pd.Series(data=[1, 3, 6, np.nan, 44, 1])
print(s)
# No index was passed in, so pandas creates a default integer index
# running from 0 to N-1.
# Sample output:
# 0     1.0
# 1     3.0
# 2     6.0
# 3     NaN
# 4    44.0
# 5     1.0
# dtype: float64

# Six consecutive calendar days starting at 2016-01-01; used as row labels.
datas = pd.date_range('20160101', periods=6)
# A DataFrame is a tabular data structure holding an ordered set of columns;
# it has both a row index and a column index.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=datas,
    columns=['a', 'b', 'c', 'd'],
)
print(df)
# Sample output (the values are random, so every run prints different numbers):
#                    a         b         c         d
# 2016-01-01  1.585208 -0.990413  1.236099 -0.388732
# 2016-01-02  0.495724  0.059505  0.416845 -0.181266
# 2016-01-03 -1.859331  0.071485 -0.665165  0.110948
# 2016-01-04  2.665616 -0.974616  0.047750 -0.623723
# 2016-01-05 -0.618240 -0.770843  0.779370 -1.378176
# 2016-01-06 -0.101552  1.311490 -0.665657  0.854078

# Pick out a single column by its label; the result is a Series.
print(df['b'])
# Sample output (recorded from a different run than the table above):
# 2016-01-01   -2.038565
# 2016-01-02    0.378850
# 2016-01-03    0.354676
# 2016-01-04    0.096856
# 2016-01-05    2.455016
# 2016-01-06    0.702862
# Freq: D, Name: b, dtype: float64

# With neither index nor columns given, both axes get default integer labels.
df1 = pd.DataFrame(np.arange(12).reshape(3, 4))
print(df1)
# Sample output:
#    0  1   2   3
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11

# Build a DataFrame from a dict: each key becomes a column, and scalar
# values are repeated to match the length of the array-like columns (4 rows).
df2 = pd.DataFrame(dict(
    A=1.,
    B=pd.Timestamp('20130102'),
    C=pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
    D=np.full(4, 3, dtype='int32'),
    E=pd.Categorical(["test", "train", "test", "train"]),
    F='foo',
))

print(df2)
# Sample output:
#      A          B    C  D      E    F
# 0  1.0 2013-01-02  1.0  3   test  foo
# 1  1.0 2013-01-02  1.0  3  train  foo
# 2  1.0 2013-01-02  1.0  3   test  foo
# 3  1.0 2013-01-02  1.0  3  train  foo

# Per-column data types.
print(df2.dtypes)
# Sample output:
# A           float64
# B    datetime64[ns]
# C           float32
# D             int32
# E          category
# F            object
# dtype: object

# Row labels.
print(df2.index)
# Sample output (as recorded by the author; the repr varies by pandas version):
# Int64Index([0, 1, 2, 3], dtype='int64')

# Column labels.
print(df2.columns)
# Sample output:
# Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

# The underlying data as a NumPy array.
print(df2.values)
# Sample output:
# [[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
#  [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']
#  [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
#  [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']]

# Summary statistics.
print(df2.describe())
# Sample output (as recorded by the author; the set of summarised columns
# may vary by pandas version):
#          A    C    D
# count  4.0  4.0  4.0
# mean   1.0  1.0  3.0
# std    0.0  0.0  0.0
# min    1.0  1.0  3.0
# 25%    1.0  1.0  3.0
# 50%    1.0  1.0  3.0
# 75%    1.0  1.0  3.0
# max    1.0  1.0  3.0

# Transpose: rows become columns and vice versa.
print(df2.T)

# Sort by axis labels; axis=1 targets the column labels, here in descending order.
print(df2.sort_index(axis=1, ascending=False))
# Sample output:
#      F      E  D    C          B    A
# 0  foo   test  3  1.0 2013-01-02  1.0
# 1  foo  train  3  1.0 2013-01-02  1.0
# 2  foo   test  3  1.0 2013-01-02  1.0
# 3  foo  train  3  1.0 2013-01-02  1.0

# Sort rows by the values in column 'B'.
print(df2.sort_values(by='B'))
# Sample output:
#      A          B    C  D      E    F
# 0  1.0 2013-01-02  1.0  3   test  foo
# 1  1.0 2013-01-02  1.0  3  train  foo
# 2  1.0 2013-01-02  1.0  3   test  foo
# 3  1.0 2013-01-02  1.0  3  train  foo


import numpy as np
import pandas as pd
# Six consecutive days starting at 2019-01-01 serve as the row labels.
dates = pd.date_range('20190101', periods=6)
# A 6x4 table holding the integers 0..23 under columns A-D.
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])

# Select one column by label (df.A is the attribute-style equivalent).
print(df['A'])
# Sample output (the integer dtype shown is what the author's environment
# printed; it may differ on other platforms):
# 2019-01-01     0
# 2019-01-02     4
# 2019-01-03     8
# 2019-01-04    12
# 2019-01-05    16
# 2019-01-06    20
# Freq: D, Name: A, dtype: int32

# Slice rows by position: the first three rows.
print(df[0:3])
# Sample output:
#             A  B   C   D
# 2019-01-01  0  1   2   3
# 2019-01-02  4  5   6   7
# 2019-01-03  8  9  10  11

# Slice rows by label: both endpoints are included.
print(df["20190102":"20190104"])
# Sample output:
#              A   B   C   D
# 2019-01-02   4   5   6   7
# 2019-01-03   8   9  10  11
# 2019-01-04  12  13  14  15

# Select a single row by its label with .loc.
print(df.loc['20190101'])
# Sample output:
# A    0
# B    1
# C    2
# D    3
# Name: 2019-01-01 00:00:00, dtype: int32

# All rows, columns 'A' through 'B' (label slices are inclusive).
print(df.loc[:, 'A':'B'])
# Sample output:
#              A   B
# 2019-01-01   0   1
# 2019-01-02   4   5
# 2019-01-03   8   9
# 2019-01-04  12  13
# 2019-01-05  16  17
# 2019-01-06  20  21

# One row by label, restricted to two columns.
print(df.loc['20190102', ['A', 'B']])
# Sample output:
# A    4
# B    5

# Select by integer position with .iloc.
print(df.iloc[3, 1])  # 13
print(df.iloc[3:5, 1:3])
print(df.iloc[[1, 3, 5], 1:3])

# Mixed selection: the first three rows (by position) and columns 'A' and 'C'
# (by label). The original used DataFrame.ix, which was deprecated in pandas
# 0.20 and removed in pandas 1.0, so it raises AttributeError on current
# versions. Translating the row positions to labels keeps the same result
# with the supported .loc indexer.
print(df.loc[df.index[:3], ['A', 'C']])

# Filter rows with a boolean condition.
print(df[df.A > 8])
# Sample output:
#              A   B   C   D
# 2019-01-04  12  13  14  15
# 2019-01-05  16  17  18  19
# 2019-01-06  20  21  22  23

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

最全的pandas基础运算（上）

全國程序員及格卷曝光，你能答對幾題？

Dijkstra算法詳解

字典樹

CodeForce 534C Polycarpus' Dice （數學推理）

LCM Walk HDU

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結