Python pandas 模块,Series, DataFrame 学习笔记

https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#basics-dataframe

Python  pandas 模块,Series, DataFrame 学习笔记

 

 https://note.youdao.com/s/LFip7Cc5

 

 

python pandas 笔记1

包含头文件

#!/usr/bin/env python

import numpy as np
import pandas as pd

Series

"""
Series

Series is a one-dimensional labeled array capable of holding any data type
(integers, strings, floating point numbers, Python objects, etc.).

The axis labels are collectively referred to as the index.
The basic method to create a Series is to call:
s = pd.Series(data, index=index)
"""


# from ndarray
s1 = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])
print("pd s1:\n", s1)
print("pd s1.index:", s1.index)
print("pd s1.values:", s1.values)
pd s1:
 a -0.261995
b 0.119171
c -0.129191
d -1.385260
e -0.087495
dtype: float64
pd s1.index: Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
pd s1.values: [-0.26199524 0.11917108 -0.12919125 -1.38525982 -0.08749467]
# from dict
d = {"b": 1, "a": 0, "c": 2}
s2 = pd.Series(d)
print("pd s2:\n", s2)

s3 = pd.Series(d, index=["b", "c", "d", "a"])
print("pd s3:\n", s3)
pd s2:
 b 1
a 0
c 2
dtype: int64
pd s3:
 b 1.0
c 2.0
d NaN
a 0.0
dtype: float64
# from scalar value
s4 = pd.Series(5.0, index=["a", "b", "c", "d", "e"])
print("pd s4:\n", s4)
pd s4:
 a 5.0
b 5.0
c 5.0
d 5.0
e 5.0
dtype: float64

# Series is ndarray-like,可以像数组一样访问 Series 里面的数据
print("pd s3.array:\n", s3.array[1])
# Series is dict-like. 可以像字典一样通过标签访问 Series 里面的数据
print("pd s3.['b']:\n", s3['b'])
print("test s3 key:", "b" in s3)
print("test s3 key:", "f" in s3)
pd s3.array:
 2.0
pd s3.['b']:
 1.0
test s3 key: True
test s3 key: False

# Using the Series.get() method, a missing label will return None or specified default:
print("Series.get() method:", s3.get("name"))
print("Series.get() method:", s3.get("name", np.nan))
print("Series.get() method:", s3.get("name", "henry"))
Series.get() method: None
Series.get() method: nan
Series.get() method: henry
# Series also has a name attribute:
s5 = pd.Series(np.random.randn(5), name="henry")
print("pd s5:\n", s5)

s6 = s5.rename("henry2")
print("pd s6:\n", s6)
print("pd s6.head():\n", s6.head())
print("pd s6.head(2):\n", s6.head(2))
pd s5:
 0 -0.476002
1 0.248520
2 1.094846
3 0.505171
4 -0.176442
Name: henry, dtype: float64
pd s6:
 0 -0.476002
1 0.248520
2 1.094846
3 0.505171
4 -0.176442
Name: henry2, dtype: float64
pd s6.head():
 0 -0.476002
1 0.248520
2 1.094846
3 0.505171
4 -0.176442
Name: henry2, dtype: float64
pd s6.head(2):
 0 -0.476002
1 0.248520
Name: henry2, dtype: float64

DataFrame

"""
DataFrame

DataFrame is a 2-dimensional labeled data structure with columns of potentially different types.
You can think of it like a spreadsheet or SQL table,
or a dict of Series objects.

It is generally the most commonly used pandas object.
Like Series, DataFrame accepts many different kinds of input:

Dict of 1D ndarrays, lists, dicts, or Series
2-D numpy.ndarray
Structured or record ndarray
A Series
Another DataFrame

Along with the data, you can optionally pass index (row labels) and columns (column labels) arguments.
If you pass an index and / or columns,
you are guaranteeing the index and / or columns of the resulting DataFrame.
Thus, a dict of Series plus a specific index will discard all data
not matching up to the passed index.

If axis labels are not passed,
they will be constructed from the input data based on common sense rules.

"""


# from dict of Series or dicts
d2 = {
    "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    "two": pd.Series([10.0, 20.0, 30.0, 40.0], index=["a", "b", "c", "d"])
}
df1 = pd.DataFrame(d2)
print("DataFrame df1:", df1)

df2 = pd.DataFrame(d2, index=["d", "b", "a"])
print("DataFrame df2:", df2)
df2.rename(columns={'two': 'symbol'}, inplace=True)
print("DataFrame df2.rename:", df2)

df3 = pd.DataFrame(d2, index=["d", "b", "a"], columns=["two", "three"])
print("DataFrame df3:", df3)
DataFrame df1:    one   two
a 1.0 10.0
b 2.0 20.0
c 3.0 30.0
d NaN 40.0
DataFrame df2: one two
d NaN 40.0
b 2.0 20.0
a 1.0 10.0
DataFrame df2.rename: one symbol
d NaN 40.0
b 2.0 20.0
a 1.0 10.0
DataFrame df3: two three
d 40.0 NaN
b 20.0 NaN
a 10.0 NaN

# the row and column labels can be accessed respectively by accessing the index and columns attributes:
# 索引名称
print("df3.index:", df3.index)
# 列名称
print("df3.columns:", df3.columns)

df3.index: Index(['d', 'b', 'a'], dtype='object')
df3.columns: Index(['two', 'three'], dtype='object')

# from dict of ndarrays / lists
d = {
    "one":[1.0, 2.0, 3.0, 4.0],
    "two":[4.0, 3.0, 2.0, 1.0]
}
df4 = pd.DataFrame(d)
print("DataFrame df4:", df4)

df5 = pd.DataFrame(d, index=["a", "b", "c", "d"])
print("DataFrame df5:", df5)
DataFrame df4:    one  two
0 1.0 4.0
1 2.0 3.0
2 3.0 2.0
3 4.0 1.0
DataFrame df5: one two
a 1.0 4.0
b 2.0 3.0
c 3.0 2.0
d 4.0 1.0

# from structured or record array
# Field "C" is a fixed-width 10-byte string. It is spelled "S10" because the
# older "a10" spelling relies on the "a" alias, which NumPy 2.0 deprecates.
data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "S10")])
print("DataFrame data1:", data)
# Fill both records at once; the text values are stored as bytes in field "C".
data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
print("DataFrame data2:", data)
# The field names of the structured dtype become the column names.
df6 = pd.DataFrame(data)
print("DataFrame df6:", df6)

# index= supplies the row labels.
df7 = pd.DataFrame(data, index=["first", "second"])
print("DataFrame df7:", df7)

# columns= selects and reorders the fields.
df8 = pd.DataFrame(data, columns=["C", "A", "B"])
print("DataFrame df8:", df8)
DataFrame data1: [(0, 0., b'') (0, 0., b'')]
DataFrame data2: [(1, 2., b'Hello') (2, 3., b'World')]
DataFrame df6: A B C
0 1 2.0 b'Hello'
1 2 3.0 b'World'
DataFrame df7: A B C
first 1 2.0 b'Hello'
second 2 3.0 b'World'
DataFrame df8: C A B
0 b'Hello' 1 2.0
1 b'World' 2 3.0

# from a list of dicts
data2 = [
    {"a":1, "b":2},
    {"a":5,"b":10,"c":20}
]
df9 = pd.DataFrame(data2)
print("DataFrame df9:", df9)


print("DataFrame df10:", pd.DataFrame(data2, index=["first", "second"]))

# 只获取columns 列出的那几列数据
print("DataFrame df11:", pd.DataFrame(data2, columns=["a", "b"]))
DataFrame df9:    a   b     c
0 1 2 NaN
1 5 10 20.0
DataFrame df10: a b c
first 1 2 NaN
second 5 10 20.0
DataFrame df11: a b
0 1 2
1 5 10

# from a dict of tuples
df12 = pd.DataFrame(
    {
        ("a","b"):{("A", "B"):1, ("A", "C"):2},
        ("a", "a"): {("A", "C"): 3, ("A", "B"): 4},
        ("a", "c"): {("A", "B"): 5, ("A", "C"): 6},
        ("b", "a"): {("A", "C"): 7, ("A", "B"): 7},
        ("b", "b"): {("A", "D"): 9, ("A", "B"): 10}
    })
print("DataFrame df12:", df12)

DataFrame df12:        a              b      
       b a c a b
A B 1.0 4.0 5.0 7.0 10.0
  C 2.0 3.0 6.0 7.0 NaN
  D NaN NaN NaN NaN 9.0

# from a Series
ser = pd.Series(range(3), index=list("abc"), name="ser")
print("ser:", ser)
print("DataFrame df13:", pd.DataFrame(ser))
# Series 里面定义的name,就是DataFrame里面的列 名称
print("DataFrame df14:", pd.DataFrame(ser, columns=["ser"]))
print("DataFrame df15:", pd.DataFrame(ser, columns=["ser", "name2"]))

ser: a    0
b 1
c 2
Name: ser, dtype: int64
DataFrame df13: ser
a 0
b 1
c 2
DataFrame df14: ser
a 0
b 1
c 2
DataFrame df15: ser name2
a 0 NaN
b 1 NaN
c 2 NaN

# from a list of namedtuples
from collections import  namedtuple
Point = namedtuple("Point", "x y")
print("DataFrame df16:", pd.DataFrame([Point(0,0), Point(0,3), (2,3)]))
Point3D = namedtuple("Point3D", "x y z")
print("DataFrame df17:", pd.DataFrame([Point3D(0,0,0), Point3D(0,3,5), Point(2,3)]))


DataFrame df16:    x  y
0 0 0
1 0 3
2 2 3
DataFrame df17: x y z
0 0 0 0.0
1 0 3 5.0
2 2 3 NaN

# from a list of dataclasses
from dataclasses import make_dataclass
Point = make_dataclass("Point", [("x", int), ("y", int)])
print("DataFrame df18:", pd.DataFrame([Point(0,0), Point(0,3), Point(2,3)]))



DataFrame df18:    x  y
0 0 0
1 0 3
2 2 3

Alternate constructors

"""
DataFrame.from_dict

DataFrame.from_dict() takes a dict of dicts or a dict of array-like sequences and returns a DataFrame.
It operates like the DataFrame constructor except for the orient parameter
which is 'columns' by default,
but which can be set to 'index' in order to use the dict keys as row labels.
"""


print("df19:",pd.DataFrame.from_dict(dict([("A", [1, 2, 3]), ("B", [4, 5, 6])])))

# orient="index", 列名 和索引名交换了,相当于旋转了数组
# If you pass orient='index', the keys will be the row labels.
# In this case, you can also pass the desired column names:
print("df20:",pd.DataFrame.from_dict(
    dict([("A", [1, 2, 3]), ("B", [4, 5, 6])]),
    orient="index",
    columns=["one", "two", "three"],))

print("df21:",pd.DataFrame.from_dict(
    dict([("A", [1, 2, 3]), ("B", [4, 5, 6])]),
    orient="index"))

df19:    A  B
0 1 4
1 2 5
2 3 6
df20: one two three
A 1 2 3
B 4 5 6
df21: 0 1 2
A 1 2 3
B 4 5 6

DataFrame.from_records

DataFrame.from_records() takes a list of tuples or an ndarray with structured dtype.
It works analogously to the normal DataFrame constructor, except that the resulting DataFrame index may be a specific field of the structured dtype.

【暂时不理解】

Column selection, addition, deletion

"""
You can treat a DataFrame semantically like a dict of like-indexed Series objects.
Getting, setting, and deleting columns works with the same syntax as the analogous dict operations:

"""


# 访问df 的某列,df的某列就是一个 Series
print("df1", df1)
print("df22", df1["one"])

df1["three"] = df1["one"] * df1["two"]
# 判断df1["one"]里面每个元素是否 大于2,结果是 一个Bool类型变量
df1["flag"] = df1["one"]>2
print("df23:", df1)
df1    one   two
a 1.0 10.0
b 2.0 20.0
c 3.0 30.0
d NaN 40.0
df22 a 1.0
b 2.0
c 3.0
d NaN
Name: one, dtype: float64
df23: one two three flag
a 1.0 10.0 10.0 False
b 2.0 20.0 40.0 False
c 3.0 30.0 90.0 True
d NaN 40.0 NaN False
# Columns can be deleted or popped like with a dict:
del df1["two"]
three = df1.pop("three")
print("df three:", three)
print("df24:", df1)
df three: a    10.0
b 40.0
c 90.0
d NaN
Name: three, dtype: float64
df24: one flag
a 1.0 False
b 2.0 False
c 3.0 True
d NaN False
# when inserting a scalar value, it will naturally be propagated to fill the column.
df1["foo"] = "bar"
print("df25:", df1)
df25:    one   flag  foo
a 1.0 False bar
b 2.0 False bar
c 3.0 True bar
d NaN False bar

# when inserting a Series that does not have the same index as the DataFrame,it will be conformed to the DataFrame's index
## slicing 切片知识补充
tag = "henry, hello slicing!"
print("tag[:]", tag[:])
# [) 切片是一个 半闭半开 的区间
print("tag[1:2]", tag[1:2])
print("tag[0:2]", tag[0:2])
print("tag[:2]", tag[:2])

print("Series [:]", df1["one"][:])
print("Series [:2]", df1["one"][:2])
df1["one_trunc"] = df1["one"][:2]
print("DataFrame df25:", df1)
tag[:] henry, hello slicing!
tag[1:2] e
tag[0:2] he
tag[:2] he
Series [:] a 1.0
b 2.0
c 3.0
d NaN
Name: one, dtype: float64
Series [:2] a 1.0
b 2.0
Name: one, dtype: float64
DataFrame df25: one flag foo one_trunc
a 1.0 False bar 1.0
b 2.0 False bar 2.0
c 3.0 True bar NaN
d NaN False bar NaN
# 指定位置插入一列
# You can insert raw ndarrays but their length must match the length of the DataFrame’s index.
# By default, columns get inserted at the end. DataFrame.insert() inserts at a particular location in the column
df1.insert(1,"insert_bar", df1["one"])
print("DataFrame df26:",df1)

DataFrame df26:    one  insert_bar   flag  foo  one_trunc
a 1.0 1.0 False bar 1.0
b 2.0 2.0 False bar 2.0
c 3.0 3.0 True bar NaN
d NaN NaN False bar NaN

# 通过 assign() 方法,从已有的列中 创造一个新的列
print("df1.head()", df1.head())
print("df1['one'].head()", df1["one"].head())
## assign 方法创造了新的列,但是不会改变之前的df数据,新的列是在 返回的数据里面
df2 = df1.assign(new_col=df1["one"]/df1["one_trunc"])
print("DataFrame df27:",df1) # df1 还是之前的结构,没有改变
print("DataFrame df28:",df2) # df2 才是改变后的结构
df1.head()    one  insert_bar   flag  foo  one_trunc
a 1.0 1.0 False bar 1.0
b 2.0 2.0 False bar 2.0
c 3.0 3.0 True bar NaN
d NaN NaN False bar NaN
df1['one'].head() a 1.0
b 2.0
c 3.0
d NaN
Name: one, dtype: float64
DataFrame df27: one insert_bar flag foo one_trunc
a 1.0 1.0 False bar 1.0
b 2.0 2.0 False bar 2.0
c 3.0 3.0 True bar NaN
d NaN NaN False bar NaN
DataFrame df28: one insert_bar flag foo one_trunc new_col
a 1.0 1.0 False bar 1.0 1.0
b 2.0 2.0 False bar 2.0 1.0
c 3.0 3.0 True bar NaN NaN
d NaN NaN False bar NaN NaN

# 通过函数的方式来创建新的列
# In the example above, we inserted a precomputed value.
# We can also pass in a function of one argument to be evaluated on the DataFrame being assigned to.
df3 = df1.assign(func_col=lambda x:(x["one"]+10))
print("DataFrame df29:", df3)

DataFrame df29:    one  insert_bar   flag  foo  one_trunc  func_col
a 1.0 1.0 False bar 1.0 11.0
b 2.0 2.0 False bar 2.0 12.0
c 3.0 3.0 True bar NaN 13.0
d NaN NaN False bar NaN NaN

20230310.csv

SepalLength,SepalWidth,PetalLength,PetalWidth,Name
 5.1 ,  3.5 ,         1.4 ,        0.2  ,Iris-setosa
  4.9 , 3.0  ,        1.4  ,       0.2 , Iris-setosa
4.7,3.2,1.3 ,0.2,Iris-setosa
4.6 ,        3.1  ,        1.5 ,        0.2  ,Iris-setosa
 5.0  ,       3.6  ,        1.4  ,       0.2,  Iris-setosa


# assign() always returns a copy of the data, leaving the original DataFrame untouched.
# The sample CSV pads its fields with spaces. skipinitialspace=True drops the
# blanks that follow each comma, and stripping the header names guarantees the
# columns can be addressed as "PetalWidth" rather than " PetalWidth".
iris = pd.read_csv("20230310.csv", skipinitialspace=True)
iris.columns = iris.columns.str.strip()
print("csv data:", iris)
print("iris.assign:",iris.assign(sepal_ratio=iris["SepalWidth"] + 10).head() )
print("iris cloumns:", iris.columns)
print("csv data:[PetalWidth]", iris["PetalWidth"])
print("csv data:[PetalLength]", iris["PetalLength"])


# A callable passed to assign() receives the DataFrame and returns the new column.
print("PetalRatio:",iris.assign(PetalRatio=lambda x: x.PetalWidth / x.PetalLength).head())
csv data:    SepalLength  SepalWidth  PetalLength  PetalWidth           Name
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
iris.assign: SepalLength SepalWidth PetalLength PetalWidth Name sepal_ratio
0 5.1 3.5 1.4 0.2 Iris-setosa 13.5
1 4.9 3.0 1.4 0.2 Iris-setosa 13.0
2 4.7 3.2 1.3 0.2 Iris-setosa 13.2
3 4.6 3.1 1.5 0.2 Iris-setosa 13.1
4 5.0 3.6 1.4 0.2 Iris-setosa 13.6
iris cloumns: Index(['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Name'], dtype='object')
csv data:[PetalWidth] 0 0.2
1 0.2
2 0.2
3 0.2
4 0.2
Name: PetalWidth, dtype: float64
csv data:[PetalLength] 0 1.4
1 1.4
2 1.3
3 1.5
4 1.4
Name: PetalLength, dtype: float64
PetalRatio: SepalLength SepalWidth PetalLength PetalWidth Name PetalRatio
0 5.1 3.5 1.4 0.2 Iris-setosa 0.142857
1 4.9 3.0 1.4 0.2 Iris-setosa 0.142857
2 4.7 3.2 1.3 0.2 Iris-setosa 0.153846
3 4.6 3.1 1.5 0.2 Iris-setosa 0.133333
4 5.0 3.6 1.4 0.2 Iris-setosa 0.142857

"""
(
iris.query("SepalLength > 5")
.assign(
SepalRatio=lambda x: x.SepalWidth / x.SepalLength,
PetalRatio=lambda x: x.PetalWidth / x.PetalLength,
).plot(kind="scatter", x="SepalRatio", y="PetalRatio")
)
"""

"""
The function signature for assign() is simply **kwargs.
The keys are the column names for the new fields,
and the values are either a value to be inserted (for example, a Series or NumPy array),
or a function of one argument to be called on the DataFrame.
A copy of the original DataFrame is returned, with the new values inserted.

The order of **kwargs is preserved. This allows for dependent assignment,
where an expression later in **kwargs can refer to a column created earlier in the same assign().
"""

# In the second expression, x['C'] will refer to the newly created column, that’s equal to dfa['A'] + dfa['B'].
dfa = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
dfb = dfa.assign(C=lambda x: x["A"] + x["B"], D=lambda x: x["A"] + x["C"])
print("dfa:", dfa)
print("dfb:", dfb)

dfa:    A  B
0 1 4
1 2 5
2 3 6
dfb: A B C D
0 1 4 5 6
1 2 5 7 9
2 3 6 9 12

Indexing / selection

Operation                           Syntax                  Result
Select column                       df[col]                 Series
Select row by label                 df.loc[label]           Series
Select row by integer location      df.iloc[loc]            Series
Slice rows                          df[5:10]                DataFrame
Select rows by boolean vector       df[bool_vec]            DataFrame
# Row selection, for example, returns a Series whose index is the columns of the DataFrame:
print("df30:", df1)
## 选出 某一行(b 是一个索引值,选出这个索引的行)Select row by label
print("df31:", df1.loc["b"])


df30:    one  insert_bar   flag  foo  one_trunc
a 1.0 1.0 False bar 1.0
b 2.0 2.0 False bar 2.0
c 3.0 3.0 True bar NaN
d NaN NaN False bar NaN
df31: one 2
insert_bar 2
flag False
foo bar
one_trunc 2
Name: b, dtype: object
# Select row by integer location
print("df32:", df1.iloc[2])

df32: one              3
insert_bar 3
flag True
foo bar
one_trunc NaN
Name: c, dtype: object

Data alignment and arithmetic

"""
Data alignment between DataFrame objects automatically align on both the columns and the index (row labels).
Again, the resulting object will have the union of the column and row labels.
"""


df = pd.DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"])
df2 = pd.DataFrame(np.random.randn(7, 3), columns=["A", "B", "C"])
print("df40:", df)
print("df41:", df2)
print("df42:", df + df2)
print("df43:", df - df.iloc[0])

print("df44:", df * 5 + 2)
print("df45:", 1 / df)
print("df46:", df ** 4)


df40:           A         B         C         D
0 -1.084084 0.183785 -1.153985 0.055283
1 0.253552 0.077291 -0.303460 0.701300
2 0.821357 -1.116865 0.610512 -1.327411
3 -0.251630 -0.341660 0.730774 -0.584376
4 -1.215528 -0.691270 -0.552341 0.278375
5 -0.009694 -0.963907 1.593190 -0.991862
6 0.377235 -1.089611 -0.515518 -0.252789
7 -1.158782 -1.405582 -0.189190 -1.787201
8 1.469154 0.924453 -0.305886 -0.527754
9 -0.416995 1.469462 -1.107226 0.941600
df41: A B C
0 -0.184298 1.094119 -0.623001
1 -0.531990 -0.025734 -0.948708
2 0.877716 -1.547748 -0.753285
3 -0.248297 -1.370722 1.646786
4 0.958594 -0.373161 1.166930
5 -0.626382 1.731893 0.521530
6 -0.008678 0.955742 0.463842
df42: A B C D
0 -1.268382 1.277904 -1.776987 NaN
1 -0.278439 0.051558 -1.252168 NaN
2 1.699073 -2.664612 -0.142773 NaN
3 -0.499928 -1.712383 2.377560 NaN
4 -0.256934 -1.064431 0.614589 NaN
5 -0.636076 0.767986 2.114720 NaN
6 0.368556 -0.133869 -0.051676 NaN
7 NaN NaN NaN NaN
8 NaN NaN NaN NaN
9 NaN NaN NaN NaN
df43: A B C D
0 0.000000 0.000000 0.000000 0.000000
1 1.337636 -0.106494 0.850525 0.646017
2 1.905441 -1.300650 1.764497 -1.382694
3 0.832454 -0.525445 1.884759 -0.639659
4 -0.131444 -0.875055 0.601645 0.223092
5 1.074390 -1.147693 2.747175 -1.047145
6 1.461319 -1.273396 0.638467 -0.308072
7 -0.074698 -1.589367 0.964795 -1.842484
8 2.553238 0.740668 0.848100 -0.583037
9 0.667089 1.285677 0.046759 0.886317
df44: A B C D
0 -3.420421 2.918925 -3.769926 2.276415
1 3.267758 2.386457 0.482700 5.506499
2 6.106784 -3.584324 5.052559 -4.637055
3 0.741848 0.291698 5.653871 -0.921882
4 -4.077641 -1.456350 -0.761703 3.391873
5 1.951529 -2.819537 9.965949 -2.959312
6 3.886173 -3.448053 -0.577592 0.736056
7 -3.793910 -5.027910 1.054048 -6.936004
8 9.345768 6.622265 0.470572 -0.638769
9 -0.084974 9.347311 -3.536131 6.708002
df45: A B C D
0 -0.922438 5.441138 -0.866562 18.088755
1 3.943970 12.938049 -3.295328 1.425924
2 1.217498 -0.895364 1.637970 -0.753346
3 -3.974082 -2.926883 1.368412 -1.711226
4 -0.822688 -1.446613 -1.810477 3.592281
5 -103.155091 -1.037444 0.627672 -1.008204
6 2.650870 -0.917759 -1.939795 -3.955872
7 -0.862975 -0.711449 -5.285678 -0.559534
8 0.680664 1.081721 -3.269195 -1.894823
9 -2.398111 0.680521 -0.903158 1.062022
df46: A B C D
0 1.381186e+00 0.001141 1.773377 0.000009
1 4.133002e-03 0.000036 0.008480 0.241888
2 4.551214e-01 1.555974 0.138924 3.104715
3 4.009152e-03 0.013626 0.285189 0.116619
4 2.183033e+00 0.228345 0.093074 0.006005
5 8.831558e-09 0.863260 6.442732 0.967845
6 2.025099e-02 1.409565 0.070628 0.004083
7 1.803047e+00 3.903235 0.001281 10.202189
8 4.658744e+00 0.730364 0.008755 0.077576
9 3.023589e-02 4.662659 1.502954 0.786080


df1 = pd.DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}, dtype=bool)
df2 = pd.DataFrame({"a": [0, 1, 1], "b": [1, 1, 0]}, dtype=bool)
print("df1:", df1)
print("df2:", df2)
print("df47:", df1 & df2)
print("df48:", df1 | df2)
print("df49:", df1 ^ df2)
print("df50:", -df1)

df1:        a      b
0 True False
1 False True
2 True True
df2: a b
0 False True
1 True True
2 True False
df47: a b
0 False False
1 False True
2 True False
df48: a b
0 True True
1 True True
2 True True
df49: a b
0 True True
1 True False
2 False True
df50: a b
0 False True
1 True False
2 False False

## 转置(行列互换)DataFrame 里面的数据
print("df", df)
print("df[:5]", df[:5].T)



df A B C D
0 -1.084084 0.183785 -1.153985 0.055283
1 0.253552 0.077291 -0.303460 0.701300
2 0.821357 -1.116865 0.610512 -1.327411
3 -0.251630 -0.341660 0.730774 -0.584376
4 -1.215528 -0.691270 -0.552341 0.278375
5 -0.009694 -0.963907 1.593190 -0.991862
6 0.377235 -1.089611 -0.515518 -0.252789
7 -1.158782 -1.405582 -0.189190 -1.787201
8 1.469154 0.924453 -0.305886 -0.527754
9 -0.416995 1.469462 -1.107226 0.941600
df[:5] 0 1 2 3 4
A -1.084084 0.253552 0.821357 -0.251630 -1.215528
B 0.183785 0.077291 -1.116865 -0.341660 -0.691270
C -1.153985 -0.303460 0.610512 0.730774 -0.552341
D 0.055283 0.701300 -1.327411 -0.584376 0.278375

 

完整的脚本

#!/usr/bin/env python

import numpy as np
import pandas as pd

# ------------------------ Series

"""
Series

Series is a one-dimensional labeled array capable of holding any data type 
(integers, strings, floating point numbers, Python objects, etc.). 

The axis labels are collectively referred to as the index. 
The basic method to create a Series is to call:
s = pd.Series(data, index=index)
"""
# Build a Series from an ndarray: five random values labelled "a".."e".
s1 = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])
print("pd s1:\n", s1)
print("pd s1.index:", s1.index)
print("pd s1.values:", s1.values)

# Build a Series from a dict: the dict keys become the index labels.
d = {"b": 1, "a": 0, "c": 2}
s2 = pd.Series(d)
print("pd s2:\n", s2)

# With an explicit index the values are looked up by label; "d" is not a key
# of the dict, so that entry is NaN.
s3 = pd.Series(d, index=["b", "c", "d", "a"])
print("pd s3:\n", s3)

# Build a Series from a scalar: the value is repeated for every index label.
s4 = pd.Series(5.0, index=["a", "b", "c", "d", "e"])
print("pd s4:\n", s4)

# Series is ndarray-like: elements can be read by integer position.
print("pd s3.array:\n", s3.array[1])
# Series is dict-like: values can be read by label, and `in` tests whether a
# label is present in the index.
print("pd s3.['b']:\n", s3['b'])
print("test s3 key:", "b" in s3)
print("test s3 key:", "f" in s3)

# Using the Series.get() method, a missing label will return None or the specified default:
print("Series.get() method:", s3.get("name"))
print("Series.get() method:", s3.get("name", np.nan))
print("Series.get() method:", s3.get("name", "henry"))

# Series also has a name attribute:
s5 = pd.Series(np.random.randn(5), name="henry")
print("pd s5:\n", s5)

# rename() returns a Series carrying the new name; it is bound to s6 here.
s6 = s5.rename("henry2")
print("pd s6:\n", s6)
# head() shows the leading rows (all five of them here); head(2) only the first two.
print("pd s6.head():\n", s6.head())
print("pd s6.head(2):\n", s6.head(2))

# ------------------------ DataFrame
"""
DataFrame

DataFrame is a 2-dimensional labeled data structure with columns of potentially different types. 
You can think of it like a spreadsheet or SQL table, 
or a dict of Series objects. 

It is generally the most commonly used pandas object. 
Like Series, DataFrame accepts many different kinds of input:

  Dict of 1D ndarrays, lists, dicts, or Series
  2-D numpy.ndarray
  Structured or record ndarray
  A Series
  Another DataFrame

Along with the data, you can optionally pass index (row labels) and columns (column labels) arguments. 
If you pass an index and / or columns, 
you are guaranteeing the index and / or columns of the resulting DataFrame. 
Thus, a dict of Series plus a specific index will discard all data 
not matching up to the passed index.

If axis labels are not passed, 
they will be constructed from the input data based on common sense rules.

"""

# Build a DataFrame from a dict of Series: the dict keys become the column
# names and the rows are the union of the Series indexes ("one" has no "d"
# label, so that cell is NaN).
d2 = {
    "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    "two": pd.Series([10.0, 20.0, 30.0, 40.0], index=["a", "b", "c", "d"])
}
df1 = pd.DataFrame(d2)
print("DataFrame df1:", df1)

# index= keeps only the listed row labels, in the listed order.
df2 = pd.DataFrame(d2, index=["d", "b", "a"])
print("DataFrame df2:", df2)
# Rename column "two" to "symbol", modifying df2 in place.
df2.rename(columns={'two': 'symbol'}, inplace=True)
print("DataFrame df2.rename:", df2)

# columns= selects columns by name; "three" is not a key of d2, so it is all NaN.
df3 = pd.DataFrame(d2, index=["d", "b", "a"], columns=["two", "three"])
print("DataFrame df3:", df3)

# The row and column labels can be accessed respectively through the index and columns attributes:
# row (index) labels
print("df3.index:", df3.index)
# column labels
print("df3.columns:", df3.columns)


# Build a DataFrame from a dict of equal-length lists (note: rebinds the name
# `d` used earlier for the Series examples).
d = {
    "one":[1.0, 2.0, 3.0, 4.0],
    "two":[4.0, 3.0, 2.0, 1.0]
}
# Without index= the rows get the default integer labels 0..3.
df4 = pd.DataFrame(d)
print("DataFrame df4:", df4)

# With index= the four rows are labelled "a".."d" instead.
df5 = pd.DataFrame(d, index=["a", "b", "c", "d"])
print("DataFrame df5:", df5)

# from structured or record array
# Field "C" is a fixed-width 10-byte string. It is spelled "S10" because the
# older "a10" spelling relies on the "a" alias, which NumPy 2.0 deprecates.
data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "S10")])
print("DataFrame data1:", data)
# Fill both records at once; the text values are stored as bytes in field "C".
data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
print("DataFrame data2:", data)
# The field names of the structured dtype become the column names.
df6 = pd.DataFrame(data)
print("DataFrame df6:", df6)

# index= supplies the row labels.
df7 = pd.DataFrame(data, index=["first", "second"])
print("DataFrame df7:", df7)

# columns= selects and reorders the fields.
df8 = pd.DataFrame(data, columns=["C", "A", "B"])
print("DataFrame df8:", df8)


# Build a DataFrame from a list of dicts: each dict is one row and the keys
# are the column names; a key missing from a row ("c" in the first) gives NaN.
data2 = [
    {"a":1, "b":2},
    {"a":5,"b":10,"c":20}
]
df9 = pd.DataFrame(data2)
print("DataFrame df9:", df9)


# index= labels the two rows.
print("DataFrame df10:", pd.DataFrame(data2, index=["first", "second"]))

# columns= keeps only the listed columns.
print("DataFrame df11:", pd.DataFrame(data2, columns=["a", "b"]))

# Build a DataFrame from a dict keyed by tuples: the outer tuple keys become
# two-level column labels and the inner tuple keys become two-level row
# labels. Row/column combinations that are never given are NaN.
df12 = pd.DataFrame(
    {
        ("a","b"):{("A", "B"):1, ("A", "C"):2},
        ("a", "a"): {("A", "C"): 3, ("A", "B"): 4},
        ("a", "c"): {("A", "B"): 5, ("A", "C"): 6},
        ("b", "a"): {("A", "C"): 7, ("A", "B"): 7},
        ("b", "b"): {("A", "D"): 9, ("A", "B"): 10}
    })
print("DataFrame df12:", df12)

# Build a DataFrame from a single Series: the result has the same index and
# one column.
ser = pd.Series(range(3), index=list("abc"), name="ser")
print("ser:", ser)
print("DataFrame df13:", pd.DataFrame(ser))
# The name given to the Series is the column name in the DataFrame.
print("DataFrame df14:", pd.DataFrame(ser, columns=["ser"]))
# A requested column that does not match the Series name ("name2") is all NaN.
print("DataFrame df15:", pd.DataFrame(ser, columns=["ser", "name2"]))


# Build a DataFrame from a list of namedtuples: the namedtuple field names
# ("x", "y") are used as the column names. A plain tuple can be mixed in.
from collections import  namedtuple
Point = namedtuple("Point", "x y")
print("DataFrame df16:", pd.DataFrame([Point(0,0), Point(0,3), (2,3)]))
# A row that is shorter than the three Point3D fields gets NaN in the missing column.
Point3D = namedtuple("Point3D", "x y z")
print("DataFrame df17:", pd.DataFrame([Point3D(0,0,0), Point3D(0,3,5), Point(2,3)]))

# Build a DataFrame from a list of dataclass instances.
# Note: this rebinds `Point`, replacing the namedtuple of the same name above.
from dataclasses import make_dataclass
Point = make_dataclass("Point", [("x", int), ("y", int)])
print("DataFrame df18:", pd.DataFrame([Point(0,0), Point(0,3), Point(2,3)]))


#--------------------------------------- Alternate constructors
"""
DataFrame.from_dict

DataFrame.from_dict() takes a dict of dicts or a dict of array-like sequences and returns a DataFrame. 
It operates like the DataFrame constructor except for the orient parameter 
which is 'columns' by default, 
but which can be set to 'index' in order to use the dict keys as row labels.
"""
# Default orient="columns": each dict key becomes a column.
print("df19:",pd.DataFrame.from_dict(dict([("A", [1, 2, 3]), ("B", [4, 5, 6])])))

# With orient="index" the dict keys become the row labels instead of the
# column names (rows and columns are swapped relative to the default layout).
# If you pass orient='index', the keys will be the row labels.
# In this case, you can also pass the desired column names:
print("df20:",pd.DataFrame.from_dict(
    dict([("A", [1, 2, 3]), ("B", [4, 5, 6])]),
    orient="index",
    columns=["one", "two", "three"],))

# Without columns= the columns get the default integer labels 0, 1, 2.
print("df21:",pd.DataFrame.from_dict(
    dict([("A", [1, 2, 3]), ("B", [4, 5, 6])]),
    orient="index"))


"""
DataFrame.from_records

DataFrame.from_records() takes a list of tuples or an ndarray with structured dtype. 
It works analogously to the normal DataFrame constructor,
except that the resulting DataFrame index may be a specific field of the structured dtype.

"""


# ----------------- Column selection, addition, deletion
"""
You can treat a DataFrame semantically like a dict of like-indexed Series objects. 
Getting, setting, and deleting columns works with the same syntax as the analogous dict operations:

"""
# Selecting a single column of a DataFrame yields a Series.
print("df1", df1)
print("df22", df1["one"])

# A new column can be computed from existing columns.
df1["three"] = df1["one"] * df1["two"]
# Element-wise comparison: "flag" holds one boolean per row (is "one" > 2?).
df1["flag"] = df1["one"]>2
print("df23:", df1)


# Columns can be deleted or popped like with a dict:
del df1["two"]
# pop() removes the column from df1 and returns it.
three = df1.pop("three")
print("df three:", three)
print("df24:", df1)

# when inserting a scalar value, it will naturally be propagated to fill the column.
df1["foo"] = "bar"
print("df25:", df1)

# When inserting a Series that does not have the same index as the DataFrame,
# it will be conformed to the DataFrame's index.

# Side note: a quick refresher on slicing, using a plain string.
tag = "henry, hello slicing!"
print("tag[:]", tag[:])
# Slices are half-open intervals: the start is included, the stop is excluded.
print("tag[1:2]", tag[1:2])
print("tag[0:2]", tag[0:2])
print("tag[:2]", tag[:2])

# The same slicing syntax works on a Series.
print("Series [:]", df1["one"][:])
print("Series [:2]", df1["one"][:2])
# Only the first two rows are assigned; after alignment with df1's index the
# remaining rows of "one_trunc" are NaN.
df1["one_trunc"] = df1["one"][:2]
print("DataFrame df25:", df1)


# Insert a column at a specific position.
# You can insert raw ndarrays but their length must match the length of the DataFrame's index.
# By default, columns get inserted at the end. DataFrame.insert() inserts at a particular location in the columns.
df1.insert(1,"insert_bar", df1["one"])
print("DataFrame df26:",df1)


# Use assign() to derive a new column from existing columns.
print("df1.head()", df1.head())
print("df1['one'].head()", df1["one"].head())
# assign() creates the new column in the DataFrame it returns; df1 itself is
# not modified.
df2 = df1.assign(new_col=df1["one"]/df1["one_trunc"])
print("DataFrame df27:",df1) # df1 still has its previous structure
print("DataFrame df28:",df2) # df2 is the one carrying the new column

# Create a new column from a function.
# In the example above, we inserted a precomputed value.
# We can also pass in a function of one argument to be evaluated on the DataFrame being assigned to.
df3 = df1.assign(func_col=lambda x:(x["one"]+10))
print("DataFrame df29:", df3)

# assign() always returns a copy of the data, leaving the original DataFrame untouched.
# The sample CSV pads its fields with spaces. skipinitialspace=True drops the
# blanks that follow each comma, and stripping the header names guarantees the
# columns can be addressed as "PetalWidth" rather than "  PetalWidth".
iris = pd.read_csv("20230310.csv", skipinitialspace=True)
iris.columns = iris.columns.str.strip()
print("csv data:", iris)
print("iris.assign:",iris.assign(sepal_ratio=iris["SepalWidth"] + 10).head() )
print("iris cloumns:", iris.columns)
print("csv data:[PetalWidth]", iris["PetalWidth"])
print("csv data:[PetalLength]", iris["PetalLength"])


# A callable passed to assign() receives the DataFrame and returns the new column.
print("PetalRatio:",iris.assign(PetalRatio=lambda x: x.PetalWidth / x.PetalLength).head())
"""
(
    iris.query("SepalLength > 5")
.assign(
        SepalRatio=lambda x: x.SepalWidth / x.SepalLength,
        PetalRatio=lambda x: x.PetalWidth / x.PetalLength,
).plot(kind="scatter", x="SepalRatio", y="PetalRatio")
)
"""



"""
The function signature for assign() is simply **kwargs. 
The keys are the column names for the new fields, 
and the values are either a value to be inserted (for example, a Series or NumPy array), 
or a function of one argument to be called on the DataFrame. 
A copy of the original DataFrame is returned, with the new values inserted.

The order of **kwargs is preserved. This allows for dependent assignment, 
where an expression later in **kwargs can refer to a column created earlier in the same assign().
"""
# assign() evaluates its keyword arguments in order, so the callable for "D"
# can already read the column "C" produced by the callable before it
# (C == A + B, hence D == A + A + B).
dfa = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
dfb = dfa.assign(
    C=lambda frame: frame["A"] + frame["B"],
    D=lambda frame: frame["A"] + frame["C"],
)
print("dfa:", dfa)
print("dfb:", dfb)

# ---------------------------- Indexing / selection
"""
Operation                           Syntax                  Result
select column :                     df[col]                   Series
select row by label :               df.loc[label]             Series
Select row by integer location:     df.iloc[loc]              Series
Slice rows:                         df[5:10]                  DataFrame
Select rows by boolean vector:      df[bool_vec]              DataFrame
"""
# Row selection, for example, returns a Series whose index is the columns of the DataFrame:
print("df30:", df1)
# Select one row by its index label ("b").
print("df31:", df1.loc["b"])

# Select one row by integer location (2 is the third row).
print("df32:", df1.iloc[2])


# --------------------------------- Data alignment and arithmetic

"""
Data alignment between DataFrame objects automatically align on both the columns and the index (row labels).
Again, the resulting object will have the union of the column and row labels.
"""
# Two random frames of different shape: df is 10x4 (columns A-D) and df2 is
# 7x3 (columns A-C). Note that this rebinds df2.
df = pd.DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"])
df2 = pd.DataFrame(np.random.randn(7, 3), columns=["A", "B", "C"])
print("df40:", df)
print("df41:", df2)
# Addition aligns on both row and column labels; cells that exist in only one
# of the two frames (column D, rows 7-9) come out as NaN.
print("df42:", df + df2)
# Subtracting a single row (a Series) subtracts it from every row of df,
# matching on column labels; row 0 therefore becomes all zeros.
print("df43:", df - df.iloc[0])

# Arithmetic with scalars is applied element-wise.
print("df44:", df * 5 + 2)
print("df45:", 1 / df)
print("df46:", df ** 4)


# Boolean frames support the logical operators element-wise.
# Note: df1 and df2 are rebound here, replacing the frames used above.
df1 = pd.DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}, dtype=bool)
df2 = pd.DataFrame({"a": [0, 1, 1], "b": [1, 1, 0]}, dtype=bool)
print("df1:", df1)
print("df2:", df2)
print("df47:", df1 & df2)  # logical and
print("df48:", df1 | df2)  # logical or
print("df49:", df1 ^ df2)  # logical xor
print("df50:", -df1)  # unary minus on a boolean frame: logical negation


# Transpose: .T swaps rows and columns. Only the first five rows of df are
# transposed here, so the result has five columns labelled 0..4.
print("df", df)
print("df[:5]", df[:5].T)
View Code

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章