https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#basics-dataframe

Python pandas 模块，Series, DataFrame 学习笔记

python pandas 笔记1

包含头文件

#!/usr/bin/env python

import numpy as np
import pandas as pd

Series

"""
Series

Series is a one-dimensional labeled array capable of holding any data type
(integers, strings, floating point numbers, Python objects, etc.).

The axis labels are collectively referred to as the index.
The basic method to create a Series is to call:
s = pd.Series(data, index=index)
"""


# from ndarray
s1 = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])
print("pd s1:\n", s1)
print("pd s1.index:", s1.index)
print("pd s1.values:", s1.values)

pd s1:
 a   -0.261995
b    0.119171
c   -0.129191
d   -1.385260
e   -0.087495
dtype: float64
pd s1.index: Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
pd s1.values: [-0.26199524  0.11917108 -0.12919125 -1.38525982 -0.08749467]

# from dict
d = {"b": 1, "a": 0, "c": 2}
s2 = pd.Series(d)
print("pd s2:\n", s2)

s3 = pd.Series(d, index=["b", "c", "d", "a"])
print("pd s3:\n", s3)

pd s2:
 b    1
a    0
c    2
dtype: int64
pd s3:
 b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64

# from scalar value
s4 = pd.Series(5.0, index=["a", "b", "c", "d", "e"])
print("pd s4:\n", s4)

pd s4:
 a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64


# Series is ndarray-like，可以像数组一样访问 Series 里面的数据
print("pd s3.array:\n", s3.array[1])
# Series is dict-like，可以像字典一样通过索引标签访问 Series 里面的数据
print("pd s3.['b']:\n", s3['b'])
print("test s3 key:", "b" in s3)
print("test s3 key:", "f" in s3)

pd s3.array:
 2.0
pd s3.['b']:
 1.0
test s3 key: True
test s3 key: False


# Using the Series.get() method, a missing label will return None or the specified default:
print("Series.get() method:", s3.get("name"))
print("Series.get() method:", s3.get("name", np.nan))
print("Series.get() method:", s3.get("name", "henry"))

Series.get() method: None
Series.get() method: nan
Series.get() method: henry

# Series also has a name attribute:
s5 = pd.Series(np.random.randn(5), name="henry")
print("pd s5:\n", s5)

s6 = s5.rename("henry2")
print("pd s6:\n", s6)
print("pd s6.head():\n", s6.head())
print("pd s6.head(2):\n", s6.head(2))

pd s5:
 0   -0.476002
1    0.248520
2    1.094846
3    0.505171
4   -0.176442
Name: henry, dtype: float64
pd s6:
 0   -0.476002
1    0.248520
2    1.094846
3    0.505171
4   -0.176442
Name: henry2, dtype: float64
pd s6.head():
 0   -0.476002
1    0.248520
2    1.094846
3    0.505171
4   -0.176442
Name: henry2, dtype: float64
pd s6.head(2):
 0   -0.476002
1    0.248520
Name: henry2, dtype: float64

DataFrame

"""
DataFrame

DataFrame is a 2-dimensional labeled data structure with columns of potentially different types.
You can think of it like a spreadsheet or SQL table,
or a dict of Series objects.

It is generally the most commonly used pandas object.
Like Series, DataFrame accepts many different kinds of input:

Dict of 1D ndarrays, lists, dicts, or Series
2-D numpy.ndarray
Structured or record ndarray
A Series
Another DataFrame

Along with the data, you can optionally pass index (row labels) and columns (column labels) arguments.
If you pass an index and / or columns,
you are guaranteeing the index and / or columns of the resulting DataFrame.
Thus, a dict of Series plus a specific index will discard all data
not matching up to the passed index.

If axis labels are not passed,
they will be constructed from the input data based on common sense rules.

"""


# from dict of Series or dicts
d2 = {
    "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    "two": pd.Series([10.0, 20.0, 30.0, 40.0], index=["a", "b", "c", "d"])
}
df1 = pd.DataFrame(d2)
print("DataFrame df1:", df1)

df2 = pd.DataFrame(d2, index=["d", "b", "a"])
print("DataFrame df2:", df2)
df2.rename(columns={'two': 'symbol'}, inplace=True)
print("DataFrame df2.rename:", df2)

df3 = pd.DataFrame(d2, index=["d", "b", "a"], columns=["two", "three"])
print("DataFrame df3:", df3)

DataFrame df1:    one   two
a  1.0  10.0
b  2.0  20.0
c  3.0  30.0
d  NaN  40.0
DataFrame df2:    one   two
d  NaN  40.0
b  2.0  20.0
a  1.0  10.0
DataFrame df2.rename:    one  symbol
d  NaN    40.0
b  2.0    20.0
a  1.0    10.0
DataFrame df3:     two three
d  40.0   NaN
b  20.0   NaN
a  10.0   NaN


# The row and column labels can be accessed respectively by accessing the index and columns attributes:
#  索引名称
print("df3.index:", df3.index)
# 列 名称
print("df3.columns:", df3.columns)

df3.index: Index(['d', 'b', 'a'], dtype='object')
df3.columns: Index(['two', 'three'], dtype='object')


# from dict of ndarrays / lists
d = {
    "one":[1.0, 2.0, 3.0, 4.0],
    "two":[4.0, 3.0, 2.0, 1.0]
}
df4 = pd.DataFrame(d)
print("DataFrame df4:", df4)

df5 = pd.DataFrame(d, index=["a", "b", "c", "d"])
print("DataFrame df5:", df5)

DataFrame df4:    one  two
0  1.0  4.0
1  2.0  3.0
2  3.0  2.0
3  4.0  1.0
DataFrame df5:    one  two
a  1.0  4.0
b  2.0  3.0
c  3.0  2.0
d  4.0  1.0


# From a structured (record) array.
# Each record has three fields: A = 4-byte int, B = 4-byte float,
# C = fixed-width byte string of length 10.  "S10" is used instead of the
# legacy "a10" spelling because NumPy 2.0 deprecates the "a" alias.
data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "S10")])
print("DataFrame data1:", data)
# Fill both records in place; the str values are stored as bytes in field C.
data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
print("DataFrame data2:", data)
# Field names become the column names.
df6 = pd.DataFrame(data)
print("DataFrame df6:", df6)

# Same data with explicit row labels.
df7 = pd.DataFrame(data, index=["first", "second"])
print("DataFrame df7:", df7)

# Same data with the columns reordered.
df8 = pd.DataFrame(data, columns=["C", "A", "B"])
print("DataFrame df8:", df8)

DataFrame data1: [(0, 0., b'') (0, 0., b'')]
DataFrame data2: [(1, 2., b'Hello') (2, 3., b'World')]
DataFrame df6:    A    B         C
0  1  2.0  b'Hello'
1  2  3.0  b'World'
DataFrame df7:         A    B         C
first   1  2.0  b'Hello'
second  2  3.0  b'World'
DataFrame df8:           C  A    B
0  b'Hello'  1  2.0
1  b'World'  2  3.0


# from a list of dicts
data2 = [
    {"a":1, "b":2},
    {"a":5,"b":10,"c":20}
]
df9 = pd.DataFrame(data2)
print("DataFrame df9:", df9)


print("DataFrame df10:", pd.DataFrame(data2, index=["first", "second"]))

# 只获取columns 列出的那几列数据
print("DataFrame df11:", pd.DataFrame(data2, columns=["a", "b"]))

DataFrame df9:    a   b     c
0  1   2   NaN
1  5  10  20.0
DataFrame df10:         a   b     c
first   1   2   NaN
second  5  10  20.0
DataFrame df11:    a   b
0  1   2
1  5  10


# from a dict of tuples
df12 = pd.DataFrame(
    {
        ("a","b"):{("A", "B"):1, ("A", "C"):2},
        ("a", "a"): {("A", "C"): 3, ("A", "B"): 4},
        ("a", "c"): {("A", "B"): 5, ("A", "C"): 6},
        ("b", "a"): {("A", "C"): 7, ("A", "B"): 7},
        ("b", "b"): {("A", "D"): 9, ("A", "B"): 10}
    })
print("DataFrame df12:", df12)

DataFrame df12:        a              b      
       b    a    c    a     b
A B  1.0  4.0  5.0  7.0  10.0
  C  2.0  3.0  6.0  7.0   NaN
  D  NaN  NaN  NaN  NaN   9.0


# from a Series
ser = pd.Series(range(3), index=list("abc"), name="ser")
print("ser:", ser)
print("DataFrame df13:", pd.DataFrame(ser))
# Series 里面定义的name，就是DataFrame里面的列 名称
print("DataFrame df14:", pd.DataFrame(ser, columns=["ser"]))
print("DataFrame df15:", pd.DataFrame(ser, columns=["ser", "name2"]))

ser: a    0
b    1
c    2
Name: ser, dtype: int64
DataFrame df13:    ser
a    0
b    1
c    2
DataFrame df14:    ser
a    0
b    1
c    2
DataFrame df15:    ser name2
a    0   NaN
b    1   NaN
c    2   NaN


# from a list of namedtuples
from collections import  namedtuple
Point = namedtuple("Point", "x y")
print("DataFrame df16:", pd.DataFrame([Point(0,0), Point(0,3), (2,3)]))
Point3D = namedtuple("Point3D", "x y z")
print("DataFrame df17:", pd.DataFrame([Point3D(0,0,0), Point3D(0,3,5), Point(2,3)]))

DataFrame df16:    x  y
0  0  0
1  0  3
2  2  3
DataFrame df17:    x  y    z
0  0  0  0.0
1  0  3  5.0
2  2  3  NaN


# from a list of dataclasses
from dataclasses import make_dataclass
Point = make_dataclass("Point", [("x", int), ("y", int)])
print("DataFrame df18:", pd.DataFrame([Point(0,0), Point(0,3), Point(2,3)]))

DataFrame df18:    x  y
0  0  0
1  0  3
2  2  3

Alternate constructors

"""
DataFrame.from_dict

DataFrame.from_dict() takes a dict of dicts or a dict of array-like sequences and returns a DataFrame.
It operates like the DataFrame constructor except for the orient parameter
which is 'columns' by default,
but which can be set to 'index' in order to use the dict keys as row labels.
"""


print("df19:",pd.DataFrame.from_dict(dict([("A", [1, 2, 3]), ("B", [4, 5, 6])])))

# orient="index"：把字典的 key 当作行标签（索引）而不是列名，相当于对默认结果做了转置
# If you pass orient='index', the keys will be the row labels.
# In this case, you can also pass the desired column names:
print("df20:",pd.DataFrame.from_dict(
    dict([("A", [1, 2, 3]), ("B", [4, 5, 6])]),
    orient="index",
    columns=["one", "two", "three"],))

print("df21:",pd.DataFrame.from_dict(
    dict([("A", [1, 2, 3]), ("B", [4, 5, 6])]),
    orient="index"))

df19:    A  B
0  1  4
1  2  5
2  3  6
df20:    one  two  three
A    1    2      3
B    4    5      6
df21:    0  1  2
A  1  2  3
B  4  5  6

DataFrame.from_records

DataFrame.from_records() takes a list of tuples or an ndarray with structured dtype.
It works analogously to the normal DataFrame constructor, except that the resulting DataFrame index may be a specific field of the structured dtype.

【暂时不理解】

Column selection, addition, deletion

"""
You can treat a DataFrame semantically like a dict of like-indexed Series objects.
Getting, setting, and deleting columns works with the same syntax as the analogous dict operations:

"""


# 访问df 的某列，df的某列就是一个 Series
print("df1", df1)
print("df22", df1["one"])

df1["three"] = df1["one"] * df1["two"]
# 判断df1["one"]里面每个元素是否 大于2，结果是 一个Bool类型变量
df1["flag"] = df1["one"]>2
print("df23:", df1)

df1    one   two
a  1.0  10.0
b  2.0  20.0
c  3.0  30.0
d  NaN  40.0
df22 a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64
df23:    one   two  three   flag
a  1.0  10.0   10.0  False
b  2.0  20.0   40.0  False
c  3.0  30.0   90.0   True
d  NaN  40.0    NaN  False

# Columns can be deleted or popped like with a dict:
del df1["two"]
three = df1.pop("three")
print("df three:", three)
print("df24:", df1)

df three: a    10.0
b    40.0
c    90.0
d     NaN
Name: three, dtype: float64
df24:    one   flag
a  1.0  False
b  2.0  False
c  3.0   True
d  NaN  False

# when inserting a scalar value, it will naturally be propagated to fill the column.
df1["foo"] = "bar"
print("df25:", df1)

df25:    one   flag  foo
a  1.0  False  bar
b  2.0  False  bar
c  3.0   True  bar
d  NaN  False  bar


# when inserting a Series that does not have the same index as the DataFrame,it will be conformed to the DataFrame's index
## slicing 切片知识补充
tag = "henry, hello slicing!"
print("tag[:]", tag[:])
# 切片区间是左闭右开 [start, stop)：包含 start，不包含 stop
print("tag[1:2]", tag[1:2])
print("tag[0:2]", tag[0:2])
print("tag[:2]", tag[:2])

print("Series [:]", df1["one"][:])
print("Series [:2]", df1["one"][:2])
df1["one_trunc"] = df1["one"][:2]
print("DataFrame df25:", df1)

tag[:] henry, hello slicing!
tag[1:2] e
tag[0:2] he
tag[:2] he
Series [:] a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64
Series [:2] a    1.0
b    2.0
Name: one, dtype: float64
DataFrame df25:    one   flag  foo  one_trunc
a  1.0  False  bar        1.0
b  2.0  False  bar        2.0
c  3.0   True  bar        NaN
d  NaN  False  bar        NaN

# 指定位置插入一列
# You can insert raw ndarrays but their length must match the length of the DataFrame’s index.
# By default, columns get inserted at the end. DataFrame.insert() inserts at a particular location in the column
df1.insert(1,"insert_bar", df1["one"])
print("DataFrame df26:",df1)

DataFrame df26:    one  insert_bar   flag  foo  one_trunc
a  1.0         1.0  False  bar        1.0
b  2.0         2.0  False  bar        2.0
c  3.0         3.0   True  bar        NaN
d  NaN         NaN  False  bar        NaN


# 通过 assign() 方法，从已有的列中 创造一个新的列
print("df1.head()", df1.head())
print("df1['one'].head()", df1["one"].head())
## assign  方法创造了新的列，但是不会改变之前的df数据，新的列是在 返回的数据里面
df2 = df1.assign(new_col=df1["one"]/df1["one_trunc"])
print("DataFrame df27:",df1) # df1 还是之前的结构，没有改变
print("DataFrame df28:",df2) # df2 才是改变后的结构

df1.head()    one  insert_bar   flag  foo  one_trunc
a  1.0         1.0  False  bar        1.0
b  2.0         2.0  False  bar        2.0
c  3.0         3.0   True  bar        NaN
d  NaN         NaN  False  bar        NaN
df1['one'].head() a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64
DataFrame df27:    one  insert_bar   flag  foo  one_trunc
a  1.0         1.0  False  bar        1.0
b  2.0         2.0  False  bar        2.0
c  3.0         3.0   True  bar        NaN
d  NaN         NaN  False  bar        NaN
DataFrame df28:    one  insert_bar   flag  foo  one_trunc  new_col
a  1.0         1.0  False  bar        1.0      1.0
b  2.0         2.0  False  bar        2.0      1.0
c  3.0         3.0   True  bar        NaN      NaN
d  NaN         NaN  False  bar        NaN      NaN


#  通过函数的方式来创建新的列
# In the example above, we inserted a precomputed value.
# We can also pass in a function of one argument to be evaluated on the DataFrame being assigned to.
df3 = df1.assign(func_col=lambda x:(x["one"]+10))
print("DataFrame df29:", df3)

DataFrame df29:    one  insert_bar   flag  foo  one_trunc  func_col
a  1.0         1.0  False  bar        1.0      11.0
b  2.0         2.0  False  bar        2.0      12.0
c  3.0         3.0   True  bar        NaN      13.0
d  NaN         NaN  False  bar        NaN       NaN

20230310.csv

SepalLength,SepalWidth,PetalLength,PetalWidth,Name
 5.1 ,  3.5 ,         1.4 ,        0.2  ,Iris-setosa
  4.9 , 3.0  ,        1.4  ,       0.2 , Iris-setosa
4.7,3.2,1.3 ,0.2,Iris-setosa
4.6 ,        3.1  ,        1.5 ,        0.2  ,Iris-setosa
 5.0  ,       3.6  ,        1.4  ,       0.2,  Iris-setosa


# assign() always returns a copy of the data, leaving the original DataFrame untouched.
# assign 始终返回数据的拷贝，不会改变原来的 DataFrame
iris = pd.read_csv("20230310.csv")
print("csv data:", iris)
print("iris.assign:",iris.assign(sepal_ratio=iris["SepalWidth"] + 10).head() )
print("iris cloumns:", iris.columns)
## 注意：如果 csv 表头的逗号后面带有空格，读到的列名也会带空格（例如 '  PetalWidth'），
## 这时 iris["PetalWidth"] 会抛 KeyError；可以用 pd.read_csv(..., skipinitialspace=True) 去掉这些空格。
## 当前的 csv 表头没有空格，所以下面打印的列名是干净的。
print("csv data:[PetalWidth]", iris["PetalWidth"])
print("csv data:[PetalLength]", iris["PetalLength"])


print("PetalRatio:",iris.assign(PetalRatio=lambda x: x.PetalWidth / x.PetalLength).head())

csv data:    SepalLength  SepalWidth  PetalLength  PetalWidth           Name
0          5.1         3.5          1.4         0.2    Iris-setosa
1          4.9         3.0          1.4         0.2    Iris-setosa
2          4.7         3.2          1.3         0.2    Iris-setosa
3          4.6         3.1          1.5         0.2    Iris-setosa
4          5.0         3.6          1.4         0.2    Iris-setosa
iris.assign:    SepalLength  SepalWidth  PetalLength  PetalWidth           Name  sepal_ratio
0          5.1         3.5          1.4         0.2    Iris-setosa         13.5
1          4.9         3.0          1.4         0.2    Iris-setosa         13.0
2          4.7         3.2          1.3         0.2    Iris-setosa         13.2
3          4.6         3.1          1.5         0.2    Iris-setosa         13.1
4          5.0         3.6          1.4         0.2    Iris-setosa         13.6
iris cloumns: Index(['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Name'], dtype='object')
csv data:[PetalWidth] 0    0.2
1    0.2
2    0.2
3    0.2
4    0.2
Name: PetalWidth, dtype: float64
csv data:[PetalLength] 0    1.4
1    1.4
2    1.3
3    1.5
4    1.4
Name: PetalLength, dtype: float64
PetalRatio:    SepalLength  SepalWidth  PetalLength  PetalWidth           Name  PetalRatio
0          5.1         3.5          1.4         0.2    Iris-setosa    0.142857
1          4.9         3.0          1.4         0.2    Iris-setosa    0.142857
2          4.7         3.2          1.3         0.2    Iris-setosa    0.153846
3          4.6         3.1          1.5         0.2    Iris-setosa    0.133333
4          5.0         3.6          1.4         0.2    Iris-setosa    0.142857

"""
(
iris.query("SepalLength > 5")
.assign(
SepalRatio=lambda x: x.SepalWidth / x.SepalLength,
PetalRatio=lambda x: x.PetalWidth / x.PetalLength,
).plot(kind="scatter", x="SepalRatio", y="PetalRatio")
)
"""

"""
The function signature for assign() is simply **kwargs.
The keys are the column names for the new fields,
and the values are either a value to be inserted (for example, a Series or NumPy array),
or a function of one argument to be called on the DataFrame.
A copy of the original DataFrame is returned, with the new values inserted.

The order of **kwargs is preserved. This allows for dependent assignment,
where an expression later in **kwargs can refer to a column created earlier in the same assign().
"""

# In the second expression, x['C'] will refer to the newly created column, that’s equal to dfa['A'] + dfa['B'].
dfa = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
dfb = dfa.assign(C=lambda x: x["A"] + x["B"], D=lambda x: x["A"] + x["C"])
print("dfa:", dfa)
print("dfb:", dfb)

dfa:    A  B
0  1  4
1  2  5
2  3  6
dfb:    A  B  C   D
0  1  4  5   6
1  2  5  7   9
2  3  6  9  12

Indexing / selection

Operation                           Syntax                  Result
select column :                     df[col]                   Series
select row by label :               df.loc[label]             Series
Select row by integer location:     df.iloc[loc]              Series
Slice rows:                         df[5:10]                  DataFrame
Select rows by boolean vector:      df[bool_vec]              DataFrame

# Row selection, for example, returns a Series whose index is the columns of the DataFrame:
print("df30:", df1)
## 选出 某一行（b 是一个索引值，选出这个索引的行）Select row by label
print("df31:", df1.loc["b"])

df30:    one  insert_bar   flag  foo  one_trunc
a  1.0         1.0  False  bar        1.0
b  2.0         2.0  False  bar        2.0
c  3.0         3.0   True  bar        NaN
d  NaN         NaN  False  bar        NaN
df31: one               2
insert_bar        2
flag          False
foo             bar
one_trunc         2
Name: b, dtype: object

# Select row by integer location
print("df32:", df1.iloc[2])

df32: one              3
insert_bar       3
flag          True
foo            bar
one_trunc      NaN
Name: c, dtype: object

Data alignment and arithmetic

"""
Data alignment between DataFrame objects automatically align on both the columns and the index (row labels).
Again, the resulting object will have the union of the column and row labels.
"""


df = pd.DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"])
df2 = pd.DataFrame(np.random.randn(7, 3), columns=["A", "B", "C"])
print("df40:", df)
print("df41:", df2)
print("df42:", df + df2)
print("df43:", df - df.iloc[0])

print("df44:", df * 5 + 2)
print("df45:", 1 / df)
print("df46:", df ** 4)

df40:           A         B         C         D
0 -1.084084  0.183785 -1.153985  0.055283
1  0.253552  0.077291 -0.303460  0.701300
2  0.821357 -1.116865  0.610512 -1.327411
3 -0.251630 -0.341660  0.730774 -0.584376
4 -1.215528 -0.691270 -0.552341  0.278375
5 -0.009694 -0.963907  1.593190 -0.991862
6  0.377235 -1.089611 -0.515518 -0.252789
7 -1.158782 -1.405582 -0.189190 -1.787201
8  1.469154  0.924453 -0.305886 -0.527754
9 -0.416995  1.469462 -1.107226  0.941600
df41:           A         B         C
0 -0.184298  1.094119 -0.623001
1 -0.531990 -0.025734 -0.948708
2  0.877716 -1.547748 -0.753285
3 -0.248297 -1.370722  1.646786
4  0.958594 -0.373161  1.166930
5 -0.626382  1.731893  0.521530
6 -0.008678  0.955742  0.463842
df42:           A         B         C   D
0 -1.268382  1.277904 -1.776987 NaN
1 -0.278439  0.051558 -1.252168 NaN
2  1.699073 -2.664612 -0.142773 NaN
3 -0.499928 -1.712383  2.377560 NaN
4 -0.256934 -1.064431  0.614589 NaN
5 -0.636076  0.767986  2.114720 NaN
6  0.368556 -0.133869 -0.051676 NaN
7       NaN       NaN       NaN NaN
8       NaN       NaN       NaN NaN
9       NaN       NaN       NaN NaN
df43:           A         B         C         D
0  0.000000  0.000000  0.000000  0.000000
1  1.337636 -0.106494  0.850525  0.646017
2  1.905441 -1.300650  1.764497 -1.382694
3  0.832454 -0.525445  1.884759 -0.639659
4 -0.131444 -0.875055  0.601645  0.223092
5  1.074390 -1.147693  2.747175 -1.047145
6  1.461319 -1.273396  0.638467 -0.308072
7 -0.074698 -1.589367  0.964795 -1.842484
8  2.553238  0.740668  0.848100 -0.583037
9  0.667089  1.285677  0.046759  0.886317
df44:           A         B         C         D
0 -3.420421  2.918925 -3.769926  2.276415
1  3.267758  2.386457  0.482700  5.506499
2  6.106784 -3.584324  5.052559 -4.637055
3  0.741848  0.291698  5.653871 -0.921882
4 -4.077641 -1.456350 -0.761703  3.391873
5  1.951529 -2.819537  9.965949 -2.959312
6  3.886173 -3.448053 -0.577592  0.736056
7 -3.793910 -5.027910  1.054048 -6.936004
8  9.345768  6.622265  0.470572 -0.638769
9 -0.084974  9.347311 -3.536131  6.708002
df45:             A          B         C          D
0   -0.922438   5.441138 -0.866562  18.088755
1    3.943970  12.938049 -3.295328   1.425924
2    1.217498  -0.895364  1.637970  -0.753346
3   -3.974082  -2.926883  1.368412  -1.711226
4   -0.822688  -1.446613 -1.810477   3.592281
5 -103.155091  -1.037444  0.627672  -1.008204
6    2.650870  -0.917759 -1.939795  -3.955872
7   -0.862975  -0.711449 -5.285678  -0.559534
8    0.680664   1.081721 -3.269195  -1.894823
9   -2.398111   0.680521 -0.903158   1.062022
df46:               A         B         C          D
0  1.381186e+00  0.001141  1.773377   0.000009
1  4.133002e-03  0.000036  0.008480   0.241888
2  4.551214e-01  1.555974  0.138924   3.104715
3  4.009152e-03  0.013626  0.285189   0.116619
4  2.183033e+00  0.228345  0.093074   0.006005
5  8.831558e-09  0.863260  6.442732   0.967845
6  2.025099e-02  1.409565  0.070628   0.004083
7  1.803047e+00  3.903235  0.001281  10.202189
8  4.658744e+00  0.730364  0.008755   0.077576
9  3.023589e-02  4.662659  1.502954   0.786080


df1 = pd.DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}, dtype=bool)
df2 = pd.DataFrame({"a": [0, 1, 1], "b": [1, 1, 0]}, dtype=bool)
print("df1:", df1)
print("df2:", df2)
print("df47:", df1 & df2)
print("df48:", df1 | df2)
print("df49:", df1 ^ df2)
print("df50:", -df1)

df1:        a      b
0   True  False
1  False   True
2   True   True
df2:        a      b
0  False   True
1   True   True
2   True  False
df47:        a      b
0  False  False
1  False   True
2   True  False
df48:       a     b
0  True  True
1  True  True
2  True  True
df49:        a      b
0   True   True
1   True  False
2  False   True
df50:        a      b
0  False   True
1   True  False
2  False  False

## 转置 DataFrame（行列互换），.T 等价于 transpose()
print("df", df)
print("df[:5]", df[:5].T)


df           A         B         C         D
0 -1.084084  0.183785 -1.153985  0.055283
1  0.253552  0.077291 -0.303460  0.701300
2  0.821357 -1.116865  0.610512 -1.327411
3 -0.251630 -0.341660  0.730774 -0.584376
4 -1.215528 -0.691270 -0.552341  0.278375
5 -0.009694 -0.963907  1.593190 -0.991862
6  0.377235 -1.089611 -0.515518 -0.252789
7 -1.158782 -1.405582 -0.189190 -1.787201
8  1.469154  0.924453 -0.305886 -0.527754
9 -0.416995  1.469462 -1.107226  0.941600
df[:5]           0         1         2         3         4
A -1.084084  0.253552  0.821357 -0.251630 -1.215528
B  0.183785  0.077291 -1.116865 -0.341660 -0.691270
C -1.153985 -0.303460  0.610512  0.730774 -0.552341
D  0.055283  0.701300 -1.327411 -0.584376  0.278375

完整的脚本

#!/usr/bin/env python

import numpy as np
import pandas as pd

# ------------------------ Series

"""
Series

Series is a one-dimensional labeled array capable of holding any data type 
(integers, strings, floating point numbers, Python objects, etc.). 

The axis labels are collectively referred to as the index. 
The basic method to create a Series is to call:
s = pd.Series(data, index=index)
"""
# --- Series from an ndarray, with explicit labels ---
_labels = list("abcde")
s1 = pd.Series(data=np.random.randn(5), index=_labels)
print("pd s1:\n", s1)
print("pd s1.index:", s1.index)
print("pd s1.values:", s1.values)

# --- Series from a dict: the keys become the index ---
d = dict(b=1, a=0, c=2)
s2 = pd.Series(d)
print("pd s2:\n", s2)

# An explicit index selects and reorders the entries; the label "d" is not a
# key of the dict, so its value is NaN.
s3 = pd.Series(d, index=list("bcda"))
print("pd s3:\n", s3)

# --- Series from a scalar: the value is repeated for every label ---
s4 = pd.Series(5.0, index=_labels)
print("pd s4:\n", s4)

# ndarray-like access: element at position 1 of the backing array
# (the entry labelled "c").
print("pd s3.array:\n", s3.array[1])
# dict-like access: look a value up by its label, and test label membership.
print("pd s3.['b']:\n", s3["b"])
for _key in ("b", "f"):
    print("test s3 key:", _key in s3)

# Series.get() returns the default (None unless given) for a missing label
# instead of raising.
for _default in (None, np.nan, "henry"):
    print("Series.get() method:", s3.get("name", _default))

# A Series also carries a name attribute.
s5 = pd.Series(np.random.randn(5), name="henry")
print("pd s5:\n", s5)

# rename() gives back a Series carrying the new name.
s6 = s5.rename("henry2")
print("pd s6:\n", s6)
# head() shows the leading rows; head(2) limits it to two.
print("pd s6.head():\n", s6.head())
print("pd s6.head(2):\n", s6.head(2))

# ------------------------ DataFrame
"""
DataFrame

DataFrame is a 2-dimensional labeled data structure with columns of potentially different types. 
You can think of it like a spreadsheet or SQL table, 
or a dict of Series objects. 

It is generally the most commonly used pandas object. 
Like Series, DataFrame accepts many different kinds of input:

  Dict of 1D ndarrays, lists, dicts, or Series
  2-D numpy.ndarray
  Structured or record ndarray
  A Series
  Another DataFrame

Along with the data, you can optionally pass index (row labels) and columns (column labels) arguments. 
If you pass an index and / or columns, 
you are guaranteeing the index and / or columns of the resulting DataFrame. 
Thus, a dict of Series plus a specific index will discard all data 
not matching up to the passed index.

If axis labels are not passed, 
they will be constructed from the input data based on common sense rules.

"""

# --- DataFrame from a dict of Series ---
# The two Series have different indexes; the resulting frame uses the union
# of their labels and fills the gaps with NaN.
d2 = dict(
    one=pd.Series([1.0, 2.0, 3.0], index=list("abc")),
    two=pd.Series([10.0, 20.0, 30.0, 40.0], index=list("abcd")),
)
df1 = pd.DataFrame(data=d2)
print("DataFrame df1:", df1)

# Passing an index keeps only those rows, in the given order.
_row_order = ["d", "b", "a"]
df2 = pd.DataFrame(data=d2, index=_row_order)
print("DataFrame df2:", df2)
# Rename a column in place.
df2.rename(columns={"two": "symbol"}, inplace=True)
print("DataFrame df2.rename:", df2)

# Passing columns selects columns as well; "three" does not exist in the
# input, so that column is all NaN.
df3 = pd.DataFrame(data=d2, index=_row_order, columns=["two", "three"])
print("DataFrame df3:", df3)

# Row labels and column labels are exposed through the index and columns
# attributes respectively.
print("df3.index:", df3.index)
print("df3.columns:", df3.columns)


# --- DataFrame from a dict of equal-length lists ---
_ascending = [1.0, 2.0, 3.0, 4.0]
d = {"one": _ascending, "two": _ascending[::-1]}

# Without an index the rows are numbered 0..n-1.
df4 = pd.DataFrame(data=d)
print("DataFrame df4:", df4)

# With an index the rows take the given labels.
df5 = pd.DataFrame(data=d, index=list("abcd"))
print("DataFrame df5:", df5)

# From a structured (record) array.
# Each record has three fields: A = 4-byte int, B = 4-byte float,
# C = fixed-width byte string of length 10.  "S10" is used instead of the
# legacy "a10" spelling because NumPy 2.0 deprecates the "a" alias.
data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "S10")])
print("DataFrame data1:", data)
# Fill both records in place; the str values are stored as bytes in field C.
data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
print("DataFrame data2:", data)
# Field names become the column names.
df6 = pd.DataFrame(data)
print("DataFrame df6:", df6)

# Same data with explicit row labels.
df7 = pd.DataFrame(data, index=["first", "second"])
print("DataFrame df7:", df7)

# Same data with the columns reordered.
df8 = pd.DataFrame(data, columns=["C", "A", "B"])
print("DataFrame df8:", df8)


# --- DataFrame from a list of dicts: one dict per row ---
# The first record has no "c" key, so that cell is NaN.
data2 = [
    dict(a=1, b=2),
    dict(a=5, b=10, c=20),
]
df9 = pd.DataFrame(data=data2)
print("DataFrame df9:", df9)


# Same records with explicit row labels.
print("DataFrame df10:", pd.DataFrame(data=data2, index=["first", "second"]))

# Keep only the columns that are listed.
print("DataFrame df11:", pd.DataFrame(data=data2, columns=["a", "b"]))

# --- DataFrame from a dict keyed by tuples ---
# Tuple keys of the outer dict become two-level column labels; tuple keys of
# the inner dicts become two-level row labels.  Missing combinations are NaN.
_row_b, _row_c, _row_d = ("A", "B"), ("A", "C"), ("A", "D")
_nested = {
    ("a", "b"): {_row_b: 1, _row_c: 2},
    ("a", "a"): {_row_c: 3, _row_b: 4},
    ("a", "c"): {_row_b: 5, _row_c: 6},
    ("b", "a"): {_row_c: 7, _row_b: 7},
    ("b", "b"): {_row_d: 9, _row_b: 10},
}
df12 = pd.DataFrame(_nested)
print("DataFrame df12:", df12)

# --- DataFrame from a single Series ---
ser = pd.Series(data=range(3), index=["a", "b", "c"], name="ser")
print("ser:", ser)
# The frame keeps the Series' index; its only column is named after the Series.
print("DataFrame df13:", pd.DataFrame(data=ser))
# The Series name is the column name, so selecting "ser" returns the same data.
print("DataFrame df14:", pd.DataFrame(data=ser, columns=["ser"]))
# A requested column that does not match the Series name ("name2") is all NaN.
print("DataFrame df15:", pd.DataFrame(data=ser, columns=["ser", "name2"]))


# --- DataFrame from a list of namedtuples ---
from collections import namedtuple
Point = namedtuple("Point", ["x", "y"])
# A plain tuple may be mixed in; the columns are named after the fields of
# the first namedtuple.
_points_2d = [Point(0, 0), Point(0, 3), (2, 3)]
print("DataFrame df16:", pd.DataFrame(_points_2d))
Point3D = namedtuple("Point3D", ["x", "y", "z"])
# The last row is shorter than the first one, so its "z" cell is NaN.
_points_mixed = [Point3D(0, 0, 0), Point3D(0, 3, 5), Point(2, 3)]
print("DataFrame df17:", pd.DataFrame(_points_mixed))

# --- DataFrame from a list of dataclass instances ---
from dataclasses import make_dataclass
# NOTE: this rebinds Point from the namedtuple above to a dataclass.
Point = make_dataclass("Point", [("x", int), ("y", int)])
_coords = ((0, 0), (0, 3), (2, 3))
print("DataFrame df18:", pd.DataFrame([Point(x, y) for x, y in _coords]))


#--------------------------------------- Alternate constructors
"""
DataFrame.from_dict

DataFrame.from_dict() takes a dict of dicts or a dict of array-like sequences and returns a DataFrame. 
It operates like the DataFrame constructor except for the orient parameter 
which is 'columns' by default, 
but which can be set to 'index' in order to use the dict keys as row labels.
"""
# Shared input for the three from_dict() calls: each key maps to one sequence
# of values.  A dict literal replaces the repeated dict([(k, v), ...]) calls.
_columns_data = {"A": [1, 2, 3], "B": [4, 5, 6]}

# Default orient="columns": the dict keys become the column labels.
df19 = pd.DataFrame.from_dict(_columns_data)
print("df19:", df19)

# orient="index": the dict keys become the row labels instead of the column
# labels, i.e. the result is the transpose of the default layout.
# In this case the desired column names can be passed as well:
df20 = pd.DataFrame.from_dict(
    _columns_data,
    orient="index",
    columns=["one", "two", "three"],
)
print("df20:", df20)

# Without explicit column names the columns are numbered 0..n-1.
df21 = pd.DataFrame.from_dict(_columns_data, orient="index")
print("df21:", df21)


"""
DataFrame.from_records

DataFrame.from_records() takes a list of tuples or an ndarray with structured dtype. 
It works analogously to the normal DataFrame constructor,
except that the resulting DataFrame index may be a specific field of the structured dtype.

"""


# ----------------- Column selection, addition, deletion
"""
You can treat a DataFrame semantically like a dict of like-indexed Series objects. 
Getting, setting, and deleting columns works with the same syntax as the analogous dict operations:

"""
# Selecting one column of a DataFrame yields a Series.
print("df1", df1)
print("df22", df1["one"])

# New columns can be computed from existing ones.
df1["three"] = df1["one"].mul(df1["two"])
# Element-wise comparison of "one" against 2: one boolean per row.
df1["flag"] = df1["one"].gt(2)
print("df23:", df1)


# Columns can be deleted or popped, as with a dict.
del df1["two"]
three = df1.pop("three")
print("df three:", three)
print("df24:", df1)

# Assigning a scalar broadcasts it to every row of the new column.
df1["foo"] = "bar"
print("df25:", df1)

# A Series whose index differs from the DataFrame's is conformed to the
# DataFrame's index when it is inserted.

# Refresher on slicing: the interval is half-open, [start, stop).
tag = "henry, hello slicing!"
for _label, _piece in (
    ("tag[:]", tag[:]),
    ("tag[1:2]", tag[1:2]),
    ("tag[0:2]", tag[0:2]),
    ("tag[:2]", tag[:2]),
):
    print(_label, _piece)

_one = df1["one"]
print("Series [:]", _one[:])
print("Series [:2]", _one[:2])
# Only the first two rows are supplied; the remaining rows become NaN.
df1["one_trunc"] = _one[:2]
print("DataFrame df25:", df1)


# Insert a column at a given position.
# Raw ndarrays can be inserted too, but their length must match the length of
# the DataFrame's index.  By default columns are appended at the end;
# DataFrame.insert() places the column at a particular location instead.
df1.insert(loc=1, column="insert_bar", value=df1["one"])
print("DataFrame df26:", df1)


# assign() derives a new column from existing ones.
print("df1.head()", df1.head())
print("df1['one'].head()", df1["one"].head())
# assign() does not modify df1; the new column exists only in the returned frame.
df2 = df1.assign(new_col=df1["one"].div(df1["one_trunc"]))
print("DataFrame df27:", df1)  # df1 still has its previous structure
print("DataFrame df28:", df2)  # df2 carries the added column

# The example above inserted a precomputed value.  assign() also accepts a
# function of one argument, which is evaluated on the DataFrame being
# assigned to.
df3 = df1.assign(func_col=lambda frame: frame["one"] + 10)
print("DataFrame df29:", df3)

# assign() always returns a copy of the data, leaving the original DataFrame
# untouched.
# skipinitialspace=True drops the blanks that follow a delimiter, so a header
# written as ",  PetalWidth" still yields the column name "PetalWidth" and
# the Name values carry no leading spaces.
iris = pd.read_csv("20230310.csv", skipinitialspace=True)
print("csv data:", iris)
# NOTE: despite its name, sepal_ratio here is simply SepalWidth + 10.
print("iris.assign:",iris.assign(sepal_ratio=iris["SepalWidth"] + 10).head() )
print("iris cloumns:", iris.columns)
# Column lookups below rely on the names being free of surrounding spaces;
# a name such as '  PetalWidth' would make iris["PetalWidth"] raise KeyError.
print("csv data:[PetalWidth]", iris["PetalWidth"])
print("csv data:[PetalLength]", iris["PetalLength"])


# A callable passed to assign() receives the frame and can use attribute
# access to reach its columns.
print("PetalRatio:",iris.assign(PetalRatio=lambda x: x.PetalWidth / x.PetalLength).head())
"""
(
    iris.query("SepalLength > 5")
.assign(
        SepalRatio=lambda x: x.SepalWidth / x.SepalLength,
PetalRatio=lambda x: x.PetalWidth / x.PetalLength,
).plot(kind="scatter", x="SepalRatio", y="PetalRatio")
)
"""



"""
The function signature for assign() is simply **kwargs. 
The keys are the column names for the new fields, 
and the values are either a value to be inserted (for example, a Series or NumPy array), 
or a function of one argument to be called on the DataFrame. 
A copy of the original DataFrame is returned, with the new values inserted.

The order of **kwargs is preserved. This allows for dependent assignment, 
where an expression later in **kwargs can refer to a column created earlier in the same assign().
"""
# Dependent assignment: keyword arguments are applied in order, so the
# function for D can already see column C created by the same assign() call.
def _sum_a_b(frame):
    return frame["A"] + frame["B"]


def _sum_a_c(frame):
    return frame["A"] + frame["C"]


dfa = pd.DataFrame(dict(A=[1, 2, 3], B=[4, 5, 6]))
dfb = dfa.assign(C=_sum_a_b, D=_sum_a_c)
print("dfa:", dfa)
print("dfb:", dfb)

# ---------------------------- Indexing / selection
"""
Operation                           Syntax                  Result
select column :                     df[col]                   Series
select row by label :               df.loc[label]             Series
Select row by integer location:     df.iloc[loc]              Series
Slice rows:                         df[5:10]                  DataFrame
Select rows by boolean vector:      df[bool_vec]              DataFrame
"""
# Selecting a row returns a Series whose index is the DataFrame's columns.
print("df30:", df1)
# Select a row by its index label ("b").
_row_by_label = df1.loc["b"]
print("df31:", _row_by_label)

# Select a row by integer position (position 2, i.e. the third row).
_row_by_position = df1.iloc[2]
print("df32:", _row_by_position)


# --------------------------------- Data alignment and arithmetic

"""
Data alignment between DataFrame objects automatically align on both the columns and the index (row labels).
Again, the resulting object will have the union of the column and row labels.
"""
# Two random frames of different shapes: 10x4 (A-D) and 7x3 (A-C).
df = pd.DataFrame(np.random.randn(10, 4), columns=list("ABCD"))
df2 = pd.DataFrame(np.random.randn(7, 3), columns=list("ABC"))
for _label, _frame in (
    ("df40:", df),
    ("df41:", df2),
    # Frame + frame aligns on both rows and columns; cells present in only
    # one operand come out as NaN.
    ("df42:", df + df2),
    # Frame - row: the first row is subtracted from every row.
    ("df43:", df - df.iloc[0]),
):
    print(_label, _frame)

# Arithmetic with scalars is applied element-wise.
for _label, _frame in (
    ("df44:", df * 5 + 2),
    ("df45:", 1 / df),
    ("df46:", df ** 4),
):
    print(_label, _frame)


# Boolean frames support the logical operators as well.
# NOTE: df1 and df2 are rebound here and no longer refer to the earlier frames.
df1 = pd.DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}, dtype="bool")
df2 = pd.DataFrame({"a": [0, 1, 1], "b": [1, 1, 0]}, dtype="bool")
print("df1:", df1)
print("df2:", df2)
for _label, _frame in (
    ("df47:", df1 & df2),
    ("df48:", df1 | df2),
    ("df49:", df1 ^ df2),
    ("df50:", -df1),
):
    print(_label, _frame)


# Transpose: rows and columns are swapped (shown for the first five rows).
print("df", df)
print("df[:5]", df.iloc[:5].T)

View Code

Python pandas 模块，Series, DataFrame 学习笔记

python pandas 笔记1

Series

DataFrame

Alternate constructors

DataFrame.from_records

Column selection, addition, deletion

Indexing / selection

Data alignment and arithmetic

AI模型 Llama 3体验笔记

【面试准备】又一次失败的面试经历，题目离谱～资深软件测试工程师

dotnet 8 版本与银河麒麟V10和UOS系统的 glibc 兼容性

docker之旅 6.docker下面安裝mysql, 訪問mysql

docker之旅 6.docker常用的一些腳本記錄

docker之旅 7.docker的端口映射解決所有不服

mysql 用戶權限問題

docker之旅 3.安裝docker-compose以及讓nginx跟隨docker啓動

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結