https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#basics-dataframe
Python pandas 模塊,Series, DataFrame 學習筆記
https://note.youdao.com/s/LFip7Cc5
python pandas 筆記1
導入模塊
#!/usr/bin/env python
import numpy as np
import pandas as pd
Series
"""
Series
Series is a one-dimensional labeled array capable of holding any data type
(integers, strings, floating point numbers, Python objects, etc.).
The axis labels are collectively referred to as the index.
The basic method to create a Series is to call:
s = pd.Series(data, index=index)
"""
# from ndarray
s1 = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])
print("pd s1:\n", s1)
print("pd s1.index:", s1.index)
print("pd s1.values:", s1.values)
pd s1:
a -0.261995
b 0.119171
c -0.129191
d -1.385260
e -0.087495
dtype: float64
pd s1.index: Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
pd s1.values: [-0.26199524 0.11917108 -0.12919125 -1.38525982 -0.08749467]
# from dict
d = {"b": 1, "a": 0, "c": 2}
s2 = pd.Series(d)
print("pd s2:\n", s2)
s3 = pd.Series(d, index=["b", "c", "d", "a"])
print("pd s3:\n", s3)
pd s2:
b 1
a 0
c 2
dtype: int64
pd s3:
b 1.0
c 2.0
d NaN
a 0.0
dtype: float64
# from scalar value
s4 = pd.Series(5.0, index=["a", "b", "c", "d", "e"])
print("pd s4:\n", s4)
pd s4:
a 5.0
b 5.0
c 5.0
d 5.0
e 5.0
dtype: float64
# Series is ndarray-like,可以像數組一樣訪問 Series 裏面的數據
print("pd s3.array:\n", s3.array[1])
# Series is dict-like. 可以像字典一樣通過標籤(key)訪問 Series 裏面的數據
print("pd s3.['b']:\n", s3['b'])
print("test s3 key:", "b" in s3)
print("test s3 key:", "f" in s3)
pd s3.array:
2.0
pd s3.['b']:
1.0
test s3 key: True
test s3 key: False
# Using the Series.get() method, a missing label will return None or the specified default:
print("Series.get() method:", s3.get("name"))
print("Series.get() method:", s3.get("name", np.nan))
print("Series.get() method:", s3.get("name", "henry"))
Series.get() method: None
Series.get() method: nan
Series.get() method: henry
# Series also has a name attribute:
s5 = pd.Series(np.random.randn(5), name="henry")
print("pd s5:\n", s5)
s6 = s5.rename("henry2")
print("pd s6:\n", s6)
print("pd s6.head():\n", s6.head())
print("pd s6.head(2):\n", s6.head(2))
pd s5:
0 -0.476002
1 0.248520
2 1.094846
3 0.505171
4 -0.176442
Name: henry, dtype: float64
pd s6:
0 -0.476002
1 0.248520
2 1.094846
3 0.505171
4 -0.176442
Name: henry2, dtype: float64
pd s6.head():
0 -0.476002
1 0.248520
2 1.094846
3 0.505171
4 -0.176442
Name: henry2, dtype: float64
pd s6.head(2):
0 -0.476002
1 0.248520
Name: henry2, dtype: float64
DataFrame
"""
DataFrame
DataFrame is a 2-dimensional labeled data structure with columns of potentially different types.
You can think of it like a spreadsheet or SQL table,
or a dict of Series objects.
It is generally the most commonly used pandas object.
Like Series, DataFrame accepts many different kinds of input:
Dict of 1D ndarrays, lists, dicts, or Series
2-D numpy.ndarray
Structured or record ndarray
A Series
Another DataFrame
Along with the data, you can optionally pass index (row labels) and columns (column labels) arguments.
If you pass an index and / or columns,
you are guaranteeing the index and / or columns of the resulting DataFrame.
Thus, a dict of Series plus a specific index will discard all data
not matching up to the passed index.
If axis labels are not passed,
they will be constructed from the input data based on common sense rules.
"""
# from dict of Series or dicts
d2 = {
"one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
"two": pd.Series([10.0, 20.0, 30.0, 40.0], index=["a", "b", "c", "d"])
}
df1 = pd.DataFrame(d2)
print("DataFrame df1:", df1)
df2 = pd.DataFrame(d2, index=["d", "b", "a"])
print("DataFrame df2:", df2)
df2.rename(columns={'two': 'symbol'}, inplace=True)
print("DataFrame df2.rename:", df2)
df3 = pd.DataFrame(d2, index=["d", "b", "a"], columns=["two", "three"])
print("DataFrame df3:", df3)
DataFrame df1: one two
a 1.0 10.0
b 2.0 20.0
c 3.0 30.0
d NaN 40.0
DataFrame df2: one two
d NaN 40.0
b 2.0 20.0
a 1.0 10.0
DataFrame df2.rename: one symbol
d NaN 40.0
b 2.0 20.0
a 1.0 10.0
DataFrame df3: two three
d 40.0 NaN
b 20.0 NaN
a 10.0 NaN
# The row and column labels can be accessed respectively by accessing the index and columns attributes:
# 索引名稱
print("df3.index:", df3.index)
# 列 名稱
print("df3.columns:", df3.columns)
df3.index: Index(['d', 'b', 'a'], dtype='object')
df3.columns: Index(['two', 'three'], dtype='object')
# from dict of ndarrays / lists
d = {
"one":[1.0, 2.0, 3.0, 4.0],
"two":[4.0, 3.0, 2.0, 1.0]
}
df4 = pd.DataFrame(d)
print("DataFrame df4:", df4)
df5 = pd.DataFrame(d, index=["a", "b", "c", "d"])
print("DataFrame df5:", df5)
DataFrame df4: one two
0 1.0 4.0
1 2.0 3.0
2 3.0 2.0
3 4.0 1.0
DataFrame df5: one two
a 1.0 4.0
b 2.0 3.0
c 3.0 2.0
d 4.0 1.0
# from structured or record array
# Build a 2-element structured array with three named fields:
#   A: 32-bit integer, B: 32-bit float, C: fixed-width 10-byte string.
# "S10" is the supported spelling of the byte-string dtype; the legacy "a10"
# alias is deprecated as of NumPy 2.0.
data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "S10")])
print("DataFrame data1:", data)
# Fill both records in place; the str values are stored as bytes in field C.
data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
print("DataFrame data2:", data)
# Each field of the structured array becomes one DataFrame column.
df6 = pd.DataFrame(data)
print("DataFrame df6:", df6)
# index= supplies the row labels.
df7 = pd.DataFrame(data, index=["first", "second"])
print("DataFrame df7:", df7)
# columns= selects and reorders the fields.
df8 = pd.DataFrame(data, columns=["C", "A", "B"])
print("DataFrame df8:", df8)
DataFrame data1: [(0, 0., b'') (0, 0., b'')]
DataFrame data2: [(1, 2., b'Hello') (2, 3., b'World')]
DataFrame df6: A B C
0 1 2.0 b'Hello'
1 2 3.0 b'World'
DataFrame df7: A B C
first 1 2.0 b'Hello'
second 2 3.0 b'World'
DataFrame df8: C A B
0 b'Hello' 1 2.0
1 b'World' 2 3.0
# from a list of dicts
data2 = [
{"a":1, "b":2},
{"a":5,"b":10,"c":20}
]
df9 = pd.DataFrame(data2)
print("DataFrame df9:", df9)
print("DataFrame df10:", pd.DataFrame(data2, index=["first", "second"]))
# 只獲取columns 列出的那幾列數據
print("DataFrame df11:", pd.DataFrame(data2, columns=["a", "b"]))
DataFrame df9: a b c
0 1 2 NaN
1 5 10 20.0
DataFrame df10: a b c
first 1 2 NaN
second 5 10 20.0
DataFrame df11: a b
0 1 2
1 5 10
# from a dict of tuples
df12 = pd.DataFrame(
{
("a","b"):{("A", "B"):1, ("A", "C"):2},
("a", "a"): {("A", "C"): 3, ("A", "B"): 4},
("a", "c"): {("A", "B"): 5, ("A", "C"): 6},
("b", "a"): {("A", "C"): 7, ("A", "B"): 7},
("b", "b"): {("A", "D"): 9, ("A", "B"): 10}
})
print("DataFrame df12:", df12)
DataFrame df12: a b
b a c a b
A B 1.0 4.0 5.0 7.0 10.0
C 2.0 3.0 6.0 7.0 NaN
D NaN NaN NaN NaN 9.0
# from a Series
ser = pd.Series(range(3), index=list("abc"), name="ser")
print("ser:", ser)
print("DataFrame df13:", pd.DataFrame(ser))
# Series 裏面定義的name,就是DataFrame裏面的列 名稱
print("DataFrame df14:", pd.DataFrame(ser, columns=["ser"]))
print("DataFrame df15:", pd.DataFrame(ser, columns=["ser", "name2"]))
ser: a 0
b 1
c 2
Name: ser, dtype: int64
DataFrame df13: ser
a 0
b 1
c 2
DataFrame df14: ser
a 0
b 1
c 2
DataFrame df15: ser name2
a 0 NaN
b 1 NaN
c 2 NaN
# from a list of namedtuples
from collections import namedtuple
Point = namedtuple("Point", "x y")
print("DataFrame df16:", pd.DataFrame([Point(0,0), Point(0,3), (2,3)]))
Point3D = namedtuple("Point3D", "x y z")
print("DataFrame df17:", pd.DataFrame([Point3D(0,0,0), Point3D(0,3,5), Point(2,3)]))
DataFrame df16: x y
0 0 0
1 0 3
2 2 3
DataFrame df17: x y z
0 0 0 0.0
1 0 3 5.0
2 2 3 NaN
# from a list of dataclasses
from dataclasses import make_dataclass
Point = make_dataclass("Point", [("x", int), ("y", int)])
print("DataFrame df18:", pd.DataFrame([Point(0,0), Point(0,3), Point(2,3)]))
DataFrame df18: x y
0 0 0
1 0 3
2 2 3
Alternate constructors
"""
DataFrame.from_dict
DataFrame.from_dict() takes a dict of dicts or a dict of array-like sequences and returns a DataFrame.
It operates like the DataFrame constructor except for the orient parameter
which is 'columns' by default,
but which can be set to 'index' in order to use the dict keys as row labels.
"""
print("df19:",pd.DataFrame.from_dict(dict([("A", [1, 2, 3]), ("B", [4, 5, 6])])))
# orient="index", 把 列名 和索引名交換了,相當於旋轉(轉置)了數組
# If you pass orient='index', the keys will be the row labels.
# In this case, you can also pass the desired column names:
print("df20:",pd.DataFrame.from_dict(
dict([("A", [1, 2, 3]), ("B", [4, 5, 6])]),
orient="index",
columns=["one", "two", "three"],))
print("df21:",pd.DataFrame.from_dict(
dict([("A", [1, 2, 3]), ("B", [4, 5, 6])]),
orient="index"))
df19: A B
0 1 4
1 2 5
2 3 6
df20: one two three
A 1 2 3
B 4 5 6
df21: 0 1 2
A 1 2 3
B 4 5 6
DataFrame.from_records
DataFrame.from_records() takes a list of tuples or an ndarray with structured dtype.
It works analogously to the normal DataFrame constructor, except that the resulting DataFrame index may be a specific field of the structured dtype.
【暫時不理解】補充示例:對上面的結構化數組 data,pd.DataFrame.from_records(data, index="C") 會把字段 C 的值作爲行索引(index),而不是作爲普通的一列。
Column selection, addition, deletion
"""
You can treat a DataFrame semantically like a dict of like-indexed Series objects.
Getting, setting, and deleting columns works with the same syntax as the analogous dict operations:
"""
# 訪問df 的某列,df的某列就是一個 Series
print("df1", df1)
print("df22", df1["one"])
df1["three"] = df1["one"] * df1["two"]
# 判斷df1["one"]裏面每個元素是否 大於2,結果是 一個Bool類型變量
df1["flag"] = df1["one"]>2
print("df23:", df1)
df1 one two
a 1.0 10.0
b 2.0 20.0
c 3.0 30.0
d NaN 40.0
df22 a 1.0
b 2.0
c 3.0
d NaN
Name: one, dtype: float64
df23: one two three flag
a 1.0 10.0 10.0 False
b 2.0 20.0 40.0 False
c 3.0 30.0 90.0 True
d NaN 40.0 NaN False
# Columns can be deleted or popped like with a dict:
del df1["two"]
three = df1.pop("three")
print("df three:", three)
print("df24:", df1)
df three: a 10.0
b 40.0
c 90.0
d NaN
Name: three, dtype: float64
df24: one flag
a 1.0 False
b 2.0 False
c 3.0 True
d NaN False
# when inserting a scalar value, it will naturally be propagated to fill the column.
df1["foo"] = "bar"
print("df25:", df1)
df25: one flag foo
a 1.0 False bar
b 2.0 False bar
c 3.0 True bar
d NaN False bar
# when inserting a Series that does not have the same index as the DataFrame,it will be conformed to the DataFrame's index
## slicing 切片知識補充
tag = "henry, hello slicing!"
print("tag[:]", tag[:])
# [) 切片是一個 半閉半開 的區間
print("tag[1:2]", tag[1:2])
print("tag[0:2]", tag[0:2])
print("tag[:2]", tag[:2])
print("Series [:]", df1["one"][:])
print("Series [:2]", df1["one"][:2])
df1["one_trunc"] = df1["one"][:2]
print("DataFrame df25:", df1)
tag[:] henry, hello slicing!
tag[1:2] e
tag[0:2] he
tag[:2] he
Series [:] a 1.0
b 2.0
c 3.0
d NaN
Name: one, dtype: float64
Series [:2] a 1.0
b 2.0
Name: one, dtype: float64
DataFrame df25: one flag foo one_trunc
a 1.0 False bar 1.0
b 2.0 False bar 2.0
c 3.0 True bar NaN
d NaN False bar NaN
# 指定位置插入一列
# You can insert raw ndarrays but their length must match the length of the DataFrame’s index.
# By default, columns get inserted at the end. DataFrame.insert() inserts at a particular location in the column
df1.insert(1,"insert_bar", df1["one"])
print("DataFrame df26:",df1)
DataFrame df26: one insert_bar flag foo one_trunc
a 1.0 1.0 False bar 1.0
b 2.0 2.0 False bar 2.0
c 3.0 3.0 True bar NaN
d NaN NaN False bar NaN
# 通過 assign() 方法,從已有的列中 創造一個新的列
print("df1.head()", df1.head())
print("df1['one'].head()", df1["one"].head())
## assign 方法創造了新的列,但是不會改變之前的df數據,新的列是在 返回的數據裏面
df2 = df1.assign(new_col=df1["one"]/df1["one_trunc"])
print("DataFrame df27:",df1) # df1 還是之前的結構,沒有改變
print("DataFrame df28:",df2) # df2 纔是改變後的結構
df1.head() one insert_bar flag foo one_trunc
a 1.0 1.0 False bar 1.0
b 2.0 2.0 False bar 2.0
c 3.0 3.0 True bar NaN
d NaN NaN False bar NaN
df1['one'].head() a 1.0
b 2.0
c 3.0
d NaN
Name: one, dtype: float64
DataFrame df27: one insert_bar flag foo one_trunc
a 1.0 1.0 False bar 1.0
b 2.0 2.0 False bar 2.0
c 3.0 3.0 True bar NaN
d NaN NaN False bar NaN
DataFrame df28: one insert_bar flag foo one_trunc new_col
a 1.0 1.0 False bar 1.0 1.0
b 2.0 2.0 False bar 2.0 1.0
c 3.0 3.0 True bar NaN NaN
d NaN NaN False bar NaN NaN
# 通過函數的方式來創建新的列
# In the example above, we inserted a precomputed value.
# We can also pass in a function of one argument to be evaluated on the DataFrame being assigned to.
df3 = df1.assign(func_col=lambda x:(x["one"]+10))
print("DataFrame df29:", df3)
DataFrame df29: one insert_bar flag foo one_trunc func_col
a 1.0 1.0 False bar 1.0 11.0
b 2.0 2.0 False bar 2.0 12.0
c 3.0 3.0 True bar NaN 13.0
d NaN NaN False bar NaN NaN
20230310.csv
SepalLength,SepalWidth,PetalLength,PetalWidth,Name
5.1 , 3.5 , 1.4 , 0.2 ,Iris-setosa
4.9 , 3.0 , 1.4 , 0.2 , Iris-setosa
4.7,3.2,1.3 ,0.2,Iris-setosa
4.6 , 3.1 , 1.5 , 0.2 ,Iris-setosa
5.0 , 3.6 , 1.4 , 0.2, Iris-setosa
# assign() always returns a copy of the data, leaving the original DataFrame untouched.
# Some fields in the sample CSV are padded with blanks after the comma;
# skipinitialspace=True drops that padding so column names and string values
# (e.g. "Iris-setosa") do not start with a space.
iris = pd.read_csv("20230310.csv", skipinitialspace=True)
print("csv data:", iris)
print("iris.assign:",iris.assign(sepal_ratio=iris["SepalWidth"] + 10).head() )
print("iris cloumns:", iris.columns)
# NOTE: without skipinitialspace, a header cell written as ", PetalWidth" is read
# as the column name ' PetalWidth' (the leading space is kept), e.g.
# Index(['SepalLength', 'SepalWidth', 'PetalLength', ' PetalWidth', 'Name'], dtype='object')
# and the iris["PetalWidth"] lookup below would raise KeyError.
print("csv data:[PetalWidth]", iris["PetalWidth"])
print("csv data:[PetalLength]", iris["PetalLength"])
# A callable passed to assign() receives the DataFrame and returns the new column.
print("PetalRatio:",iris.assign(PetalRatio=lambda x: x.PetalWidth / x.PetalLength).head())
csv data: SepalLength SepalWidth PetalLength PetalWidth Name
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
iris.assign: SepalLength SepalWidth PetalLength PetalWidth Name sepal_ratio
0 5.1 3.5 1.4 0.2 Iris-setosa 13.5
1 4.9 3.0 1.4 0.2 Iris-setosa 13.0
2 4.7 3.2 1.3 0.2 Iris-setosa 13.2
3 4.6 3.1 1.5 0.2 Iris-setosa 13.1
4 5.0 3.6 1.4 0.2 Iris-setosa 13.6
iris cloumns: Index(['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Name'], dtype='object')
csv data:[PetalWidth] 0 0.2
1 0.2
2 0.2
3 0.2
4 0.2
Name: PetalWidth, dtype: float64
csv data:[PetalLength] 0 1.4
1 1.4
2 1.3
3 1.5
4 1.4
Name: PetalLength, dtype: float64
PetalRatio: SepalLength SepalWidth PetalLength PetalWidth Name PetalRatio
0 5.1 3.5 1.4 0.2 Iris-setosa 0.142857
1 4.9 3.0 1.4 0.2 Iris-setosa 0.142857
2 4.7 3.2 1.3 0.2 Iris-setosa 0.153846
3 4.6 3.1 1.5 0.2 Iris-setosa 0.133333
4 5.0 3.6 1.4 0.2 Iris-setosa 0.142857
"""
(
iris.query("SepalLength > 5")
.assign(
SepalRatio=lambda x: x.SepalWidth / x.SepalLength,
PetalRatio=lambda x: x.PetalWidth / x.PetalLength,
).plot(kind="scatter", x="SepalRatio", y="PetalRatio")
)
"""
"""
The function signature for assign() is simply **kwargs.
The keys are the column names for the new fields,
and the values are either a value to be inserted (for example, a Series or NumPy array),
or a function of one argument to be called on the DataFrame.
A copy of the original DataFrame is returned, with the new values inserted.
The order of **kwargs is preserved. This allows for dependent assignment,
where an expression later in **kwargs can refer to a column created earlier in the same assign().
"""
# In the second expression, x['C'] will refer to the newly created column, that’s equal to dfa['A'] + dfa['B'].
dfa = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
dfb = dfa.assign(C=lambda x: x["A"] + x["B"], D=lambda x: x["A"] + x["C"])
print("dfa:", dfa)
print("dfb:", dfb)
dfa: A B
0 1 4
1 2 5
2 3 6
dfb: A B C D
0 1 4 5 6
1 2 5 7 9
2 3 6 9 12
Indexing / selection
Operation Syntax Result
select column : df[col] Series
select row by label : df.loc[label] Series
Select row by integer location: df.iloc[loc] Series
Slice rows: df[5:10] DataFrame
Select rows by boolean vector: df[bool_vec] DataFrame
# Row selection, for example, returns a Series whose index is the columns of the DataFrame:
print("df30:", df1)
## 選出 某一行(b 是一個索引值,選出這個索引的行)Select row by label
print("df31:", df1.loc["b"])
df30: one insert_bar flag foo one_trunc
a 1.0 1.0 False bar 1.0
b 2.0 2.0 False bar 2.0
c 3.0 3.0 True bar NaN
d NaN NaN False bar NaN
df31: one 2
insert_bar 2
flag False
foo bar
one_trunc 2
Name: b, dtype: object
# Select row by integer location
print("df32:", df1.iloc[2])
df32: one 3
insert_bar 3
flag True
foo bar
one_trunc NaN
Name: c, dtype: object
Data alignment and arithmetic
"""
Data alignment between DataFrame objects automatically align on both the columns and the index (row labels).
Again, the resulting object will have the union of the column and row labels.
"""
df = pd.DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"])
df2 = pd.DataFrame(np.random.randn(7, 3), columns=["A", "B", "C"])
print("df40:", df)
print("df41:", df2)
print("df42:", df + df2)
print("df43:", df - df.iloc[0])
print("df44:", df * 5 + 2)
print("df45:", 1 / df)
print("df46:", df ** 4)
df40: A B C D
0 -1.084084 0.183785 -1.153985 0.055283
1 0.253552 0.077291 -0.303460 0.701300
2 0.821357 -1.116865 0.610512 -1.327411
3 -0.251630 -0.341660 0.730774 -0.584376
4 -1.215528 -0.691270 -0.552341 0.278375
5 -0.009694 -0.963907 1.593190 -0.991862
6 0.377235 -1.089611 -0.515518 -0.252789
7 -1.158782 -1.405582 -0.189190 -1.787201
8 1.469154 0.924453 -0.305886 -0.527754
9 -0.416995 1.469462 -1.107226 0.941600
df41: A B C
0 -0.184298 1.094119 -0.623001
1 -0.531990 -0.025734 -0.948708
2 0.877716 -1.547748 -0.753285
3 -0.248297 -1.370722 1.646786
4 0.958594 -0.373161 1.166930
5 -0.626382 1.731893 0.521530
6 -0.008678 0.955742 0.463842
df42: A B C D
0 -1.268382 1.277904 -1.776987 NaN
1 -0.278439 0.051558 -1.252168 NaN
2 1.699073 -2.664612 -0.142773 NaN
3 -0.499928 -1.712383 2.377560 NaN
4 -0.256934 -1.064431 0.614589 NaN
5 -0.636076 0.767986 2.114720 NaN
6 0.368556 -0.133869 -0.051676 NaN
7 NaN NaN NaN NaN
8 NaN NaN NaN NaN
9 NaN NaN NaN NaN
df43: A B C D
0 0.000000 0.000000 0.000000 0.000000
1 1.337636 -0.106494 0.850525 0.646017
2 1.905441 -1.300650 1.764497 -1.382694
3 0.832454 -0.525445 1.884759 -0.639659
4 -0.131444 -0.875055 0.601645 0.223092
5 1.074390 -1.147693 2.747175 -1.047145
6 1.461319 -1.273396 0.638467 -0.308072
7 -0.074698 -1.589367 0.964795 -1.842484
8 2.553238 0.740668 0.848100 -0.583037
9 0.667089 1.285677 0.046759 0.886317
df44: A B C D
0 -3.420421 2.918925 -3.769926 2.276415
1 3.267758 2.386457 0.482700 5.506499
2 6.106784 -3.584324 5.052559 -4.637055
3 0.741848 0.291698 5.653871 -0.921882
4 -4.077641 -1.456350 -0.761703 3.391873
5 1.951529 -2.819537 9.965949 -2.959312
6 3.886173 -3.448053 -0.577592 0.736056
7 -3.793910 -5.027910 1.054048 -6.936004
8 9.345768 6.622265 0.470572 -0.638769
9 -0.084974 9.347311 -3.536131 6.708002
df45: A B C D
0 -0.922438 5.441138 -0.866562 18.088755
1 3.943970 12.938049 -3.295328 1.425924
2 1.217498 -0.895364 1.637970 -0.753346
3 -3.974082 -2.926883 1.368412 -1.711226
4 -0.822688 -1.446613 -1.810477 3.592281
5 -103.155091 -1.037444 0.627672 -1.008204
6 2.650870 -0.917759 -1.939795 -3.955872
7 -0.862975 -0.711449 -5.285678 -0.559534
8 0.680664 1.081721 -3.269195 -1.894823
9 -2.398111 0.680521 -0.903158 1.062022
df46: A B C D
0 1.381186e+00 0.001141 1.773377 0.000009
1 4.133002e-03 0.000036 0.008480 0.241888
2 4.551214e-01 1.555974 0.138924 3.104715
3 4.009152e-03 0.013626 0.285189 0.116619
4 2.183033e+00 0.228345 0.093074 0.006005
5 8.831558e-09 0.863260 6.442732 0.967845
6 2.025099e-02 1.409565 0.070628 0.004083
7 1.803047e+00 3.903235 0.001281 10.202189
8 4.658744e+00 0.730364 0.008755 0.077576
9 3.023589e-02 4.662659 1.502954 0.786080
df1 = pd.DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}, dtype=bool)
df2 = pd.DataFrame({"a": [0, 1, 1], "b": [1, 1, 0]}, dtype=bool)
print("df1:", df1)
print("df2:", df2)
print("df47:", df1 & df2)
print("df48:", df1 | df2)
print("df49:", df1 ^ df2)
print("df50:", -df1)
df1: a b
0 True False
1 False True
2 True True
df2: a b
0 False True
1 True True
2 True False
df47: a b
0 False False
1 False True
2 True False
df48: a b
0 True True
1 True True
2 True True
df49: a b
0 True True
1 True False
2 False True
df50: a b
0 False True
1 True False
2 False False
## 旋轉 DataFrame 裏面的數據
print("df", df)
print("df[:5]", df[:5].T)
df A B C D
0 -1.084084 0.183785 -1.153985 0.055283
1 0.253552 0.077291 -0.303460 0.701300
2 0.821357 -1.116865 0.610512 -1.327411
3 -0.251630 -0.341660 0.730774 -0.584376
4 -1.215528 -0.691270 -0.552341 0.278375
5 -0.009694 -0.963907 1.593190 -0.991862
6 0.377235 -1.089611 -0.515518 -0.252789
7 -1.158782 -1.405582 -0.189190 -1.787201
8 1.469154 0.924453 -0.305886 -0.527754
9 -0.416995 1.469462 -1.107226 0.941600
df[:5] 0 1 2 3 4
A -1.084084 0.253552 0.821357 -0.251630 -1.215528
B 0.183785 0.077291 -1.116865 -0.341660 -0.691270
C -1.153985 -0.303460 0.610512 0.730774 -0.552341
D 0.055283 0.701300 -1.327411 -0.584376 0.278375
完整的腳本
#!/usr/bin/env python
"""Study notes: pandas Series and DataFrame basics.

A flat, runnable walk-through of creating, indexing and combining
``Series`` and ``DataFrame`` objects.  Every step prints a labelled result.

The ``assign()`` section reads ``20230310.csv`` from the current directory.
If that file is missing, the embedded sample rows are used instead so the
script still runs end to end.
"""
import io
from collections import namedtuple
from dataclasses import make_dataclass
from pathlib import Path

import numpy as np
import pandas as pd

# Companion data file used by the assign() examples.
IRIS_CSV_PATH = Path("20230310.csv")
# Sample rows with the same columns as the companion file; used as a fallback.
IRIS_SAMPLE_CSV = """\
SepalLength,SepalWidth,PetalLength,PetalWidth,Name
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa
"""

# ------------------------ Series
# Series is a one-dimensional labeled array capable of holding any data type
# (integers, strings, floating point numbers, Python objects, etc.).
# The axis labels are collectively referred to as the index.
# The basic way to create a Series is: s = pd.Series(data, index=index)

# from ndarray
s1 = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])
print("pd s1:\n", s1)
print("pd s1.index:", s1.index)
print("pd s1.values:", s1.values)

# from dict
d = {"b": 1, "a": 0, "c": 2}
s2 = pd.Series(d)
print("pd s2:\n", s2)
# Index labels that are not keys of the dict (here "d") become NaN.
s3 = pd.Series(d, index=["b", "c", "d", "a"])
print("pd s3:\n", s3)

# from scalar value: the scalar is repeated for every index label
s4 = pd.Series(5.0, index=["a", "b", "c", "d", "e"])
print("pd s4:\n", s4)

# Series is ndarray-like: values can be accessed by position.
print("pd s3.array:\n", s3.array[1])
# Series is dict-like: values can be accessed and tested by label.
print("pd s3.['b']:\n", s3['b'])
print("test s3 key:", "b" in s3)
print("test s3 key:", "f" in s3)

# Using the Series.get() method, a missing label returns None or the
# specified default:
print("Series.get() method:", s3.get("name"))
print("Series.get() method:", s3.get("name", np.nan))
print("Series.get() method:", s3.get("name", "henry"))

# Series also has a name attribute; rename() returns a renamed Series.
s5 = pd.Series(np.random.randn(5), name="henry")
print("pd s5:\n", s5)
s6 = s5.rename("henry2")
print("pd s6:\n", s6)
print("pd s6.head():\n", s6.head())
print("pd s6.head(2):\n", s6.head(2))

# ------------------------ DataFrame
# DataFrame is a 2-dimensional labeled data structure with columns of
# potentially different types.  You can think of it like a spreadsheet or
# SQL table, or a dict of Series objects.  It is generally the most commonly
# used pandas object.
# Like Series, DataFrame accepts many different kinds of input:
#   - Dict of 1D ndarrays, lists, dicts, or Series
#   - 2-D numpy.ndarray
#   - Structured or record ndarray
#   - A Series
#   - Another DataFrame
# Along with the data, you can optionally pass index (row labels) and
# columns (column labels) arguments.  If you pass an index and / or columns,
# you are guaranteeing the index and / or columns of the resulting DataFrame.
# Thus, a dict of Series plus a specific index will discard all data not
# matching up to the passed index.  If axis labels are not passed, they will
# be constructed from the input data based on common sense rules.

# from dict of Series or dicts
d2 = {
    "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    "two": pd.Series([10.0, 20.0, 30.0, 40.0], index=["a", "b", "c", "d"]),
}
df1 = pd.DataFrame(d2)
print("DataFrame df1:", df1)
df2 = pd.DataFrame(d2, index=["d", "b", "a"])
print("DataFrame df2:", df2)
df2.rename(columns={'two': 'symbol'}, inplace=True)
print("DataFrame df2.rename:", df2)
# A requested column that does not exist in the data ("three") is all NaN.
df3 = pd.DataFrame(d2, index=["d", "b", "a"], columns=["two", "three"])
print("DataFrame df3:", df3)

# The row and column labels can be accessed respectively through the index
# and columns attributes.
# row (index) labels
print("df3.index:", df3.index)
# column labels
print("df3.columns:", df3.columns)

# from dict of ndarrays / lists
d = {
    "one": [1.0, 2.0, 3.0, 4.0],
    "two": [4.0, 3.0, 2.0, 1.0],
}
df4 = pd.DataFrame(d)
print("DataFrame df4:", df4)
df5 = pd.DataFrame(d, index=["a", "b", "c", "d"])
print("DataFrame df5:", df5)

# from structured or record array
# "S10" (10-byte string) replaces the legacy "a10" alias, which is
# deprecated as of NumPy 2.0.
data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "S10")])
print("DataFrame data1:", data)
data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
print("DataFrame data2:", data)
df6 = pd.DataFrame(data)
print("DataFrame df6:", df6)
df7 = pd.DataFrame(data, index=["first", "second"])
print("DataFrame df7:", df7)
df8 = pd.DataFrame(data, columns=["C", "A", "B"])
print("DataFrame df8:", df8)

# from a list of dicts
data2 = [
    {"a": 1, "b": 2},
    {"a": 5, "b": 10, "c": 20},
]
df9 = pd.DataFrame(data2)
print("DataFrame df9:", df9)
print("DataFrame df10:", pd.DataFrame(data2, index=["first", "second"]))
# Only the columns listed in columns= are kept.
print("DataFrame df11:", pd.DataFrame(data2, columns=["a", "b"]))

# from a dict of tuples: tuple keys produce multi-level row/column labels
df12 = pd.DataFrame(
    {
        ("a", "b"): {("A", "B"): 1, ("A", "C"): 2},
        ("a", "a"): {("A", "C"): 3, ("A", "B"): 4},
        ("a", "c"): {("A", "B"): 5, ("A", "C"): 6},
        ("b", "a"): {("A", "C"): 7, ("A", "B"): 7},
        ("b", "b"): {("A", "D"): 9, ("A", "B"): 10},
    }
)
print("DataFrame df12:", df12)

# from a Series
ser = pd.Series(range(3), index=list("abc"), name="ser")
print("ser:", ser)
print("DataFrame df13:", pd.DataFrame(ser))
# The name of the Series becomes the column name of the DataFrame.
print("DataFrame df14:", pd.DataFrame(ser, columns=["ser"]))
print("DataFrame df15:", pd.DataFrame(ser, columns=["ser", "name2"]))

# from a list of namedtuples
Point = namedtuple("Point", "x y")
print("DataFrame df16:", pd.DataFrame([Point(0, 0), Point(0, 3), (2, 3)]))
Point3D = namedtuple("Point3D", "x y z")
print("DataFrame df17:", pd.DataFrame([Point3D(0, 0, 0), Point3D(0, 3, 5), Point(2, 3)]))

# from a list of dataclasses (Point is deliberately rebound to a dataclass)
Point = make_dataclass("Point", [("x", int), ("y", int)])
print("DataFrame df18:", pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)]))

# --------------------------------------- Alternate constructors
# DataFrame.from_dict
# DataFrame.from_dict() takes a dict of dicts or a dict of array-like
# sequences and returns a DataFrame.  It operates like the DataFrame
# constructor except for the orient parameter, which is 'columns' by default
# but can be set to 'index' in order to use the dict keys as row labels.
print("df19:", pd.DataFrame.from_dict(dict([("A", [1, 2, 3]), ("B", [4, 5, 6])])))
# orient="index" swaps the roles of column labels and row labels, i.e. the
# result is the transpose of the default.  If you pass orient='index', the
# keys will be the row labels.  In this case, you can also pass the desired
# column names:
print("df20:", pd.DataFrame.from_dict(
    dict([("A", [1, 2, 3]), ("B", [4, 5, 6])]),
    orient="index",
    columns=["one", "two", "three"],
))
print("df21:", pd.DataFrame.from_dict(
    dict([("A", [1, 2, 3]), ("B", [4, 5, 6])]),
    orient="index",
))

# DataFrame.from_records
# DataFrame.from_records() takes a list of tuples or an ndarray with
# structured dtype.  It works analogously to the normal DataFrame
# constructor, except that the resulting DataFrame index may be a specific
# field of the structured dtype.

# ----------------- Column selection, addition, deletion
# You can treat a DataFrame semantically like a dict of like-indexed Series
# objects.  Getting, setting, and deleting columns works with the same
# syntax as the analogous dict operations.

# Selecting one column of a DataFrame yields a Series.
print("df1", df1)
print("df22", df1["one"])
df1["three"] = df1["one"] * df1["two"]
# Element-wise comparison: the new column holds one bool per row.
df1["flag"] = df1["one"]>2
print("df23:", df1)

# Columns can be deleted or popped like with a dict:
del df1["two"]
three = df1.pop("three")
print("df three:", three)
print("df24:", df1)

# When inserting a scalar value, it will naturally be propagated to fill
# the column.
df1["foo"] = "bar"
print("df25:", df1)

# When inserting a Series that does not have the same index as the
# DataFrame, it will be conformed to the DataFrame's index.
## Side note on slicing
tag = "henry, hello slicing!"
print("tag[:]", tag[:])
# A slice is a half-open interval [start, stop).
print("tag[1:2]", tag[1:2])
print("tag[0:2]", tag[0:2])
print("tag[:2]", tag[:2])
print("Series [:]", df1["one"][:])
# .iloc makes it explicit that the first two rows are taken by position.
print("Series [:2]", df1["one"].iloc[:2])
# Rows "c" and "d" are absent from the 2-row Series, so they become NaN.
df1["one_trunc"] = df1["one"].iloc[:2]
print("DataFrame df25:", df1)

# Insert a column at a given position.
# You can insert raw ndarrays but their length must match the length of the
# DataFrame's index.  By default, columns get inserted at the end.
# DataFrame.insert() inserts at a particular location in the columns.
df1.insert(1, "insert_bar", df1["one"])
print("DataFrame df26:", df1)

# assign() derives a new column from existing columns.
print("df1.head()", df1.head())
print("df1['one'].head()", df1["one"].head())
## assign() does not modify the DataFrame it is called on; the new column
## exists only in the returned DataFrame.
df2 = df1.assign(new_col=df1["one"]/df1["one_trunc"])
print("DataFrame df27:", df1)  # df1 keeps its previous structure
print("DataFrame df28:", df2)  # df2 is the one with the new column

# Creating a new column from a function.
# In the example above, we inserted a precomputed value.  We can also pass
# in a function of one argument to be evaluated on the DataFrame being
# assigned to.
df3 = df1.assign(func_col=lambda x: (x["one"]+10))
print("DataFrame df29:", df3)

# assign() always returns a copy of the data, leaving the original DataFrame
# untouched.
if IRIS_CSV_PATH.is_file():
    iris_source = IRIS_CSV_PATH
else:
    # Keep the walk-through runnable without the companion file.
    print(f"note: {IRIS_CSV_PATH} not found, using the embedded sample rows")
    iris_source = io.StringIO(IRIS_SAMPLE_CSV)
# skipinitialspace=True drops blanks that follow a comma.  Without it, a
# header cell written as ", PetalWidth" would be read as the column name
# ' PetalWidth' and the lookups below would raise KeyError.
iris = pd.read_csv(iris_source, skipinitialspace=True)
print("csv data:", iris)
print("iris.assign:", iris.assign(sepal_ratio=iris["SepalWidth"] + 10).head())
print("iris cloumns:", iris.columns)
print("csv data:[PetalWidth]", iris["PetalWidth"])
print("csv data:[PetalLength]", iris["PetalLength"])
print("PetalRatio:", iris.assign(PetalRatio=lambda x: x.PetalWidth / x.PetalLength).head())

# Optional plotting example (left disabled; .plot() needs matplotlib):
# (
#     iris.query("SepalLength > 5")
#     .assign(
#         SepalRatio=lambda x: x.SepalWidth / x.SepalLength,
#         PetalRatio=lambda x: x.PetalWidth / x.PetalLength,
#     ).plot(kind="scatter", x="SepalRatio", y="PetalRatio")
# )

# The function signature for assign() is simply **kwargs.  The keys are the
# column names for the new fields, and the values are either a value to be
# inserted (for example, a Series or NumPy array), or a function of one
# argument to be called on the DataFrame.  A copy of the original DataFrame
# is returned, with the new values inserted.
# The order of **kwargs is preserved.  This allows for dependent assignment,
# where an expression later in **kwargs can refer to a column created
# earlier in the same assign().

# In the second expression, x['C'] will refer to the newly created column,
# that's equal to dfa['A'] + dfa['B'].
dfa = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
dfb = dfa.assign(C=lambda x: x["A"] + x["B"], D=lambda x: x["A"] + x["C"])
print("dfa:", dfa)
print("dfb:", dfb)

# ---------------------------- Indexing / selection
# Operation                        Syntax          Result
# Select column                    df[col]         Series
# Select row by label              df.loc[label]   Series
# Select row by integer location   df.iloc[loc]    Series
# Slice rows                       df[5:10]        DataFrame
# Select rows by boolean vector    df[bool_vec]    DataFrame

# Row selection, for example, returns a Series whose index is the columns
# of the DataFrame:
print("df30:", df1)
## Select row by label ("b" is an index label)
print("df31:", df1.loc["b"])
# Select row by integer location
print("df32:", df1.iloc[2])

# --------------------------------- Data alignment and arithmetic
# Data alignment between DataFrame objects automatically aligns on both the
# columns and the index (row labels).  Again, the resulting object will have
# the union of the column and row labels.
df = pd.DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"])
df2 = pd.DataFrame(np.random.randn(7, 3), columns=["A", "B", "C"])
print("df40:", df)
print("df41:", df2)
# Labels present in only one operand (rows 7-9, column D) give NaN.
print("df42:", df + df2)
# A Series operand is aligned on the columns and applied to every row.
print("df43:", df - df.iloc[0])
print("df44:", df * 5 + 2)
print("df45:", 1 / df)
print("df46:", df ** 4)

# Boolean operators work element-wise as well.
df1 = pd.DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}, dtype=bool)
df2 = pd.DataFrame({"a": [0, 1, 1], "b": [1, 1, 0]}, dtype=bool)
print("df1:", df1)
print("df2:", df2)
print("df47:", df1 & df2)
print("df48:", df1 | df2)
print("df49:", df1 ^ df2)
print("df50:", -df1)

## Transposing a DataFrame: .T swaps rows and columns
print("df", df)
print("df[:5]", df[:5].T)