Python pandas 模塊,Series, DataFrame 學習筆記

https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#basics-dataframe

Python  pandas 模塊,Series, DataFrame 學習筆記

 

 https://note.youdao.com/s/LFip7Cc5

 

 

python pandas 筆記1

包含頭文件

#!/usr/bin/env python

import numpy as np
import pandas as pd

Series

"""
Series

Series is a one-dimensional labeled array capable of holding any data type
(integers, strings, floating point numbers, Python objects, etc.).

The axis labels are collectively referred to as the index.
The basic method to create a Series is to call:
s = pd.Series(data, index=index)
"""


# from ndarray
s1 = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])
print("pd s1:\n", s1)
print("pd s1.index:", s1.index)
print("pd s1.values:", s1.values)
pd s1:
 a -0.261995
b 0.119171
c -0.129191
d -1.385260
e -0.087495
dtype: float64
pd s1.index: Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
pd s1.values: [-0.26199524 0.11917108 -0.12919125 -1.38525982 -0.08749467]
# from dict
d = {"b": 1, "a": 0, "c": 2}
s2 = pd.Series(d)
print("pd s2:\n", s2)

s3 = pd.Series(d, index=["b", "c", "d", "a"])
print("pd s3:\n", s3)
pd s2:
 b 1
a 0
c 2
dtype: int64
pd s3:
 b 1.0
c 2.0
d NaN
a 0.0
dtype: float64
# from scalar value
s4 = pd.Series(5.0, index=["a", "b", "c", "d", "e"])
print("pd s4:\n", s4)
pd s4:
 a 5.0
b 5.0
c 5.0
d 5.0
e 5.0
dtype: float64

# Series is ndarray-like,可以像數組一樣訪問 Series 裏面的數據
print("pd s3.array:\n", s3.array[1])
# Series is dict-like. 可以像字典一樣,通過索引標籤訪問 Series 裏面的數據
print("pd s3.['b']:\n", s3['b'])
print("test s3 key:", "b" in s3)
print("test s3 key:", "f" in s3)
pd s3.array:
 2.0
pd s3.['b']:
 1.0
test s3 key: True
test s3 key: False

# Using the Series.get() method, a missing label will return None or the specified default:
print("Series.get() method:", s3.get("name"))
print("Series.get() method:", s3.get("name", np.nan))
print("Series.get() method:", s3.get("name", "henry"))
Series.get() method: None
Series.get() method: nan
Series.get() method: henry
# Series also has a name attribute:
s5 = pd.Series(np.random.randn(5), name="henry")
print("pd s5:\n", s5)

s6 = s5.rename("henry2")
print("pd s6:\n", s6)
print("pd s6.head():\n", s6.head())
print("pd s6.head(2):\n", s6.head(2))
pd s5:
 0 -0.476002
1 0.248520
2 1.094846
3 0.505171
4 -0.176442
Name: henry, dtype: float64
pd s6:
 0 -0.476002
1 0.248520
2 1.094846
3 0.505171
4 -0.176442
Name: henry2, dtype: float64
pd s6.head():
 0 -0.476002
1 0.248520
2 1.094846
3 0.505171
4 -0.176442
Name: henry2, dtype: float64
pd s6.head(2):
 0 -0.476002
1 0.248520
Name: henry2, dtype: float64

DataFrame

"""
DataFrame

DataFrame is a 2-dimensional labeled data structure with columns of potentially different types.
You can think of it like a spreadsheet or SQL table,
or a dict of Series objects.

It is generally the most commonly used pandas object.
Like Series, DataFrame accepts many different kinds of input:

Dict of 1D ndarrays, lists, dicts, or Series
2-D numpy.ndarray
Structured or record ndarray
A Series
Another DataFrame

Along with the data, you can optionally pass index (row labels) and columns (column labels) arguments.
If you pass an index and / or columns,
you are guaranteeing the index and / or columns of the resulting DataFrame.
Thus, a dict of Series plus a specific index will discard all data
not matching up to the passed index.

If axis labels are not passed,
they will be constructed from the input data based on common sense rules.

"""


# from dict of Series or dicts
d2 = {
    "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    "two": pd.Series([10.0, 20.0, 30.0, 40.0], index=["a", "b", "c", "d"])
}
df1 = pd.DataFrame(d2)
print("DataFrame df1:", df1)

df2 = pd.DataFrame(d2, index=["d", "b", "a"])
print("DataFrame df2:", df2)
df2.rename(columns={'two': 'symbol'}, inplace=True)
print("DataFrame df2.rename:", df2)

df3 = pd.DataFrame(d2, index=["d", "b", "a"], columns=["two", "three"])
print("DataFrame df3:", df3)
DataFrame df1:    one   two
a 1.0 10.0
b 2.0 20.0
c 3.0 30.0
d NaN 40.0
DataFrame df2: one two
d NaN 40.0
b 2.0 20.0
a 1.0 10.0
DataFrame df2.rename: one symbol
d NaN 40.0
b 2.0 20.0
a 1.0 10.0
DataFrame df3: two three
d 40.0 NaN
b 20.0 NaN
a 10.0 NaN

# the row and column labels can be accessed respectively by accessing the index and columns attributes:
# 索引名稱
print("df3.index:", df3.index)
# 列名稱
print("df3.columns:", df3.columns)

df3.index: Index(['d', 'b', 'a'], dtype='object')
df3.columns: Index(['two', 'three'], dtype='object')

# from dict of ndarrays / lists
d = {
    "one":[1.0, 2.0, 3.0, 4.0],
    "two":[4.0, 3.0, 2.0, 1.0]
}
df4 = pd.DataFrame(d)
print("DataFrame df4:", df4)

df5 = pd.DataFrame(d, index=["a", "b", "c", "d"])
print("DataFrame df5:", df5)
DataFrame df4:    one  two
0 1.0 4.0
1 2.0 3.0
2 3.0 2.0
3 4.0 1.0
DataFrame df5: one two
a 1.0 4.0
b 2.0 3.0
c 3.0 2.0
d 4.0 1.0

# from structured or record array
# Field "C" is a fixed-width 10-byte string. It is spelled "S10" here because
# the legacy "a" type code (as in "a10") is deprecated as of NumPy 2.0; both
# spellings describe the same bytes dtype on older NumPy versions.
data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "S10")])
print("DataFrame data1:", data)
# Fill both records at once; the str values are stored as bytes in field "C".
data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
print("DataFrame data2:", data)
# Each field of the structured array becomes one DataFrame column.
df6 = pd.DataFrame(data)
print("DataFrame df6:", df6)

# Same data, with explicit row labels.
df7 = pd.DataFrame(data, index=["first", "second"])
print("DataFrame df7:", df7)

# Same data, with the columns listed in a different order.
df8 = pd.DataFrame(data, columns=["C", "A", "B"])
print("DataFrame df8:", df8)
DataFrame data1: [(0, 0., b'') (0, 0., b'')]
DataFrame data2: [(1, 2., b'Hello') (2, 3., b'World')]
DataFrame df6: A B C
0 1 2.0 b'Hello'
1 2 3.0 b'World'
DataFrame df7: A B C
first 1 2.0 b'Hello'
second 2 3.0 b'World'
DataFrame df8: C A B
0 b'Hello' 1 2.0
1 b'World' 2 3.0

# from a list of dicts
data2 = [
    {"a":1, "b":2},
    {"a":5,"b":10,"c":20}
]
df9 = pd.DataFrame(data2)
print("DataFrame df9:", df9)


print("DataFrame df10:", pd.DataFrame(data2, index=["first", "second"]))

# 只獲取columns 列出的那幾列數據
print("DataFrame df11:", pd.DataFrame(data2, columns=["a", "b"]))
DataFrame df9:    a   b     c
0 1 2 NaN
1 5 10 20.0
DataFrame df10: a b c
first 1 2 NaN
second 5 10 20.0
DataFrame df11: a b
0 1 2
1 5 10

# from a dict of tuples
df12 = pd.DataFrame(
    {
        ("a","b"):{("A", "B"):1, ("A", "C"):2},
        ("a", "a"): {("A", "C"): 3, ("A", "B"): 4},
        ("a", "c"): {("A", "B"): 5, ("A", "C"): 6},
        ("b", "a"): {("A", "C"): 7, ("A", "B"): 7},
        ("b", "b"): {("A", "D"): 9, ("A", "B"): 10}
    })
print("DataFrame df12:", df12)

DataFrame df12:        a              b      
       b a c a b
A B 1.0 4.0 5.0 7.0 10.0
  C 2.0 3.0 6.0 7.0 NaN
  D NaN NaN NaN NaN 9.0

# from a Series
ser = pd.Series(range(3), index=list("abc"), name="ser")
print("ser:", ser)
print("DataFrame df13:", pd.DataFrame(ser))
# Series 裏面定義的name,就是DataFrame裏面的列 名稱
print("DataFrame df14:", pd.DataFrame(ser, columns=["ser"]))
print("DataFrame df15:", pd.DataFrame(ser, columns=["ser", "name2"]))

ser: a    0
b 1
c 2
Name: ser, dtype: int64
DataFrame df13: ser
a 0
b 1
c 2
DataFrame df14: ser
a 0
b 1
c 2
DataFrame df15: ser name2
a 0 NaN
b 1 NaN
c 2 NaN

# from a list of namedtuples
from collections import  namedtuple
Point = namedtuple("Point", "x y")
print("DataFrame df16:", pd.DataFrame([Point(0,0), Point(0,3), (2,3)]))
Point3D = namedtuple("Point3D", "x y z")
print("DataFrame df17:", pd.DataFrame([Point3D(0,0,0), Point3D(0,3,5), Point(2,3)]))


DataFrame df16:    x  y
0 0 0
1 0 3
2 2 3
DataFrame df17: x y z
0 0 0 0.0
1 0 3 5.0
2 2 3 NaN

# from a list of dataclasses
from dataclasses import make_dataclass
Point = make_dataclass("Point", [("x", int), ("y", int)])
print("DataFrame df18:", pd.DataFrame([Point(0,0), Point(0,3), Point(2,3)]))



DataFrame df18:    x  y
0 0 0
1 0 3
2 2 3

Alternate constructors

"""
DataFrame.from_dict

DataFrame.from_dict() takes a dict of dicts or a dict of array-like sequences and returns a DataFrame.
It operates like the DataFrame constructor except for the orient parameter
which is 'columns' by default,
but which can be set to 'index' in order to use the dict keys as row labels.
"""


print("df19:",pd.DataFrame.from_dict(dict([("A", [1, 2, 3]), ("B", [4, 5, 6])])))

# orient="index" 時,列名和索引名交換了,相當於旋轉(轉置)了數組
# If you pass orient='index', the keys will be the row labels.
# In this case, you can also pass the desired column names:
print("df20:",pd.DataFrame.from_dict(
    dict([("A", [1, 2, 3]), ("B", [4, 5, 6])]),
    orient="index",
    columns=["one", "two", "three"],))

print("df21:",pd.DataFrame.from_dict(
    dict([("A", [1, 2, 3]), ("B", [4, 5, 6])]),
    orient="index"))

df19:    A  B
0 1 4
1 2 5
2 3 6
df20: one two three
A 1 2 3
B 4 5 6
df21: 0 1 2
A 1 2 3
B 4 5 6

DataFrame.from_records

DataFrame.from_records() takes a list of tuples or an ndarray with structured dtype.
It works analogously to the normal DataFrame constructor, except that the resulting DataFrame index may be a specific field of the structured dtype.

【暫時不理解】

Column selection, addition, deletion

"""
You can treat a DataFrame semantically like a dict of like-indexed Series objects.
Getting, setting, and deleting columns works with the same syntax as the analogous dict operations:

"""


# 訪問df 的某列,df的某列就是一個 Series
print("df1", df1)
print("df22", df1["one"])

df1["three"] = df1["one"] * df1["two"]
# 判斷df1["one"]裏面每個元素是否 大於2,結果是 一個Bool類型變量
df1["flag"] = df1["one"]>2
print("df23:", df1)
df1    one   two
a 1.0 10.0
b 2.0 20.0
c 3.0 30.0
d NaN 40.0
df22 a 1.0
b 2.0
c 3.0
d NaN
Name: one, dtype: float64
df23: one two three flag
a 1.0 10.0 10.0 False
b 2.0 20.0 40.0 False
c 3.0 30.0 90.0 True
d NaN 40.0 NaN False
# Columns can be deleted or popped like with a dict:
del df1["two"]
three = df1.pop("three")
print("df three:", three)
print("df24:", df1)
df three: a    10.0
b 40.0
c 90.0
d NaN
Name: three, dtype: float64
df24: one flag
a 1.0 False
b 2.0 False
c 3.0 True
d NaN False
# when inserting a scalar value, it will naturally be propagated to fill the column.
df1["foo"] = "bar"
print("df25:", df1)
df25:    one   flag  foo
a 1.0 False bar
b 2.0 False bar
c 3.0 True bar
d NaN False bar

# when inserting a Series that does not have the same index as the DataFrame,it will be conformed to the DataFrame's index
## slicing 切片知識補充
tag = "henry, hello slicing!"
print("tag[:]", tag[:])
# [) 切片是一個 半閉半開 的區間
print("tag[1:2]", tag[1:2])
print("tag[0:2]", tag[0:2])
print("tag[:2]", tag[:2])

print("Series [:]", df1["one"][:])
print("Series [:2]", df1["one"][:2])
df1["one_trunc"] = df1["one"][:2]
print("DataFrame df25:", df1)
tag[:] henry, hello slicing!
tag[1:2] e
tag[0:2] he
tag[:2] he
Series [:] a 1.0
b 2.0
c 3.0
d NaN
Name: one, dtype: float64
Series [:2] a 1.0
b 2.0
Name: one, dtype: float64
DataFrame df25: one flag foo one_trunc
a 1.0 False bar 1.0
b 2.0 False bar 2.0
c 3.0 True bar NaN
d NaN False bar NaN
# 指定位置插入一列
# You can insert raw ndarrays but their length must match the length of the DataFrame’s index.
# By default, columns get inserted at the end. DataFrame.insert() inserts at a particular location in the column
df1.insert(1,"insert_bar", df1["one"])
print("DataFrame df26:",df1)

DataFrame df26:    one  insert_bar   flag  foo  one_trunc
a 1.0 1.0 False bar 1.0
b 2.0 2.0 False bar 2.0
c 3.0 3.0 True bar NaN
d NaN NaN False bar NaN

# 通過 assign() 方法,從已有的列中 創造一個新的列
print("df1.head()", df1.head())
print("df1['one'].head()", df1["one"].head())
## assign 方法創造了新的列,但是不會改變之前的df數據,新的列是在 返回的數據裏面
df2 = df1.assign(new_col=df1["one"]/df1["one_trunc"])
print("DataFrame df27:",df1) # df1 還是之前的結構,沒有改變
print("DataFrame df28:",df2) # df2 纔是改變後的結構
df1.head()    one  insert_bar   flag  foo  one_trunc
a 1.0 1.0 False bar 1.0
b 2.0 2.0 False bar 2.0
c 3.0 3.0 True bar NaN
d NaN NaN False bar NaN
df1['one'].head() a 1.0
b 2.0
c 3.0
d NaN
Name: one, dtype: float64
DataFrame df27: one insert_bar flag foo one_trunc
a 1.0 1.0 False bar 1.0
b 2.0 2.0 False bar 2.0
c 3.0 3.0 True bar NaN
d NaN NaN False bar NaN
DataFrame df28: one insert_bar flag foo one_trunc new_col
a 1.0 1.0 False bar 1.0 1.0
b 2.0 2.0 False bar 2.0 1.0
c 3.0 3.0 True bar NaN NaN
d NaN NaN False bar NaN NaN

# 通過函數的方式來創建新的列
# In the example above, we inserted a precomputed value.
# We can also pass in a function of one argument to be evaluated on the DataFrame being assigned to.
df3 = df1.assign(func_col=lambda x:(x["one"]+10))
print("DataFrame df29:", df3)

DataFrame df29:    one  insert_bar   flag  foo  one_trunc  func_col
a 1.0 1.0 False bar 1.0 11.0
b 2.0 2.0 False bar 2.0 12.0
c 3.0 3.0 True bar NaN 13.0
d NaN NaN False bar NaN NaN

20230310.csv

SepalLength,SepalWidth,PetalLength,PetalWidth,Name
 5.1 ,  3.5 ,         1.4 ,        0.2  ,Iris-setosa
  4.9 , 3.0  ,        1.4  ,       0.2 , Iris-setosa
4.7,3.2,1.3 ,0.2,Iris-setosa
4.6 ,        3.1  ,        1.5 ,        0.2  ,Iris-setosa
 5.0  ,       3.6  ,        1.4  ,       0.2,  Iris-setosa


# assign() always returns a copy of the data, leaving the original DataFrame untouched.
#
# The CSV pads its fields with spaces after the commas. Without
# skipinitialspace=True a padded header is read with the padding included
# (e.g. '  PetalWidth'), and iris["PetalWidth"] then raises KeyError.
# skipinitialspace=True drops the spaces that follow each delimiter, so the
# column labels (and the Name values) come back without leading blanks.
iris = pd.read_csv("20230310.csv", skipinitialspace=True)
print("csv data:", iris)
print("iris.assign:",iris.assign(sepal_ratio=iris["SepalWidth"] + 10).head() )
print("iris cloumns:", iris.columns)
# Column access by exact label; this relies on the labels being free of padding.
print("csv data:[PetalWidth]", iris["PetalWidth"])
print("csv data:[PetalLength]", iris["PetalLength"])


# A callable passed to assign() is evaluated on the DataFrame being assigned to.
print("PetalRatio:",iris.assign(PetalRatio=lambda x: x.PetalWidth / x.PetalLength).head())
csv data:    SepalLength  SepalWidth  PetalLength  PetalWidth           Name
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
iris.assign: SepalLength SepalWidth PetalLength PetalWidth Name sepal_ratio
0 5.1 3.5 1.4 0.2 Iris-setosa 13.5
1 4.9 3.0 1.4 0.2 Iris-setosa 13.0
2 4.7 3.2 1.3 0.2 Iris-setosa 13.2
3 4.6 3.1 1.5 0.2 Iris-setosa 13.1
4 5.0 3.6 1.4 0.2 Iris-setosa 13.6
iris cloumns: Index(['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Name'], dtype='object')
csv data:[PetalWidth] 0 0.2
1 0.2
2 0.2
3 0.2
4 0.2
Name: PetalWidth, dtype: float64
csv data:[PetalLength] 0 1.4
1 1.4
2 1.3
3 1.5
4 1.4
Name: PetalLength, dtype: float64
PetalRatio: SepalLength SepalWidth PetalLength PetalWidth Name PetalRatio
0 5.1 3.5 1.4 0.2 Iris-setosa 0.142857
1 4.9 3.0 1.4 0.2 Iris-setosa 0.142857
2 4.7 3.2 1.3 0.2 Iris-setosa 0.153846
3 4.6 3.1 1.5 0.2 Iris-setosa 0.133333
4 5.0 3.6 1.4 0.2 Iris-setosa 0.142857

"""
(
iris.query("SepalLength > 5")
.assign(
SepalRatio=lambda x: x.SepalWidth / x.SepalLength,
PetalRatio=lambda x: x.PetalWidth / x.PetalLength,
).plot(kind="scatter", x="SepalRatio", y="PetalRatio")
)
"""

"""
The function signature for assign() is simply **kwargs.
The keys are the column names for the new fields,
and the values are either a value to be inserted (for example, a Series or NumPy array),
or a function of one argument to be called on the DataFrame.
A copy of the original DataFrame is returned, with the new values inserted.

The order of **kwargs is preserved. This allows for dependent assignment,
where an expression later in **kwargs can refer to a column created earlier in the same assign().
"""

# In the second expression, x['C'] will refer to the newly created column, that’s equal to dfa['A'] + dfa['B'].
dfa = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
dfb = dfa.assign(C=lambda x: x["A"] + x["B"], D=lambda x: x["A"] + x["C"])
print("dfa:", dfa)
print("dfb:", dfb)

dfa:    A  B
0 1 4
1 2 5
2 3 6
dfb: A B C D
0 1 4 5 6
1 2 5 7 9
2 3 6 9 12

Indexing / selection

Operation                           Syntax                  Result
select column : df[col] Series
select row by label : df.loc[label] Series
Select row by integer location: df.iloc[loc] Series
Slice rows: df[5:10] DataFrame
Select rows by boolean vector: df[bool_vec] DataFrame
# Row selection, for example, returns a Series whose index is the columns of the DataFrame:
print("df30:", df1)
## 選出 某一行(b 是一個索引值,選出這個索引的行)Select row by label
print("df31:", df1.loc["b"])


df30:    one  insert_bar   flag  foo  one_trunc
a 1.0 1.0 False bar 1.0
b 2.0 2.0 False bar 2.0
c 3.0 3.0 True bar NaN
d NaN NaN False bar NaN
df31: one 2
insert_bar 2
flag False
foo bar
one_trunc 2
Name: b, dtype: object
# Select row by integer location
print("df32:", df1.iloc[2])

df32: one              3
insert_bar 3
flag True
foo bar
one_trunc NaN
Name: c, dtype: object

Data alignment and arithmetic

"""
Data alignment between DataFrame objects automatically align on both the columns and the index (row labels).
Again, the resulting object will have the union of the column and row labels.
"""


df = pd.DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"])
df2 = pd.DataFrame(np.random.randn(7, 3), columns=["A", "B", "C"])
print("df40:", df)
print("df41:", df2)
print("df42:", df + df2)
print("df43:", df - df.iloc[0])

print("df44:", df * 5 + 2)
print("df45:", 1 / df)
print("df46:", df ** 4)


df40:           A         B         C         D
0 -1.084084 0.183785 -1.153985 0.055283
1 0.253552 0.077291 -0.303460 0.701300
2 0.821357 -1.116865 0.610512 -1.327411
3 -0.251630 -0.341660 0.730774 -0.584376
4 -1.215528 -0.691270 -0.552341 0.278375
5 -0.009694 -0.963907 1.593190 -0.991862
6 0.377235 -1.089611 -0.515518 -0.252789
7 -1.158782 -1.405582 -0.189190 -1.787201
8 1.469154 0.924453 -0.305886 -0.527754
9 -0.416995 1.469462 -1.107226 0.941600
df41: A B C
0 -0.184298 1.094119 -0.623001
1 -0.531990 -0.025734 -0.948708
2 0.877716 -1.547748 -0.753285
3 -0.248297 -1.370722 1.646786
4 0.958594 -0.373161 1.166930
5 -0.626382 1.731893 0.521530
6 -0.008678 0.955742 0.463842
df42: A B C D
0 -1.268382 1.277904 -1.776987 NaN
1 -0.278439 0.051558 -1.252168 NaN
2 1.699073 -2.664612 -0.142773 NaN
3 -0.499928 -1.712383 2.377560 NaN
4 -0.256934 -1.064431 0.614589 NaN
5 -0.636076 0.767986 2.114720 NaN
6 0.368556 -0.133869 -0.051676 NaN
7 NaN NaN NaN NaN
8 NaN NaN NaN NaN
9 NaN NaN NaN NaN
df43: A B C D
0 0.000000 0.000000 0.000000 0.000000
1 1.337636 -0.106494 0.850525 0.646017
2 1.905441 -1.300650 1.764497 -1.382694
3 0.832454 -0.525445 1.884759 -0.639659
4 -0.131444 -0.875055 0.601645 0.223092
5 1.074390 -1.147693 2.747175 -1.047145
6 1.461319 -1.273396 0.638467 -0.308072
7 -0.074698 -1.589367 0.964795 -1.842484
8 2.553238 0.740668 0.848100 -0.583037
9 0.667089 1.285677 0.046759 0.886317
df44: A B C D
0 -3.420421 2.918925 -3.769926 2.276415
1 3.267758 2.386457 0.482700 5.506499
2 6.106784 -3.584324 5.052559 -4.637055
3 0.741848 0.291698 5.653871 -0.921882
4 -4.077641 -1.456350 -0.761703 3.391873
5 1.951529 -2.819537 9.965949 -2.959312
6 3.886173 -3.448053 -0.577592 0.736056
7 -3.793910 -5.027910 1.054048 -6.936004
8 9.345768 6.622265 0.470572 -0.638769
9 -0.084974 9.347311 -3.536131 6.708002
df45: A B C D
0 -0.922438 5.441138 -0.866562 18.088755
1 3.943970 12.938049 -3.295328 1.425924
2 1.217498 -0.895364 1.637970 -0.753346
3 -3.974082 -2.926883 1.368412 -1.711226
4 -0.822688 -1.446613 -1.810477 3.592281
5 -103.155091 -1.037444 0.627672 -1.008204
6 2.650870 -0.917759 -1.939795 -3.955872
7 -0.862975 -0.711449 -5.285678 -0.559534
8 0.680664 1.081721 -3.269195 -1.894823
9 -2.398111 0.680521 -0.903158 1.062022
df46: A B C D
0 1.381186e+00 0.001141 1.773377 0.000009
1 4.133002e-03 0.000036 0.008480 0.241888
2 4.551214e-01 1.555974 0.138924 3.104715
3 4.009152e-03 0.013626 0.285189 0.116619
4 2.183033e+00 0.228345 0.093074 0.006005
5 8.831558e-09 0.863260 6.442732 0.967845
6 2.025099e-02 1.409565 0.070628 0.004083
7 1.803047e+00 3.903235 0.001281 10.202189
8 4.658744e+00 0.730364 0.008755 0.077576
9 3.023589e-02 4.662659 1.502954 0.786080


df1 = pd.DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}, dtype=bool)
df2 = pd.DataFrame({"a": [0, 1, 1], "b": [1, 1, 0]}, dtype=bool)
print("df1:", df1)
print("df2:", df2)
print("df47:", df1 & df2)
print("df48:", df1 | df2)
print("df49:", df1 ^ df2)
print("df50:", -df1)

df1:        a      b
0 True False
1 False True
2 True True
df2: a b
0 False True
1 True True
2 True False
df47: a b
0 False False
1 False True
2 True False
df48: a b
0 True True
1 True True
2 True True
df49: a b
0 True True
1 True False
2 False True
df50: a b
0 False True
1 True False
2 False False

## 旋轉 DataFrame 裏面的數據
print("df", df)
print("df[:5]", df[:5].T)



df A B C D
0 -1.084084 0.183785 -1.153985 0.055283
1 0.253552 0.077291 -0.303460 0.701300
2 0.821357 -1.116865 0.610512 -1.327411
3 -0.251630 -0.341660 0.730774 -0.584376
4 -1.215528 -0.691270 -0.552341 0.278375
5 -0.009694 -0.963907 1.593190 -0.991862
6 0.377235 -1.089611 -0.515518 -0.252789
7 -1.158782 -1.405582 -0.189190 -1.787201
8 1.469154 0.924453 -0.305886 -0.527754
9 -0.416995 1.469462 -1.107226 0.941600
df[:5] 0 1 2 3 4
A -1.084084 0.253552 0.821357 -0.251630 -1.215528
B 0.183785 0.077291 -1.116865 -0.341660 -0.691270
C -1.153985 -0.303460 0.610512 0.730774 -0.552341
D 0.055283 0.701300 -1.327411 -0.584376 0.278375

 

完整的腳本

#!/usr/bin/env python

import numpy as np
import pandas as pd

# ======================== Series ========================
#
# A Series is a one-dimensional labeled array that can hold any data type
# (integers, strings, floating point numbers, Python objects, ...).
# Its axis labels are collectively called the index.
# The basic constructor call is:
#     s = pd.Series(data, index=index)

# --- built from an ndarray ---
s1 = pd.Series(np.random.randn(5), index=list("abcde"))
print("pd s1:\n", s1)
print("pd s1.index:", s1.index)
print("pd s1.values:", s1.values)

# --- built from a dict: the keys become the index ---
d = dict(b=1, a=0, c=2)
s2 = pd.Series(d)
print("pd s2:\n", s2)

# With an explicit index the values are looked up by label; "d" is not a key
# of the dict, so that entry is missing (NaN).
s3 = pd.Series(d, index=list("bcda"))
print("pd s3:\n", s3)

# --- built from a scalar: the value is repeated for every index label ---
s4 = pd.Series(5.0, index=list("abcde"))
print("pd s4:\n", s4)

# ndarray-like access: read an element by position.
print("pd s3.array:\n", s3.array[1])
# dict-like access: read an element by index label, and test label membership.
print("pd s3.['b']:\n", s3["b"])
for label in ("b", "f"):
    print("test s3 key:", label in s3)

# Series.get() returns None, or the supplied default, for a missing label.
print("Series.get() method:", s3.get("name"))
for fallback in (np.nan, "henry"):
    print("Series.get() method:", s3.get("name", fallback))

# A Series also carries a name; rename() returns a new Series under the new name.
s5 = pd.Series(np.random.randn(5), name="henry")
print("pd s5:\n", s5)

s6 = s5.rename("henry2")
print("pd s6:\n", s6)
print("pd s6.head():\n", s6.head())
print("pd s6.head(2):\n", s6.head(2))

# ------------------------ DataFrame
#
# A DataFrame is a 2-dimensional labeled data structure with columns of
# potentially different types. You can think of it like a spreadsheet, a SQL
# table, or a dict of Series objects. It is generally the most commonly used
# pandas object. Like Series, DataFrame accepts many different kinds of input:
#
#   - Dict of 1D ndarrays, lists, dicts, or Series
#   - 2-D numpy.ndarray
#   - Structured or record ndarray
#   - A Series
#   - Another DataFrame
#
# Along with the data you can optionally pass index (row labels) and columns
# (column labels). If you pass them, you are guaranteeing the index and / or
# columns of the result; a dict of Series plus a specific index therefore
# discards all data not matching the passed index. If axis labels are not
# passed, they are constructed from the input data.

# from dict of Series or dicts
d2 = {
    "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    "two": pd.Series([10.0, 20.0, 30.0, 40.0], index=["a", "b", "c", "d"]),
}
# Without an explicit index the result covers the labels of both Series;
# "one" has no value for "d", so that cell is NaN.
df1 = pd.DataFrame(d2)
print("DataFrame df1:", df1)

# An explicit index selects (and orders) the rows to keep.
df2 = pd.DataFrame(d2, index=["d", "b", "a"])
print("DataFrame df2:", df2)
df2.rename(columns={"two": "symbol"}, inplace=True)
print("DataFrame df2.rename:", df2)

# A requested column that is not in the data ("three") is filled with NaN.
df3 = pd.DataFrame(d2, index=["d", "b", "a"], columns=["two", "three"])
print("DataFrame df3:", df3)

# The row and column labels can be accessed through the index and columns attributes.
# Row (index) labels:
print("df3.index:", df3.index)
# Column labels:
print("df3.columns:", df3.columns)


# from dict of ndarrays / lists
d = {
    "one": [1.0, 2.0, 3.0, 4.0],
    "two": [4.0, 3.0, 2.0, 1.0],
}
df4 = pd.DataFrame(d)
print("DataFrame df4:", df4)

df5 = pd.DataFrame(d, index=["a", "b", "c", "d"])
print("DataFrame df5:", df5)

# from structured or record array
# Field "C" is a fixed-width 10-byte string. It is spelled "S10" because the
# legacy "a" type code (as in "a10") is deprecated as of NumPy 2.0; both
# spellings describe the same bytes dtype on older NumPy versions.
data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "S10")])
print("DataFrame data1:", data)
# Fill both records at once; the str values are stored as bytes in field "C".
data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
print("DataFrame data2:", data)
df6 = pd.DataFrame(data)
print("DataFrame df6:", df6)

df7 = pd.DataFrame(data, index=["first", "second"])
print("DataFrame df7:", df7)

df8 = pd.DataFrame(data, columns=["C", "A", "B"])
print("DataFrame df8:", df8)


# from a list of dicts: each dict is one row; keys missing from a row become NaN.
data2 = [
    {"a": 1, "b": 2},
    {"a": 5, "b": 10, "c": 20},
]
df9 = pd.DataFrame(data2)
print("DataFrame df9:", df9)


print("DataFrame df10:", pd.DataFrame(data2, index=["first", "second"]))

# Keep only the columns listed in `columns`.
print("DataFrame df11:", pd.DataFrame(data2, columns=["a", "b"]))

# from a dict of tuples: tuple keys produce multi-level row and column labels.
df12 = pd.DataFrame(
    {
        ("a", "b"): {("A", "B"): 1, ("A", "C"): 2},
        ("a", "a"): {("A", "C"): 3, ("A", "B"): 4},
        ("a", "c"): {("A", "B"): 5, ("A", "C"): 6},
        ("b", "a"): {("A", "C"): 7, ("A", "B"): 7},
        ("b", "b"): {("A", "D"): 9, ("A", "B"): 10},
    }
)
print("DataFrame df12:", df12)

# from a Series
ser = pd.Series(range(3), index=list("abc"), name="ser")
print("ser:", ser)
print("DataFrame df13:", pd.DataFrame(ser))
# The name of the Series is used as the column name of the DataFrame.
print("DataFrame df14:", pd.DataFrame(ser, columns=["ser"]))
print("DataFrame df15:", pd.DataFrame(ser, columns=["ser", "name2"]))


# from a list of namedtuples
from collections import namedtuple
Point = namedtuple("Point", "x y")
print("DataFrame df16:", pd.DataFrame([Point(0, 0), Point(0, 3), (2, 3)]))
Point3D = namedtuple("Point3D", "x y z")
# The shorter Point row has no "z" value, so that cell is NaN.
print("DataFrame df17:", pd.DataFrame([Point3D(0, 0, 0), Point3D(0, 3, 5), Point(2, 3)]))

# from a list of dataclasses
# NOTE: this rebinds the name Point from the namedtuple above to a dataclass.
from dataclasses import make_dataclass
Point = make_dataclass("Point", [("x", int), ("y", int)])
print("DataFrame df18:", pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)]))


# ======================== Alternate constructors ========================
#
# DataFrame.from_dict
#
# DataFrame.from_dict() takes a dict of dicts or a dict of array-like
# sequences and returns a DataFrame. It operates like the DataFrame
# constructor except for the orient parameter, which is 'columns' by default
# but can be set to 'index' to use the dict keys as row labels.

# The same two-key mapping feeds all three calls below.
letter_columns = {"A": [1, 2, 3], "B": [4, 5, 6]}

# Default orient: the dict keys are the column labels.
print("df19:", pd.DataFrame.from_dict(letter_columns))

# orient="index": the dict keys are the row labels instead, i.e. the table is
# turned on its side. In this case the desired column names can be passed too.
by_row_named = pd.DataFrame.from_dict(
    letter_columns,
    orient="index",
    columns=["one", "two", "three"],
)
print("df20:", by_row_named)

# orient="index" without column names.
by_row_default = pd.DataFrame.from_dict(letter_columns, orient="index")
print("df21:", by_row_default)


# DataFrame.from_records
#
# DataFrame.from_records() takes a list of tuples or an ndarray with
# structured dtype. It works analogously to the normal DataFrame constructor,
# except that the resulting DataFrame index may be a specific field of the
# structured dtype.


# ----------------- Column selection, addition, deletion
#
# You can treat a DataFrame semantically like a dict of like-indexed Series
# objects. Getting, setting, and deleting columns works with the same syntax
# as the analogous dict operations.

# Selecting one column of the DataFrame yields a Series.
print("df1", df1)
print("df22", df1["one"])

df1["three"] = df1["one"] * df1["two"]
# Element-wise comparison: tests each value of df1["one"] against 2 and
# stores the resulting booleans as a new column.
df1["flag"] = df1["one"] > 2
print("df23:", df1)


# Columns can be deleted or popped like with a dict:
del df1["two"]
three = df1.pop("three")
print("df three:", three)
print("df24:", df1)

# When inserting a scalar value, it is propagated to fill the whole column.
df1["foo"] = "bar"
print("df25:", df1)

# When inserting a Series that does not have the same index as the DataFrame,
# it is conformed to the DataFrame's index.

## Refresher on slicing
tag = "henry, hello slicing!"
print("tag[:]", tag[:])
# A slice is a half-open interval [start, stop): stop itself is excluded.
print("tag[1:2]", tag[1:2])
print("tag[0:2]", tag[0:2])
print("tag[:2]", tag[:2])

print("Series [:]", df1["one"][:])
print("Series [:2]", df1["one"][:2])
# Only the first two rows carry a value; the other rows of the new column are NaN.
df1["one_trunc"] = df1["one"][:2]
print("DataFrame df25:", df1)


# Insert a column at a given position.
# You can insert raw ndarrays but their length must match the length of the DataFrame's index.
# By default, columns get inserted at the end. DataFrame.insert() inserts at a particular location in the columns.
df1.insert(1, "insert_bar", df1["one"])
print("DataFrame df26:", df1)


# Use assign() to derive a new column from existing ones.
print("df1.head()", df1.head())
print("df1['one'].head()", df1["one"].head())
## assign() builds the new column in the DataFrame it returns; the DataFrame
## it is called on is not modified.
df2 = df1.assign(new_col=df1["one"] / df1["one_trunc"])
print("DataFrame df27:", df1)  # df1 still has its previous structure
print("DataFrame df28:", df2)  # df2 is the one carrying the new column

# Create the new column from a function.
# In the example above, we inserted a precomputed value.
# We can also pass in a function of one argument to be evaluated on the DataFrame being assigned to.
df3 = df1.assign(func_col=lambda x: (x["one"] + 10))
print("DataFrame df29:", df3)

# assign() always returns a copy of the data, leaving the original DataFrame untouched.
#
# The CSV pads its fields with spaces after the commas. Without
# skipinitialspace=True a padded header is read with the padding included
# (e.g. '  PetalWidth'), and iris["PetalWidth"] then raises KeyError.
# skipinitialspace=True drops the spaces that follow each delimiter, so the
# column labels (and the Name values) come back without leading blanks.
iris = pd.read_csv("20230310.csv", skipinitialspace=True)
print("csv data:", iris)
print("iris.assign:", iris.assign(sepal_ratio=iris["SepalWidth"] + 10).head())
print("iris cloumns:", iris.columns)
# Column access by exact label; this relies on the labels being free of padding.
print("csv data:[PetalWidth]", iris["PetalWidth"])
print("csv data:[PetalLength]", iris["PetalLength"])


print("PetalRatio:", iris.assign(PetalRatio=lambda x: x.PetalWidth / x.PetalLength).head())

# Disabled example, kept as a bare string so it is never executed: chain
# query() -> assign() -> plot() without naming an intermediate DataFrame.
# PetalRatio divides PetalWidth by PetalLength, matching the print above
# (dividing PetalLength by itself would always give 1.0).
"""
(
    iris.query("SepalLength > 5")
    .assign(
        SepalRatio=lambda x: x.SepalWidth / x.SepalLength,
        PetalRatio=lambda x: x.PetalWidth / x.PetalLength,
    )
    .plot(kind="scatter", x="SepalRatio", y="PetalRatio")
)
"""


# The function signature for assign() is simply **kwargs.
# The keys are the column names for the new fields,
# and the values are either a value to be inserted (for example, a Series or NumPy array),
# or a function of one argument to be called on the DataFrame.
# A copy of the original DataFrame is returned, with the new values inserted.
#
# The order of **kwargs is preserved. This allows for dependent assignment,
# where an expression later in **kwargs can refer to a column created earlier in the same assign().
# Dependent assignment: the callable for D runs after C has been added, so
# frame["C"] inside it is the freshly created column (dfa["A"] + dfa["B"]).
dfa = pd.DataFrame(dict(A=[1, 2, 3], B=[4, 5, 6]))
dfb = dfa.assign(
    C=lambda frame: frame["A"] + frame["B"],
    D=lambda frame: frame["A"] + frame["C"],
)
print("dfa:", dfa)
print("dfb:", dfb)

# ======================== Indexing / selection ========================
#
# Operation                           Syntax                  Result
# Select column:                      df[col]                 Series
# Select row by label:                df.loc[label]           Series
# Select row by integer location:     df.iloc[loc]            Series
# Slice rows:                         df[5:10]                DataFrame
# Select rows by boolean vector:      df[bool_vec]            DataFrame

# Selecting a single row returns a Series whose index is the DataFrame's columns.
print("df30:", df1)

# By label: "b" is an index value, and the row carrying that label is returned.
row_by_label = df1.loc["b"]
print("df31:", row_by_label)

# By integer position.
row_by_position = df1.iloc[2]
print("df32:", row_by_position)


# ==================== Data alignment and arithmetic ====================
#
# Arithmetic between DataFrame objects aligns automatically on both the
# columns and the index (row labels); the result has the union of the column
# and row labels.

df = pd.DataFrame(np.random.randn(10, 4), columns=list("ABCD"))
df2 = pd.DataFrame(np.random.randn(7, 3), columns=list("ABC"))
print("df40:", df)
print("df41:", df2)
# Labels present in only one operand (column D, rows 7-9) come out as NaN.
print("df42:", df + df2)
# DataFrame minus one of its own rows (a Series).
first_row = df.iloc[0]
print("df43:", df - first_row)

# Arithmetic with scalars is applied element-wise.
print("df44:", df * 5 + 2)
print("df45:", 1 / df)
print("df46:", df ** 4)


# Boolean operators are applied element-wise as well.
df1 = pd.DataFrame(dict(a=[1, 0, 1], b=[0, 1, 1]), dtype=bool)
df2 = pd.DataFrame(dict(a=[0, 1, 1], b=[1, 1, 0]), dtype=bool)
print("df1:", df1)
print("df2:", df2)
for tag, combined in (
    ("df47:", df1 & df2),
    ("df48:", df1 | df2),
    ("df49:", df1 ^ df2),
    ("df50:", -df1),
):
    print(tag, combined)


## Transpose: .T swaps the rows and columns of the DataFrame
print("df", df)
first_five_rows = df[:5]
print("df[:5]", first_five_rows.T)
View Code

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章