2.9 - 3.2 pandas - 台部落

2.9 結構化數據

import numpy as np

# 使用複合數據結構的結構化數組

data = np.zeros(4, dtype={'names': ('name', 'age', 'weight'),

                         'formats': ('U10', 'i4', 'f8')})

data.dtype

dtype([('name', '<U10'), ('age', '<i4'), ('weight', '<f8')])

# 填入數據

name = ['Alice', 'Bob', 'Cathy', 'Doug']

age = [25, 45, 34, 19]

weight = [55.0, 86.5, 68, 61.5]

data['name'] = name

data['age'] = age

data['weight'] = weight

data

array([('Alice', 25, 55. ), ('Bob', 45, 86.5), ('Cathy', 34, 68. ),
       ('Doug', 19, 61.5)],
      dtype=[('name', '<U10'), ('age', '<i4'), ('weight', '<f8')])

# 訪問

# 獲取所有名字

data['name']

array(['Alice', 'Bob', 'Cathy', 'Doug'], dtype='<U10')

# 獲取第一行

data[0]

('Alice', 25, 55.)

# 獲取最後一行的名字

data[-1]['name']

'Doug'

# 利用掩碼篩選

# 年齡小於30的數據行的名字字段

data[data['age'] < 30]['name']

array(['Alice', 'Doug'], dtype='<U10')

3.2 Pandas 對象

import pandas as pd

pd.__version__

'0.23.3'

Pandas 的 Series 對象

是一個帶索引數據構成的一維數組，可用一個數組創建之。

data = pd.Series([0.25, 0.5, 0.75, 1.0])

data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

data.values

array([0.25, 0.5 , 0.75, 1.  ])

data.index

RangeIndex(start=0, stop=4, step=1)

# 索引訪問

data[1]

0.5

data[1:3]

1    0.50
2    0.75
dtype: float64

# 索引可以顯式定義

data = pd.Series([0.25, 0.5, 0.75, 1], index=['a', 'b', 'c', 'd'])

data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

# Series 是特殊的字典

# 可以用python的字典創建Series對象

popu_dict={'e': 6622, 'b': 5644, 'c': 9022, 'd': 1222111}

popu = pd.Series(popu_dict)

popu

e       6622
b       5644
c       9022
d    1222111
dtype: int64

# 與一般字典不同，Series支持切片

popu['b':'d']

b       5644
c       9022
d    1222111
dtype: int64

# 創建Series對象的其他方法

# 值爲標量，填充到索引

pd.Series(5, index=[100, 200, 10])

100    5
200    5
10     5
dtype: int64

# 值爲字典，索引默認爲字典鍵（如下方輸出所示，保留字典的插入順序，並未排序）

pd.Series({2: 'a', 1: 'b', 3: 'c'})

2    a
1    b
3    c
dtype: object

# 每種形式都可以顯式指定索引，從而篩選需要的結果

pd.Series({2: 'a', 1: 'b', 3: 'c'}, index=[3, 2])

3    c
2    a
dtype: object

Pandas 的 DataFrame 對象

既可以作爲通用的 Numpy 數組，又可以作爲特殊的 Python 字典來看待

# DF 是既有靈活行索引，又有靈活列名的二維數組

area_dict = {'e': 50, 'b': 46, 'c': 66, 'd': 211}

area = pd.Series(area_dict)

area

e     50
b     46
c     66
d    211
dtype: int64

# 結合上邊的popu和area創建一個DF對象

# 用字典作爲參數創建

states = pd.DataFrame({'popu': popu, 'area': area})

states

	popu	area
e	6622	50
b	5644	46
c	9022	66
d	1222111	211

# DF的行索引

states.index

Index(['e', 'b', 'c', 'd'], dtype='object')

# DF的列名

states.columns

Index(['popu', 'area'], dtype='object')

# DF可看作字典，用列名當鍵，返回該列對應的 Series 對象

states['area']

e     50
b     46
c     66
d    211
Name: area, dtype: int64

創建DF對象的多個方法：

# 1 通過單個Series對象創建單列DF

pd.DataFrame(popu, columns=['population'])

	population
e	6622
b	5644
c	9022
d	1222111

# 2 通過字典列表創建。任何元素是字典的列表都可以變爲DF

# 行索引未指定，默認爲整數

data = [{'a': i+5, 'b': 2*(i+5)} for i in range(3)]

data

[{'a': 5, 'b': 10}, {'a': 6, 'b': 12}, {'a': 7, 'b': 14}]

pd.DataFrame(data)

	a	b
0	5	10
1	6	12
2	7	14

# 即使字典有缺少的值，DF也用NaN（不是數字）代替

pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

	a	b	c
0	1.0	2	NaN
1	NaN	3	4.0

# 3 通過Series對象字典創建DF，如前

pd.DataFrame({'population': popu, 'area': area})

	population	area
e	6622	50
b	5644	46
c	9022	66
d	1222111	211

# 4 通過Numpy二維數組創建。

# 如不顯式指定行列的索引值，則默認均爲整數索引值

pd.DataFrame(np.random.rand(3, 2), columns=['foo', 'bar'], index=['a','b','c'])

	foo	bar
a	0.761191	0.647341
b	0.147634	0.271593
c	0.438096	0.308335

# 5 通過Numpy結構化數組創建。

A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

pd.DataFrame(A)

	A	B
0	0	0.0
1	0	0.0
2	0	0.0

Pandas 的 Index 對象

可看作是不可變數組或有序集合（實爲多集，因值可重複）

ind = pd.Index([2, 3, 5, 7, 11])

ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

# 可索引可切片

ind[3]

ind[::2]

Int64Index([2, 5, 11], dtype='int64')

# 屬性與np數組相似

print(ind.size, ind.shape, ind.ndim, ind.dtype)

5 (5,) 1 int64

# index對象的值是不可更改的，如下句會出錯：

ind[0] = 8

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-41-11ff608b5603> in <module>()
      1 # index對象的值是不可更改的，如下句會出錯：
----> 2 ind[0] = 8

c:\program files\python36-32\lib\site-packages\pandas\core\indexes\base.py in __setitem__(self, key, value)
   2063 
   2064     def __setitem__(self, key, value):
-> 2065         raise TypeError("Index does not support mutable operations")
   2066 
   2067     def __getitem__(self, key):

TypeError: Index does not support mutable operations

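# Index 對象的集合操作
# 注意：以下 &、|、^ 運算符寫法適用於本文使用的 pandas 0.23；
# 新版 pandas（2.0 起）中這些運算符不再表示集合運算，
# 應改用 indA.intersection(indB)、indA.union(indB)、indA.symmetric_difference(indB)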

indA = pd.Index([1, 3, 5, 7, 9])

indB = pd.Index([2, 3, 5, 7, 11])

# 交集

indA & indB

Int64Index([3, 5, 7], dtype='int64')

# 並集

indA | indB

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

# 異或

indA ^ indB

Int64Index([1, 2, 9, 11], dtype='int64')

2.9 - 3.2 pandas

2.9 結構化數據

3.2 Pandas 對象

Pandas 的 Series 對象

Pandas 的 DataFrame 對象

Pandas 的 Index 對象

3.1 用ffmpeg解決音畫不同步問題

視頻編輯任務大綱

2.1 用ffmpeg分割視頻

2.2 用ffmpeg粗略分割視頻的快速方法

3.11 向量化字符串操作

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結