pandas基礎學習

import numpy as np
import pandas as pd

print(pd.__version__)#檢查版本
print(pd.show_versions(as_json=True))#檢查版本

# 如何從列表，numpy數組和字典創建系列
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))

# 解
s1=pd.Series(mylist)
s2=pd.Series(myarr)
s3=pd.Series(mydict)
print("*"*50)
print(s1.head())#獲取前五個元素
print(s2.head())
print(s3.head())

0.25.1
{'system': {'commit': None, 'python': '3.7.4.final.0', 'python-bits': 64, 'OS': 'Windows', 'OS-release': '10', 'machine': 'AMD64', 'processor': 'Intel64 Family 6 Model 78 Stepping 3, GenuineIntel', 'byteorder': 'little', 'LC_ALL': 'None', 'LANG': 'None', 'LOCALE': 'None.None'}, 'dependencies': {'pandas': '0.25.1', 'numpy': '1.17.2', 'pytz': '2019.2', 'dateutil': '2.8.0', 'pip': '19.3.1', 'setuptools': '41.4.0', 'Cython': None, 'pytest': '4.3.0', 'hypothesis': None, 'sphinx': None, 'blosc': None, 'feather': None, 'xlsxwriter': '1.2.5', 'lxml.etree': '4.4.1', 'html5lib': None, 'pymysql': '0.9.2', 'psycopg2': None, 'jinja2': '2.10.1', 'IPython': '7.9.0', 'pandas_datareader': None, 'bs4': '4.8.0', 'bottleneck': None, 'fastparquet': None, 'gcsfs': None, 'matplotlib': '3.1.1', 'numexpr': None, 'odfpy': None, 'openpyxl': None, 'pandas_gbq': None, 'pyarrow': None, 'pytables': None, 's3fs': None, 'scipy': '1.3.1', 'sqlalchemy': '1.3.10', 'tables': None, 'xarray': None, 'xlrd': None, 'xlwt': '1.3.0'}}
None
**************************************************
0    a
1    b
2    c
3    e
4    d
dtype: object
0    0
1    1
2    2
3    3
4    4
dtype: int32
a    0
b    1
c    2
e    3
d    4
dtype: int64

# 將序列ser轉換爲數據框，並將其索引作爲數據框的另一列
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser = pd.Series(mydict)

# 解
df=ser.to_frame().reset_index()
df.head()

	index	0
0	a	0
1	b	1
2	c	2
3	e	3
4	d	4

# 如何結合多個系列形成一個數據框
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))

# 解1
df1=pd.concat([ser1,ser2],axis=1)

# 解2
df2=pd.DataFrame({"col_1":ser1,"col_2":ser2})
print(df2.head())

  col_1  col_2
0     a      0
1     b      1
2     c      2
3     e      3
4     d      4

# 如何爲系列索引指定名稱
ser = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))

# 解
ser.name="alphabets"
print(ser.head())

0    a
1    b
2    c
3    e
4    d
Name: alphabets, dtype: object

# 如何從中ser1刪除存在的項目ser2
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

# 解
print(ser1[~ser1.isin(ser2)])

0    1
1    2
2    3
dtype: int64

# 如何獲得A系列和B系列都不通用的物品
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

# 解
ser_u = pd.Series(np.union1d(ser1, ser2))  # 聯合
print(ser_u)
ser_i = pd.Series(np.intersect1d(ser1, ser2))  # 相交
print("-"*50)
print(ser_i)
print("-"*50)
print(ser_u[~ser_u.isin(ser_i)])#從【聯合中的數據】刪除【相交中的數據】

0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
dtype: int64
--------------------------------------------------
0    4
1    5
dtype: int64
--------------------------------------------------
0    1
1    2
2    3
5    6
6    7
7    8
dtype: int64

# 如何獲得數字序列【計算的最小值，第25個百分點，中位數，第75個和最大值ser】
state = np.random.RandomState(100)
ser = pd.Series(state.normal(10, 5, 25))
print(ser)

# 解
result=np.percentile(ser, q=[0, 25, 50, 75, 100])
print(result)

0      1.251173
1     11.713402
2     15.765179
3      8.737820
4     14.906604
5     12.571094
6     11.105898
7      4.649783
8      9.052521
9     11.275007
10     7.709865
11    12.175817
12     7.082025
13    14.084235
14    13.363604
15     9.477944
16     7.343598
17    15.148663
18     7.809322
19     4.408409
20    18.094908
21    17.708026
22     8.740604
23     5.787821
24    10.922593
dtype: float64
[ 1.25117263  7.70986507 10.92259345 13.36360403 18.0949083 ]

# 如何獲得系列中唯一項目的頻率計數
ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))
# print(ser)

# 解
print(ser.value_counts())

g    7
e    6
c    5
f    5
h    3
a    2
b    2
dtype: int64

# 如何僅保留最常見的前2個值並將其他所有值替換爲“Other”
np.random.RandomState(100)
ser=pd.Series(np.random.randint(1,5,12))#1到5的12個數
# print(ser)

# 解
# 獲取最多的前面兩個數值，並把其他數據變成"Other"
ser[~ser.isin(ser.value_counts().index[:2])]="Other"
print(ser)

0     Other
1         1
2         1
3     Other
4         4
5         4
6     Other
7         4
8     Other
9         1
10        4
11        1
dtype: object

# 將系列重塑ser爲具有7行5列的數據框
ser = pd.Series(np.random.randint(1, 10, 35))
# print(ser)

# 解
df=pd.DataFrame(ser.values.reshape(7,5))
print(df)

   0  1  2  3  4
0  7  9  2  3  9
1  4  8  9  9  3
2  7  7  8  6  5
3  4  2  8  4  6
4  1  9  4  2  5
5  8  4  8  3  1
6  4  4  5  2  7

# 從中找到除於3等於0的下標
ser = pd.Series(np.random.randint(1, 10, 7))
print(ser)

# 解
result=np.argwhere(ser%3==0)
result

0    1
1    6
2    5
3    4
4    9
5    6
6    1
dtype: int32





array([[1],
       [4],
       [5]], dtype=int64)

# 從中ser，提取列表中位置的項目pos
ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 8, 14, 20]

# 解
result=ser.take(pos)
result

0     a
4     e
8     i
14    o
20    u
dtype: object

# 如何垂直和水平堆疊兩個系列
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))

# 解
ser1.append(ser2)#垂直
print(ser1)

df = pd.concat([ser1, ser2], axis=1)#水平
df

0    0
1    1
2    2
3    3
4    4
dtype: int64

	0	1
0	0	a
1	1	b
2	2	c
3	3	d
4	4	e

# 對【ser2系列】的值，查找【ser1系列】對應的下標
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])

# 解
result=[np.where(i==ser1)[0].tolist()[0] for i in ser2]
print(result)
# 解
result=[pd.Index(ser1).get_loc(i) for i in ser2]
result

[5, 4, 0, 8]





[5, 4, 0, 8]

# 計算truth和pred系列的均方誤差
truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10)

np.mean((truth-pred)**2)

0.42318488444073726

# 將中每個單詞的第一個字符更改爲大寫ser
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

# 解
pd.Series([i.title() for i in ser])

0     How
1      To
2    Kick
3    Ass?
dtype: object

# 計算系列中每個單詞的字符數
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

# 解
ser.map(lambda x:len(x))

0    3
1    2
2    4
3    4
dtype: int64

# 計算一系列結果數之間的差異(兩值之間的差值)
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])

# 解
print(ser.tolist())
print(ser.diff().tolist())

[1, 3, 6, 10, 15, 21, 27, 35]
[nan, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0, 8.0]

# 將一系列日期字符串轉換爲時間序列
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303',
                 '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

# 解
pd.to_datetime(ser)

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

# 對時間的操作
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', 
                 '2013/04/04', '2015-11-27', '2019-11-27T12:20'])

# 解
from dateutil.parser import parse
ser_ts = ser.map(lambda x: parse(x))#轉換成統一日期

print("這個月的第幾天: ", ser_ts.dt.day.tolist())#這個月的第幾天
print("這一年的第幾周: ", ser_ts.dt.weekofyear.tolist())#這一年的第幾周
print("這一年的第幾天: ", ser_ts.dt.dayofyear.tolist())#這一年的第幾天
print("這一天的星期幾: ", ser_ts.dt.weekday_name.tolist())#這一天的星期幾

這個月的第幾天:  [1, 2, 3, 4, 27, 27]
這一年的第幾周:  [53, 5, 9, 14, 48, 48]
這一年的第幾天:  [1, 33, 63, 94, 331, 331]
這一天的星期幾:  ['Friday', 'Wednesday', 'Saturday', 'Thursday', 'Friday', 'Wednesday']

# 已知年月的數值，現在設置天數爲4
ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])

# 解
from dateutil.parser import parse
ser.map(lambda x: parse('04 ' + x))

0   2010-01-04
1   2011-02-04
2   2012-03-04
dtype: datetime64[ns]

# 從中ser，提取包含至少2個元音的單詞
ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])

# 解
from collections import Counter
mask = ser.map(lambda x: sum([Counter(x.lower()).get(i, 0) 
                              for i in list('aeiou')]) >= 2)
ser[mask]

0     Apple
1    Orange
4     Money
dtype: object

import re
# 從系列中過濾有效的電子郵件
emails = pd.Series(['buying books at amazom.com', '[email protected]',
                    '[email protected]', '[email protected]'])
pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'

# 解
emails.str.findall(pattern, flags=re.IGNORECASE)

0                     []
1    [[email protected]]
2            [[email protected]]
3    [[email protected]]
dtype: object

# 計算weights每個的平均值fruit
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))
weights = pd.Series(np.linspace(1, 10, 10))

# 解
weights.groupby(fruit).mean()

apple     5.428571
banana    7.500000
carrot    2.000000
dtype: float64

# 計算兩個序列之間的歐式距離
p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])

# 解
result=sum((p - q)**2)**.5#方式一
print(result)
result=np.linalg.norm(p-q)#方式二
result

18.16590212458495





18.16590212458495

# 找到一個數值序列中的所有局部最大值（或峯值）
ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])

# 解
dd=np.diff(np.sign(np.diff(ser)))
print(dd)# -2 就是某個點，兩邊的值比它小
peak_locs=np.where(dd==-2)[0]+1
print(peak_locs)

[-2  2  0  0 -2  2 -2]
[1 5 7]

# 用最不頻繁的字符替換字符串中的缺失空格
my_str = 'dbc deb abed gade'
ser = pd.Series(list('dbc deb abed gade'))

# 解
freq = ser.value_counts()#查看數值出現的頻率
print(freq)
least_freq = freq.dropna().index[-1]#獲取頻率最小那個
result="".join(ser.replace(' ', least_freq))
result

d    4
     3
e    3
b    3
a    2
g    1
c    1
dtype: int64





'dbccdebcabedcgade'

# 如何創建一個從2000年1月1日開始和之後的10個週末（星期六）的TimeSeries，
#【並使用隨機數作爲值】
ser = pd.Series(np.random.randint(1,10,10), 
                pd.date_range('2000-01-01', periods=10, freq='W-SAT'))
print(ser)

2000-01-01    7
2000-01-08    7
2000-01-15    3
2000-01-22    1
2000-01-29    4
2000-02-05    8
2000-02-12    7
2000-02-19    8
2000-02-26    3
2000-03-04    4
Freq: W-SAT, dtype: int32

# ser缺少日期和值。顯示所有缺少的日期，並填充上一個日期的值
ser = pd.Series([1,10,3,np.nan], index=pd.to_datetime([
                    '2000-01-01', '2000-01-03', '2000-01-06', '2000-01-08']))
print(ser)
result=ser.resample('D').ffill()#降序替換
print(result)
# ser.resample('D').bfill()#逆序替換
# ser.resample('D').bfill().ffill()#替換成前面的值（比如：都是3.0）

2000-01-01     1.0
2000-01-03    10.0
2000-01-06     3.0
2000-01-08     NaN
dtype: float64
2000-01-01     1.0
2000-01-02     1.0
2000-01-03    10.0
2000-01-04    10.0
2000-01-05    10.0
2000-01-06     3.0
2000-01-07     3.0
2000-01-08     NaN
Freq: D, dtype: float64

# 從csv文件中的每第n行導入一個數據框
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', chunksize=50)

df2 = pd.concat([chunk.iloc[0] for chunk in df], axis=1)
df2 = df2.transpose()#轉置操作
df2

	crim	zn	indus	nox	rm	age	dis	rad	tax	ptratio	b	lstat	medv
0	0.00632	18.0	2.31	0.538	6.575	65.2	4.0900	1.0	296.0	15.3	396.90	4.98	24.0
50	0.08873	21.0	5.64	0.439	5.963	45.7	6.8147	4.0	243.0	16.8	395.56	13.45	19.7
100	0.14866	0.0	8.56	0.520	6.727	79.9	2.7778	5.0	384.0	20.9	394.76	9.42	27.5
150	1.65660	0.0	19.58	0.871	6.122	97.3	1.6180	5.0	403.0	14.7	372.80	14.10	21.5
200	0.01778	95.0	1.47	0.403	7.135	13.9	7.6534	3.0	402.0	17.0	384.30	4.45	32.9
250	0.14030	22.0	5.86	0.431	6.487	13.0	7.3967	7.0	330.0	19.1	396.28	5.90	24.4
300	0.04417	70.0	2.24	0.400	6.871	47.4	7.8278	5.0	358.0	14.8	390.86	6.07	24.8
350	0.06211	40.0	1.25	0.429	6.490	44.4	8.7921	1.0	335.0	19.7	396.90	5.98	22.9
400	25.04610	0.0	18.10	0.693	5.987	100.0	1.5888	24.0	666.0	20.2	396.90	26.77	5.6
450	6.71772	0.0	18.10	0.713	6.749	92.6	2.3236	24.0	666.0	20.2	0.32	17.44	13.4
500	0.22438	0.0	9.69	0.585	6.027	79.7	2.4982	6.0	391.0	19.2	396.90	14.33	16.8

# 將csv導入數據框時更改列值
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', 
                converters={'medv': lambda x: 'High' if float(x) > 25 else 'Low'})
df

	crim	zn	indus	chas	nox	rm	age	dis	rad	tax	ptratio	b	lstat	medv
0	0.00632	18.0	2.31	0	0.538	6.575	65.2	4.0900	1	296	15.3	396.90	4.98	Low
1	0.02731	0.0	7.07	0	0.469	6.421	78.9	4.9671	2	242	17.8	396.90	9.14	Low
2	0.02729	0.0	7.07	0	0.469	7.185	61.1	4.9671	2	242	17.8	392.83	4.03	High
3	0.03237	0.0	2.18	0	0.458	6.998	45.8	6.0622	3	222	18.7	394.63	2.94	High
4	0.06905	0.0	2.18	0	0.458	7.147	54.2	6.0622	3	222	18.7	396.90	5.33	High
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
501	0.06263	0.0	11.93	0	0.573	6.593	69.1	2.4786	1	273	21.0	391.99	9.67	Low
502	0.04527	0.0	11.93	0	0.573	6.120	76.7	2.2875	1	273	21.0	396.90	9.08	Low
503	0.06076	0.0	11.93	0	0.573	6.976	91.0	2.1675	1	273	21.0	396.90	5.64	Low
504	0.10959	0.0	11.93	0	0.573	6.794	89.3	2.3889	1	273	21.0	393.45	6.48	Low
505	0.04741	0.0	11.93	0	0.573	6.030	80.8	2.5050	1	273	21.0	396.90	7.88	Low

506 rows × 14 columns

# 創建一個數據框，將行作爲給定序列的跨步
L = pd.Series(range(15))

def gen_strides(a, stride_len=5, window_len=5):
    n_strides = ((a.size-window_len)//stride_len) + 1
    return np.array([a[s:(s+window_len)] for s in np.arange(0, a.size, stride_len)[:n_strides]])

gen_strides(L, stride_len=2, window_len=4)

array([[ 0,  1,  2,  3],
       [ 2,  3,  4,  5],
       [ 4,  5,  6,  7],
       [ 6,  7,  8,  9],
       [ 8,  9, 10, 11],
       [10, 11, 12, 13]], dtype=int64)

# 僅從csv文件導入指定的列
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv',
                 usecols=['crim', 'medv'])
df.head()

	crim	medv
0	0.00632	24.0
1	0.02731	21.6
2	0.02729	34.7
3	0.03237	33.4
4	0.06905	36.2

# 獲取數據幀每列的n 行，n列，數據類型和摘要狀態
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
# print(df.shape)#數據的行，列
# print(df.dtypes)#打印每列數據的類型
# print(df.dtypes.value_counts())#每個dtype下有多少個
# print(df.values.tolist())#把數據合成列表
df.describe()#統計信息

	Min.Price	Price	Max.Price	MPG.city	MPG.highway	EngineSize	Horsepower	RPM	Rev.per.mile	Fuel.tank.capacity	Passengers	Length	Wheelbase	Width	Turn.circle	Rear.seat.room	Luggage.room	Weight
count	86.000000	91.000000	88.000000	84.000000	91.000000	91.000000	86.000000	90.000000	87.000000	85.000000	91.000000	89.000000	92.000000	87.000000	88.000000	89.000000	74.000000	86.000000
mean	17.118605	19.616484	21.459091	22.404762	29.065934	2.658242	144.000000	5276.666667	2355.000000	16.683529	5.076923	182.865169	103.956522	69.448276	38.954545	27.853933	13.986486	3104.593023
std	8.828290	9.724280	10.696563	5.841520	5.370293	1.045845	53.455204	605.554811	486.916616	3.375748	1.045953	14.792651	6.856317	3.778023	3.304157	3.018129	3.120824	600.129993
min	6.700000	7.400000	7.900000	15.000000	20.000000	1.000000	55.000000	3800.000000	1320.000000	9.200000	2.000000	141.000000	90.000000	60.000000	32.000000	19.000000	6.000000	1695.000000
25%	10.825000	12.350000	14.575000	18.000000	26.000000	1.800000	100.750000	4800.000000	2017.500000	14.500000	4.000000	174.000000	98.000000	67.000000	36.000000	26.000000	12.000000	2647.500000
50%	14.600000	17.700000	19.150000	21.000000	28.000000	2.300000	140.000000	5200.000000	2360.000000	16.500000	5.000000	181.000000	103.000000	69.000000	39.000000	27.500000	14.000000	3085.000000
75%	20.250000	23.500000	24.825000	25.000000	31.000000	3.250000	170.000000	5787.500000	2565.000000	19.000000	6.000000	192.000000	110.000000	72.000000	42.000000	30.000000	16.000000	3567.500000
max	45.400000	61.900000	80.000000	46.000000	50.000000	5.700000	300.000000	6500.000000	3755.000000	27.000000	8.000000	219.000000	119.000000	78.000000	45.000000	36.000000	22.000000	4105.000000

# 哪個製造商，型號和類型最高Price？最高Price值的單元格的行號和列號是多少？
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

max_price=df.loc[df.Price == np.max(df.Price), ['Manufacturer', 'Model', 'Type']]
print(max_price)

row, col = np.where(df.values == np.max(df.Price))
print(row,col)

# print(df.iat[row[0], col[0]])#獲取左上角的數值
print(df.iloc[row[0], col[0]])#獲取左上角的數值

print(df.at[row[0], 'Price'])#獲取Price列的第一個數值

     Manufacturer Model     Type
58  Mercedes-Benz  300E  Midsize
[58] [4]
61.9
61.9

# 重命名數據框中的特定列
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
# print(df.columns)
df.columns = df.columns.map(lambda x: x.replace('.', '_'))
print(df.columns)

Index(['Manufacturer', 'Model', 'Type', 'Min_Price', 'Price', 'Max_Price',
       'MPG_city', 'MPG_highway', 'AirBags', 'DriveTrain', 'Cylinders',
       'EngineSize', 'Horsepower', 'RPM', 'Rev_per_mile', 'Man_trans_avail',
       'Fuel_tank_capacity', 'Passengers', 'Length', 'Wheelbase', 'Width',
       'Turn_circle', 'Rear_seat_room', 'Luggage_room', 'Weight', 'Origin',
       'Make'],
      dtype='object')

# 檢查數據框是否缺少任何值
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

df.isnull().values.any()

True

# 計算每列中缺失值的數量,哪一列的缺失值最大
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

count=df.apply(lambda x: x.isnull().sum())
print(count)#缺失值的數量
print(count.idxmax())#缺失值最大

# 從數據框中選擇特定列作爲數據框而不是序列
df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))
print(df)

print(type(df[['a']]))
print(type(df.loc[:, ['a']]))
print(type(df.iloc[:, [0]]))

print(type(df.a))
print(type(df['a']))
print(type(df.loc[:, 'a']))
print(type(df.iloc[:, 1]))

    a   b   c   d   e
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>

df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))
print(df)#沒有改變之前
# print(df[list('cbade')])#交換兩列的位置(交換a,c兩列)

# 自定義函數交換
def switch_columns(df,col_1=None,col_2=None):
    colnames=df.columns.tolist()
    i_1,i_2=colnames.index(col_1),colnames.index(col_2)
    colnames[i_2],colnames[i_1]=colnames[i_1],colnames[i_2]
    return df[colnames]
df1=switch_columns(df,'a','c')#交換兩列的位置
print(df1)
print(sorted(df.columns,reverse=True))#降序排列

    a   b   c   d   e
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
    c   b   a   d   e
0   2   1   0   3   4
1   7   6   5   8   9
2  12  11  10  13  14
3  17  16  15  18  19
['e', 'd', 'c', 'b', 'a']

import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
pd.set_option("display.max_columns",10)
pd.set_option('display.max_rows',10)
df

	Manufacturer	Model	Type	Min.Price	Price	...	Rear.seat.room	Luggage.room	Weight	Origin	Make
0	Acura	Integra	Small	12.9	15.9	...	26.5	NaN	2705.0	non-USA	Acura Integra
1	NaN	Legend	Midsize	29.2	33.9	...	30.0	15.0	3560.0	non-USA	Acura Legend
2	Audi	90	Compact	25.9	29.1	...	28.0	14.0	3375.0	non-USA	Audi 90
3	Audi	100	Midsize	NaN	37.7	...	31.0	17.0	3405.0	non-USA	Audi 100
4	BMW	535i	Midsize	NaN	30.0	...	27.0	13.0	3640.0	non-USA	BMW 535i
...	...	...	...	...	...	...	...	...	...	...	...
88	Volkswagen	Eurovan	Van	16.6	19.7	...	34.0	NaN	3960.0	NaN	Volkswagen Eurovan
89	Volkswagen	Passat	Compact	17.6	20.0	...	31.5	14.0	2985.0	non-USA	Volkswagen Passat
90	Volkswagen	Corrado	Sporty	22.9	23.3	...	26.0	15.0	2810.0	non-USA	Volkswagen Corrado
91	Volvo	240	Compact	21.8	22.7	...	29.5	14.0	2985.0	non-USA	Volvo 240
92	NaN	850	Midsize	24.8	26.7	...	30.0	15.0	3245.0	non-USA	Volvo 850

93 rows × 27 columns

df = pd.DataFrame(np.random.random(4), columns=['random'])
# 格式化或隱藏熊貓數據框中的科學計數法
print(df.round(4))#顯示後面4個小數
# 將數據框中的所有值格式化爲百分比
out=df.style.format({
    'random':'{0:.2%}'.format,
})
out

   random
0  0.8620
1  0.7903
2  0.0159
3  0.5417

            <tr>
                    <th id="T_75726664_11f2_11ea_b9d2_cc2f7187c201level0_row0" class="row_heading level0 row0" >0</th>
                    <td id="T_75726664_11f2_11ea_b9d2_cc2f7187c201row0_col0" class="data row0 col0" >86.20%</td>
        </tr>
        <tr>
                    <th id="T_75726664_11f2_11ea_b9d2_cc2f7187c201level0_row1" class="row_heading level0 row1" >1</th>
                    <td id="T_75726664_11f2_11ea_b9d2_cc2f7187c201row1_col0" class="data row1 col0" >79.03%</td>
        </tr>
        <tr>
                    <th id="T_75726664_11f2_11ea_b9d2_cc2f7187c201level0_row2" class="row_heading level0 row2" >2</th>
                    <td id="T_75726664_11f2_11ea_b9d2_cc2f7187c201row2_col0" class="data row2 col0" >1.59%</td>
        </tr>
        <tr>
                    <th id="T_75726664_11f2_11ea_b9d2_cc2f7187c201level0_row3" class="row_heading level0 row3" >3</th>
                    <td id="T_75726664_11f2_11ea_b9d2_cc2f7187c201row3_col0" class="data row3 col0" >54.17%</td>
        </tr>
</tbody></table>

	random

df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

df.iloc[::20, :][['Manufacturer', 'Model', 'Type']]#間隔20取一行

	Manufacturer	Model	Type
0	Acura	Integra	Small
20	Chrysler	LeBaron	Compact
40	Honda	Prelude	Sporty
60	Mercury	Cougar	Midsize
80	Subaru	Loyale	Small

import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv', 
                 usecols=[0,1,2,3,5])
# df.head()
df[['Manufacturer', 'Model', 'Type']] = df[['Manufacturer', 'Model', 'Type']].fillna('missing')
# df.head()
df.index = df.Manufacturer + '_' + df.Model + '_' + df.Type
print(df.index.is_unique)#通過組合相關列來創建主鍵索引
# df.head()

import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randint(1, 30, 30).reshape(10,-1), columns=list('abc'))

print(df['a'])
print(df['a'].argsort())#通過比較值，獲取索引
df['a'].argsort()[::-1][5]#獲取列中第n個最大值的行號

0    25
1    16
2    12
3     8
4     6
5    17
6    15
7    24
8    16
9    28
Name: a, dtype: int32
0    4
1    3
2    2
3    6
4    1
5    8
6    5
7    7
8    0
9    9
Name: a, dtype: int64





8

ser = pd.Series(np.random.randint(1, 100, 15))

print('ser: ', ser.tolist(), 'mean: ', round(ser.mean()))
np.argwhere(ser>ser.mean())#在中ser，找到第二個最大值的位置大於平均值

ser:  [54, 77, 49, 74, 24, 95, 94, 14, 7, 50, 69, 65, 72, 72, 58] mean:  58.0





array([[ 1],
       [ 3],
       [ 5],
       [ 6],
       [10],
       [11],
       [12],
       [13]], dtype=int64)

# 獲得行總和> 100的數據幀的最後n行
df = pd.DataFrame(np.random.randint(10, 40, 60).reshape(-1, 4))

rowsums = df.apply(np.sum, axis=1)# 每行相加
print(np.where(rowsums > 100)[0][-2:])#獲取大於100的最後2行【返回的是索引】
last_two_rows = df.iloc[np.where(rowsums > 100)[0][-2:], :]
last_two_rows

[11 14]

	0	1	2	3
11	27	32	22	32
14	21	35	37	30

# 將ser較低的5％ile和大於95％ile中的所有值分別替換爲第5個和第95％ile值。
ser = pd.Series(np.logspace(-2, 2, 30))

def cap_outliers(ser, low_perc, high_perc):
    low, high = ser.quantile([low_perc, high_perc])
    print(low_perc, '%ile: ', low, '|', high_perc, '%ile: ', high)
    ser[ser < low] = low
    ser[ser > high] = high
    return ser

capped_ser = cap_outliers(ser, .05, .95)
print(capped_ser)

0.05 %ile:  0.016049294076965887 | 0.95 %ile:  63.876672220183934
0      0.016049
1      0.016049
2      0.018874
3      0.025929
4      0.035622
5      0.048939
6      0.067234
7      0.092367
8      0.126896
9      0.174333
10     0.239503
11     0.329034
12     0.452035
13     0.621017
14     0.853168
15     1.172102
16     1.610262
17     2.212216
18     3.039195
19     4.175319
20     5.736153
21     7.880463
22    10.826367
23    14.873521
24    20.433597
25    28.072162
26    38.566204
27    52.983169
28    63.876672
29    63.876672
dtype: float64

# 在去除負值後將數據框重塑爲最大可能的正方形
df = pd.DataFrame(np.random.randint(-20, 50, 100).reshape(10,-1))
# print(df)
arr = df[df > 0].values.flatten()#把數據展平後，小於0的數變成nan
# print(arr)
arr_qualified = arr[~np.isnan(arr)]#去掉nan
print(arr_qualified)
#比較值，返回索引【倒敘索引】
top_indexes = np.argsort(arr_qualified)[::]
# print(top_indexes)
print(top_indexes[:n**2])
#以元素爲單位返回輸入的底限
n = int(np.floor(arr_qualified.shape[0]**.5))
# 提取指定索引位置的數據,並以一維數組或者矩陣返回
output = np.take(arr_qualified, sorted(top_indexes[:n**2])).reshape(n, -1)
print(output)

[ 6. 29. 48. 22. 14. 10. 49.  9. 18. 42. 31. 42. 16. 35. 45. 10.  2. 27.
 48.  2. 16. 48. 22. 12. 23. 13. 34. 38. 18. 10. 12. 48. 39. 18. 49. 24.
 35. 13. 16. 30. 35. 22. 44. 46.  8. 30.  1.  5. 30.  7. 15. 22.  6. 43.
 47.  8. 32. 21. 46.  5. 20. 39.  9. 17.]
[46 19 16 59 47  0 52 49 44 55 62  7  5 15 29 30 23 25 37  4 50 12 38 20
 63  8 33 28 60 57 22 51  3 41 24 35 17  1 45 39 48 10 56 26 40 13 36 27
 32 61 11  9 53 42 14 43 58 54  2 18 21 31 34  6]
[[ 6. 29. 48. 22. 14. 10. 49.  9.]
 [18. 42. 31. 42. 16. 35. 45. 10.]
 [ 2. 27. 48.  2. 16. 48. 22. 12.]
 [23. 13. 34. 38. 18. 10. 12. 48.]
 [39. 18. 49. 24. 35. 13. 16. 30.]
 [35. 22. 44. 46.  8. 30.  1.  5.]
 [30.  7. 15. 22.  6. 43. 47.  8.]
 [32. 21. 46.  5. 20. 39.  9. 17.]]

df=pd.DataFrame(np.arange(25).reshape(5,-1))
print(df)

# 交換數據幀的兩行
def swap_rows(df,i1,i2):
    df.iloc[i1,:],df.iloc[i2,:]=df.iloc[i2,:].copy(),df.iloc[i1,:].copy()
    return df
result=swap_rows(df,1,2)
result

    0   1   2   3   4
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
4  20  21  22  23  24

	0	1	2	3	4
0	0	1	2	3	4
1	10	11	12	13	14
2	5	6	7	8	9
3	15	16	17	18	19
4	20	21	22	23	24

df = pd.DataFrame(np.arange(25).reshape(5, -1))
print(df)
df.iloc[::-1, :]#反轉數據框的行

    0   1   2   3   4
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
4  20  21  22  23  24

	0	1	2	3	4
4	20	21	22	23	24
3	15	16	17	18	19
2	10	11	12	13	14
1	5	6	7	8	9
0	0	1	2	3	4

df = pd.DataFrame(np.arange(25).reshape(5,-1), columns=list('abcde'))
print(df)
# 獲取'a'數據框中列的一鍵編碼df，並將其附加爲列
result=pd.get_dummies(df['a'])
df_onehot=pd.concat([result,df[list('bcde')]],axis=1)#合併列顯示
df_onehot

    a   b   c   d   e
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
4  20  21  22  23  24

	0	5	10	15	20	b	c	d	e
0	1	0	0	0	0	1	2	3	4
1	0	1	0	0	0	6	7	8	9
2	0	0	1	0	0	11	12	13	14
3	0	0	0	1	0	16	17	18	19
4	0	0	0	0	1	21	22	23	24

df = pd.DataFrame(np.random.randint(1,100, 40).reshape(10, -1))
print(df)
print(df.apply(np.argmax, axis=1))#每行的最大值【這裏axis是相反的功能】
print(df.apply(np.argmax, axis=1).value_counts())#計算出現的次數【降序排序】
print(df.apply(np.argmax, axis=1).value_counts().index[0])#獲取最多出現的

    0   1   2   3
0  10  87  19  43
1   5  83  50  80
2  19  24  10  77
3  36  15  95  78
4   8  20  89  48
5  17  17  81  46
6  88  74  52  72
7  91  53  36  61
8  25  53  22  90
9   3  93  86  63
0    1
1    1
2    3
3    2
4    2
5    2
6    0
7    0
8    3
9    1
dtype: int64
2    3
1    3
3    2
0    2
dtype: int64
2

df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1), 
                 index=list('abcdefgh'), columns=list('pqrstuvwxy'))
print(df)
print(df.corr())#各類型之間的相關係數DataFrame表格
abs_corrmat = np.abs(df.corr())#區絕對值
print(abs_corrmat)
max_corr = abs_corrmat.apply(lambda x: sorted(x)[-2])#取倒數第二位
print(max_corr)#
print(np.round(max_corr.tolist(), 2))#取小數點後面兩位

    p   q   r   s   t   u   v   w   x   y
a  41  72   5  31  67  26  45  65  21  60
b  15  56  72  91  99  32  38  14  52  36
c   7  92  96  84  26  79  81  12  75  50
d  73  46  42  15  80  76  10  34  45   5
e  15  72  55  14  17  54   9  35  36  18
f  12  73  47  84  85   9  31  67  13  64
g  25  43  56  76  62  43  93  25  53  99
h  80  70  30  68  40  74   2  41   7  47
          p         q         r         s         t         u         v  \
p  1.000000 -0.348616 -0.606492 -0.406070  0.056932  0.461015 -0.543905   
q -0.348616  1.000000  0.213753  0.159442 -0.554585  0.088275  0.023752   
r -0.606492  0.213753  1.000000  0.501096 -0.190063  0.290160  0.440666   
s -0.406070  0.159442  0.501096  1.000000  0.260183 -0.243481  0.505580   
t  0.056932 -0.554585 -0.190063  0.260183  1.000000 -0.568596 -0.009954   
u  0.461015  0.088275  0.290160 -0.243481 -0.568596  1.000000 -0.118254   
v -0.543905  0.023752  0.440666  0.505580 -0.009954 -0.118254  1.000000   
w  0.207508  0.125992 -0.797659 -0.285267  0.192809 -0.562124 -0.358931   
x -0.452943 -0.029441  0.798096  0.187066 -0.122999  0.376021  0.637259   
y -0.294716 -0.043568 -0.043181  0.579817  0.083632 -0.434149  0.729595   

          w         x         y  
p  0.207508 -0.452943 -0.294716  
q  0.125992 -0.029441 -0.043568  
r -0.797659  0.798096 -0.043181  
s -0.285267  0.187066  0.579817  
t  0.192809 -0.122999  0.083632  
u -0.562124  0.376021 -0.434149  
v -0.358931  0.637259  0.729595  
w  1.000000 -0.835494  0.145546  
x -0.835494  1.000000 -0.030812  
y  0.145546 -0.030812  1.000000  
          p         q         r         s         t         u         v  \
p  1.000000  0.348616  0.606492  0.406070  0.056932  0.461015  0.543905   
q  0.348616  1.000000  0.213753  0.159442  0.554585  0.088275  0.023752   
r  0.606492  0.213753  1.000000  0.501096  0.190063  0.290160  0.440666   
s  0.406070  0.159442  0.501096  1.000000  0.260183  0.243481  0.505580   
t  0.056932  0.554585  0.190063  0.260183  1.000000  0.568596  0.009954   
u  0.461015  0.088275  0.290160  0.243481  0.568596  1.000000  0.118254   
v  0.543905  0.023752  0.440666  0.505580  0.009954  0.118254  1.000000   
w  0.207508  0.125992  0.797659  0.285267  0.192809  0.562124  0.358931   
x  0.452943  0.029441  0.798096  0.187066  0.122999  0.376021  0.637259   
y  0.294716  0.043568  0.043181  0.579817  0.083632  0.434149  0.729595   

          w         x         y  
p  0.207508  0.452943  0.294716  
q  0.125992  0.029441  0.043568  
r  0.797659  0.798096  0.043181  
s  0.285267  0.187066  0.579817  
t  0.192809  0.122999  0.083632  
u  0.562124  0.376021  0.434149  
v  0.358931  0.637259  0.729595  
w  1.000000  0.835494  0.145546  
x  0.835494  1.000000  0.030812  
y  0.145546  0.030812  1.000000  
p    0.606492
q    0.554585
r    0.798096
s    0.579817
t    0.568596
u    0.568596
v    0.729595
w    0.835494
x    0.835494
y    0.729595
dtype: float64
[0.61 0.55 0.8  0.58 0.57 0.57 0.73 0.84 0.84 0.73]

df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
print(df)
# 獲取df對象中的值【最小值/最大值】
min_by_max = np.min(df, axis=1)/np.max(df, axis=1)
min_by_max

    0   1   2   3   4   5   6   7   8   9
0  85  63  99  34  13  14  64  33  58  16
1  64  45  77  68  19  45  61   2  11  15
2  78  66  76  51  51  52  20  53  35  64
3  68  85   2  81  52  66  14  28  41  34
4  37  40  99  62  57  70  37  15  14  56
5  13  88  12  51  43   1  54  18  70  67
6  55  19  79  43  19   8  52   6  15  77
7  79  93  54  68  78  61  80  33  72  92





0    0.131313
1    0.025974
2    0.256410
3    0.023529
4    0.141414
5    0.011364
6    0.075949
7    0.354839
dtype: float64

# 創建一個新列'penultimate'，該列的第二行的值第二大df
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
result=df.apply(lambda x:x.sort_values().unique()[-2],axis=1)
df['penultimate'] = result
df

	0	1	2	3	4	5	6	7	8	9	penultimate
0	50	12	77	25	22	97	49	40	27	18	77
1	14	52	78	3	67	5	77	17	43	53	77
2	92	53	10	39	55	34	63	89	60	41	89
3	9	89	66	50	88	4	46	19	87	75	88
4	97	95	75	50	91	60	65	3	24	59	95
5	31	38	4	81	9	1	52	71	84	57	81
6	59	7	19	33	49	40	54	60	48	4	59
7	90	21	77	44	3	50	98	23	84	30	90

# 規範數據框中的所有列
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
print(df)

# df通過減去列均值併除以標準偏差來歸一化所有列
result=df.apply(lambda x:(
    (x - x.mean())/x.std()
).round(2))
result

    0   1   2   3   4   5   6   7   8   9
0  73  77  53  35   9  80  96  47  35  26
1  58  72  39  80  86  57  41  98  31  90
2  45  76  22  27   5  15  78  90  87  92
3  89  84  97  78  29  70  23  95  97  90
4  55  32  83  49  99  63  22  75  44  26
5  74  42  70  49  57  26  88  77   1   5
6  56  29  42  28  75  16  21  11  38  50
7  99  26  74  74  39  50  61   3  23   3

	0	1	2	3	4	5	6	7	8	9
0	0.24	0.90	-0.28	-0.79	-1.16	1.31	1.36	-0.40	-0.30	-0.57
1	-0.57	0.70	-0.83	1.24	1.03	0.39	-0.41	0.96	-0.42	1.10
2	-1.28	0.86	-1.51	-1.15	-1.28	-1.28	0.78	0.75	1.32	1.15
3	1.10	1.18	1.47	1.15	-0.59	0.91	-0.99	0.88	1.63	1.10
4	-0.74	-0.92	0.91	-0.16	1.40	0.63	-1.02	0.35	-0.02	-0.57
5	0.29	-0.52	0.40	-0.16	0.20	-0.84	1.10	0.40	-1.35	-1.11
6	-0.68	-1.04	-0.71	-1.10	0.72	-1.24	-1.05	-1.36	-0.20	0.06
7	1.64	-1.16	0.56	0.97	-0.31	0.11	0.23	-1.57	-0.67	-1.17

# 規範數據框中的所有列
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
print(df)

# 排列所有列的範圍df，以使每一列的最小值爲0且最大值爲1。
result = df.apply(lambda x: (
    1 - (x.max() - x)/(x.max() - x.min())
).round(2))
result

    0   1   2   3   4   5   6   7   8   9
0  78   7  59   4  77  81  93  66  39  28
1  60  51  88  19  23  29  70  82  10  24
2   2  80   7  59  72  51  82  28  38  25
3  36  88   3   8  43   7  87  60  28  99
4  29  69  89  84  87  15  95  87  75  54
5  82  78  60  57  15  29  41  93  57  13
6  72  28  63   2  20  25   6  72  71  32
7  60   2  13  87  82  97  41  23  81  16

	0	1	2	3	4	5	6	7	8	9
0	0.95	0.06	0.65	0.02	0.86	0.82	0.98	0.61	0.41	0.17
1	0.72	0.57	0.99	0.20	0.11	0.24	0.72	0.84	0.00	0.13
2	0.00	0.91	0.05	0.67	0.79	0.49	0.85	0.07	0.39	0.14
3	0.43	1.00	0.00	0.07	0.39	0.00	0.91	0.53	0.25	1.00
4	0.34	0.78	1.00	0.96	1.00	0.09	1.00	0.91	0.92	0.48
5	1.00	0.88	0.66	0.65	0.00	0.24	0.39	1.00	0.66	0.00
6	0.88	0.30	0.70	0.00	0.07	0.20	0.00	0.70	0.86	0.22
7	0.72	0.00	0.12	1.00	0.93	1.00	0.39	0.00	1.00	0.03

# 計算每一行與順序行的相關係數
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
print(df)
print([i for i in range(df.shape[0])[:-1]])#去除最後一行

result=[df.iloc[i].corr(df.iloc[i+1]) for i in range(df.shape[0])[:-1]]
result

    0   1   2   3   4   5   6   7   8   9
0  27   3  37   9  76  68  91  31  44   7
1  11  15  20  47  33  86  65  47   9  30
2  39   1  72  19  35  42  87  77  55  40
3  60   7   8  28  37  14  17   5   3   7
4  47  99  76  28  77  57  32  57  24  16
5   2  50  95  89  84  46  59  84   1   2
6  78  27  58  67  78   1   7  28  89  20
7  12  86  54  81  20  19  77   1   8  56
[0, 1, 2, 3, 4, 5, 6]





[0.5182965633327684,
 0.2595376913412023,
 -0.23874062518280761,
 0.005261734793477499,
 0.4687394611664755,
 -0.06555011633952691,
 -0.30907671467693215]

df = pd.DataFrame(np.random.randint(1,100, 100).reshape(10, -1))
rows=df.shape[0]#數據框的行數
for i in range(rows):
    df.iat[i,i]=0
    df.iat[rows-i-1,i]=0
df

	0	1	2	3	4	5	6	7	8	9
0	0	65	92	82	10	1	51	71	32	0
1	79	0	11	99	28	68	24	8	0	83
2	34	4	0	35	11	91	83	0	41	29
3	84	72	5	0	65	76	0	25	25	64
4	98	14	2	10	0	0	2	94	40	84
5	75	8	8	27	0	0	23	62	73	95
6	23	43	38	0	36	43	0	7	65	6
7	80	96	0	82	92	79	64	0	61	67
8	29	0	96	96	76	21	94	72	0	4
9	0	26	27	65	95	19	19	1	90	0

# 通過密鑰獲取groupby數據幀的特定組
df = pd.DataFrame({'col1': ['apple', 'banana', 'orange'] * 3,
                   'col2': np.random.rand(9),
                   'col3': np.random.randint(0, 15, 9)})
print(df)
# 通過數據框的df['col1']，顯示有apple的行數
df.groupby(df['col1']).get_group('apple')

     col1      col2  col3
0   apple  0.703158    12
1  banana  0.535815    13
2  orange  0.177147     8
3   apple  0.159570     2
4  banana  0.411271    10
5  orange  0.279007    11
6   apple  0.576264     4
7  banana  0.578607     9
8  orange  0.242959     6

	col1	col2	col3
0	apple	0.703158	12
3	apple	0.159570	2
6	apple	0.576264	4

# 當按另一列分組時，如何獲得某列的第n個最大值
df = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 4,
                   'taste': np.random.rand(12),
                   'price': np.random.randint(0, 15, 12)})
# 通過df['fruit']下的banana值，查詢df['taste']的對應值
banana=df['taste'].groupby(df['fruit']).get_group('banana')
print(banana)
print("特定結果：",banana.sort_values().iloc[-2])#升序排序，找出倒數第二個值
df

1     0.209485
4     0.549818
7     0.498802
10    0.006632
Name: taste, dtype: float64
特定結果： 0.4988018517868045

	fruit	taste	price
0	apple	0.510446	7
1	banana	0.209485	1
2	orange	0.632166	1
3	apple	0.865764	4
4	banana	0.549818	9
5	orange	0.744718	5
6	apple	0.069171	0
7	banana	0.498802	14
8	orange	0.011808	2
9	apple	0.103222	13
10	banana	0.006632	6
11	orange	0.017787	13

# 計算熊貓數據框上的分組均值，並將分組列保留爲另一列（而不是索引）
df = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                   'rating': np.random.rand(9),
                   'price': np.random.randint(0, 15, 9)})
print(df)
# 通過df['fruit']的值，查詢price的均值
out = df.groupby(df['fruit'], as_index=False)['price'].mean()
out

    fruit    rating  price
0   apple  0.090672      2
1  banana  0.019506      0
2  orange  0.354463      5
3   apple  0.466694     14
4  banana  0.807733      8
5  orange  0.488868      4
6   apple  0.640913      8
7  banana  0.977691      8
8  orange  0.390033      0

	fruit	price
0	apple	8.000000
1	banana	5.333333
2	orange	3.000000

# 通過2列聯接兩個數據框，使它們只有公共行
df1 = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                    'weight': ['high', 'medium', 'low'] * 3,
                    'price': np.random.randint(0, 15, 9)})

df2 = pd.DataFrame({'pazham': ['apple', 'orange', 'pine'] * 2,
                    'pounds': ['high', 'low'] * 3,
                    'price': np.random.randint(0, 15, 6)})
print(df1)
print(df2)
pd.merge(df1, df2, how='inner', 
         left_on=['fruit', 'weight'], 
         right_on=['pazham', 'pounds'], 
         suffixes=['_left', '_right'])

    fruit  weight  price
0   apple    high     13
1  banana  medium      5
2  orange     low     14
3   apple    high      2
4  banana  medium      2
5  orange     low      6
6   apple    high      7
7  banana  medium      1
8  orange     low      6
   pazham pounds  price
0   apple   high     10
1  orange    low     14
2    pine   high     12
3   apple    low     12
4  orange   high     11
5    pine    low      6

	fruit	weight	price_left	pazham	pounds	price_right
0	apple	high	13	apple	high	10
1	apple	high	2	apple	high	10
2	apple	high	7	apple	high	10
3	orange	low	14	orange	low	14
4	orange	low	6	orange	low	14
5	orange	low	6	orange	low	14

# 獲得兩列值匹配的位置
df = pd.DataFrame({'fruit1': np.random.choice(['apple', 'orange', 'banana'], 10),
                    'fruit2': np.random.choice(['apple', 'orange', 'banana'], 10)})
print(df)
np.where(df.fruit1 == df.fruit2)[0]#如果兩個值相等，返回下標

   fruit1  fruit2
0  orange  orange
1  orange   apple
2  orange   apple
3  banana  banana
4   apple  banana
5  orange  orange
6  orange  banana
7  banana  orange
8   apple  banana
9  orange  banana





array([0, 3, 5], dtype=int64)

# 在數據框中創建列的滯後和超前
df = pd.DataFrame(np.random.randint(1, 100, 20).reshape(-1, 4),
                  columns = list('abcd'))

print(df)
df['a_lag'] = df['a'].shift(1)#把a列中的前一個索引放到a_lag
print(df)
df['b_lead'] = df['b'].shift(-1)#把b列中的後一個索引放到a_lag
df

    a   b   c   d
0  31  79  72  32
1   8  18  82  25
2  98  23  41  79
3   2  87  74  76
4  16  89  12  86
    a   b   c   d  a_lag
0  31  79  72  32    NaN
1   8  18  82  25   31.0
2  98  23  41  79    8.0
3   2  87  74  76   98.0
4  16  89  12  86    2.0

	a	b	c	d	a_lag	b_lead
0	31	79	72	32	NaN	18.0
1	8	18	82	25	31.0	23.0
2	98	23	41	79	8.0	87.0
3	2	87	74	76	98.0	89.0
4	16	89	12	86	2.0	NaN

# 獲得整個數據幀中唯一值的頻率
df = pd.DataFrame(np.random.randint(1, 10, 20).reshape(-1, 4), 
                  columns = list('abcd'))
ravel=df.values.ravel()#把數據框中的值變成一維數組
print(ravel)
pd.value_counts(ravel)#計算每個值出現的頻率

[6 1 6 8 7 8 3 4 1 8 7 7 1 1 9 8 5 8 5 5]





8    5
1    4
7    3
5    3
6    2
9    1
4    1
3    1
dtype: int64

# 將文本列拆分爲兩個單獨的列
df = pd.DataFrame([ "STD, City,State",
                    "33, Kolkata,West Bengal",
                    "44, Chennai,Tamil Nadu",
                    "40, Hyderabad  ,Telengana",
                    "80, Bangalore,Karnataka"], 
                  columns=['row'])
# print(df)
# 把每一行數據根據","分開
df_out = df.row.str.split(",", expand=True)
# print(df_out)

new_header = df_out.iloc[0]#獲取第一行數據框的值【前面】
df_out = df_out[1:]#把後面的每一行變爲數據框的值【後面】

df_out.columns = new_header#把第一行數據變爲列名
# print(new_header)
# print(new_values)
print(df_out)

0 STD          City        State
1  33       Kolkata  West Bengal
2  44       Chennai   Tamil Nadu
3  40   Hyderabad      Telengana
4  80     Bangalore    Karnataka

迷心兔

發佈了47 篇原創文章 · 獲贊 3 · 訪問量 7827

私信關注

pandas基礎學習

985 碩士程序員，空窗 4 個月沒有 Offer！

一文搞懂 Spring 循環依賴

賽博鬥地主——使用大語言模型扮演Agent智能體玩牌類遊戲。

VScode右鍵打開(添加到右鍵)

記一次 .NET某工控視覺自動化系統卡死分析

WindowsServer--SQL Server搭建主從同步實現讀寫分離 - 事務性分發

java由於越界導致的報錯

numpy中級學習

異步【ThreadPoolExecutor】和【ProcessPoolExecutor】運算比較

生成器還可以疊加來組成生成器管道

運用staticmethod裝飾器的簡單方式

Python threading Lock同步線程

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結

	0	5	10	15	20	b	c	d	e
0	1	0	0	0	0	1	2	3	4
1	0	1	0	0	0	6	7	8	9
2	0	0	1	0	0	11	12	13	14
3	0	0	0	1	0	16	17	18	19
4	0	0	0	0	1	21	22	23	24

	0	1	2	3	4	5	6	7	8	9	penultimate
0	50	12	77	25	22	97	49	40	27	18	77
1	14	52	78	3	67	5	77	17	43	53	77
2	92	53	10	39	55	34	63	89	60	41	89
3	9	89	66	50	88	4	46	19	87	75	88
4	97	95	75	50	91	60	65	3	24	59	95
5	31	38	4	81	9	1	52	71	84	57	81
6	59	7	19	33	49	40	54	60	48	4	59
7	90	21	77	44	3	50	98	23	84	30	90

	0	1	2	3	4	5	6	7	8	9
0	0	65	92	82	10	1	51	71	32	0
1	79	0	11	99	28	68	24	8	0	83
2	34	4	0	35	11	91	83	0	41	29
3	84	72	5	0	65	76	0	25	25	64
4	98	14	2	10	0	0	2	94	40	84
5	75	8	8	27	0	0	23	62	73	95
6	23	43	38	0	36	43	0	7	65	6
7	80	96	0	82	92	79	64	0	61	67
8	29	0	96	96	76	21	94	72	0	4
9	0	26	27	65	95	19	19	1	90	0

	0	5	10	15	20	b	c	d	e
0	1	0	0	0	0	1	2	3	4
1	0	1	0	0	0	6	7	8	9
2	0	0	1	0	0	11	12	13	14
3	0	0	0	1	0	16	17	18	19
4	0	0	0	0	1	21	22	23	24

	0	1	2	3	4	5	6	7	8	9	penultimate
0	50	12	77	25	22	97	49	40	27	18	77
1	14	52	78	3	67	5	77	17	43	53	77
2	92	53	10	39	55	34	63	89	60	41	89
3	9	89	66	50	88	4	46	19	87	75	88
4	97	95	75	50	91	60	65	3	24	59	95
5	31	38	4	81	9	1	52	71	84	57	81
6	59	7	19	33	49	40	54	60	48	4	59
7	90	21	77	44	3	50	98	23	84	30	90

	0	1	2	3	4	5	6	7	8	9
0	0	65	92	82	10	1	51	71	32	0
1	79	0	11	99	28	68	24	8	0	83
2	34	4	0	35	11	91	83	0	41	29
3	84	72	5	0	65	76	0	25	25	64
4	98	14	2	10	0	0	2	94	40	84
5	75	8	8	27	0	0	23	62	73	95
6	23	43	38	0	36	43	0	7	65	6
7	80	96	0	82	92	79	64	0	61	67
8	29	0	96	96	76	21	94	72	0	4
9	0	26	27	65	95	19	19	1	90	0

	0	5	10	15	20	b	c	d	e
0	1	0	0	0	0	1	2	3	4
1	0	1	0	0	0	6	7	8	9
2	0	0	1	0	0	11	12	13	14
3	0	0	0	1	0	16	17	18	19
4	0	0	0	0	1	21	22	23	24

	0	1	2	3	4	5	6	7	8	9	penultimate
0	50	12	77	25	22	97	49	40	27	18	77
1	14	52	78	3	67	5	77	17	43	53	77
2	92	53	10	39	55	34	63	89	60	41	89
3	9	89	66	50	88	4	46	19	87	75	88
4	97	95	75	50	91	60	65	3	24	59	95
5	31	38	4	81	9	1	52	71	84	57	81
6	59	7	19	33	49	40	54	60	48	4	59
7	90	21	77	44	3	50	98	23	84	30	90

	0	1	2	3	4	5	6	7	8	9
0	0	65	92	82	10	1	51	71	32	0
1	79	0	11	99	28	68	24	8	0	83
2	34	4	0	35	11	91	83	0	41	29
3	84	72	5	0	65	76	0	25	25	64
4	98	14	2	10	0	0	2	94	40	84
5	75	8	8	27	0	0	23	62	73	95
6	23	43	38	0	36	43	0	7	65	6
7	80	96	0	82	92	79	64	0	61	67
8	29	0	96	96	76	21	94	72	0	4
9	0	26	27	65	95	19	19	1	90	0