pandas基礎學習

import numpy as np
import pandas as pd

print(pd.__version__)  # installed pandas version
# show_versions() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line after the report.
pd.show_versions(as_json=True)

# 如何從列表,numpy數組和字典創建系列
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))

# 解
s1=pd.Series(mylist)
s2=pd.Series(myarr)
s3=pd.Series(mydict)
print("*"*50)
print(s1.head())#獲取前五個元素
print(s2.head())
print(s3.head())
0.25.1
{'system': {'commit': None, 'python': '3.7.4.final.0', 'python-bits': 64, 'OS': 'Windows', 'OS-release': '10', 'machine': 'AMD64', 'processor': 'Intel64 Family 6 Model 78 Stepping 3, GenuineIntel', 'byteorder': 'little', 'LC_ALL': 'None', 'LANG': 'None', 'LOCALE': 'None.None'}, 'dependencies': {'pandas': '0.25.1', 'numpy': '1.17.2', 'pytz': '2019.2', 'dateutil': '2.8.0', 'pip': '19.3.1', 'setuptools': '41.4.0', 'Cython': None, 'pytest': '4.3.0', 'hypothesis': None, 'sphinx': None, 'blosc': None, 'feather': None, 'xlsxwriter': '1.2.5', 'lxml.etree': '4.4.1', 'html5lib': None, 'pymysql': '0.9.2', 'psycopg2': None, 'jinja2': '2.10.1', 'IPython': '7.9.0', 'pandas_datareader': None, 'bs4': '4.8.0', 'bottleneck': None, 'fastparquet': None, 'gcsfs': None, 'matplotlib': '3.1.1', 'numexpr': None, 'odfpy': None, 'openpyxl': None, 'pandas_gbq': None, 'pyarrow': None, 'pytables': None, 's3fs': None, 'scipy': '1.3.1', 'sqlalchemy': '1.3.10', 'tables': None, 'xarray': None, 'xlrd': None, 'xlwt': '1.3.0'}}
None
**************************************************
0    a
1    b
2    c
3    e
4    d
dtype: object
0    0
1    1
2    2
3    3
4    4
dtype: int32
a    0
b    1
c    2
e    3
d    4
dtype: int64
# 將序列ser轉換爲數據框,並將其索引作爲數據框的另一列
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser = pd.Series(mydict)

# 解
df=ser.to_frame().reset_index()
df.head()
index 0
0 a 0
1 b 1
2 c 2
3 e 3
4 d 4
# 如何結合多個系列形成一個數據框
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))

# 解1
df1=pd.concat([ser1,ser2],axis=1)

# 解2
df2=pd.DataFrame({"col_1":ser1,"col_2":ser2})
print(df2.head())
  col_1  col_2
0     a      0
1     b      1
2     c      2
3     e      3
4     d      4
# 如何爲系列索引指定名稱
ser = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))

# 解
ser.name="alphabets"
print(ser.head())
0    a
1    b
2    c
3    e
4    d
Name: alphabets, dtype: object
# How to remove from ser1 the items that are also present in ser2
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

# 解
print(ser1[~ser1.isin(ser2)])
0    1
1    2
2    3
dtype: int64
# How to get the items that are not common to both ser1 and ser2 (symmetric difference)
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

# 解
ser_u = pd.Series(np.union1d(ser1, ser2))  # 聯合
print(ser_u)
ser_i = pd.Series(np.intersect1d(ser1, ser2))  # 相交
print("-"*50)
print(ser_i)
print("-"*50)
print(ser_u[~ser_u.isin(ser_i)])#從【聯合中的數據】刪除【相交中的數據】
0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
dtype: int64
--------------------------------------------------
0    4
1    5
dtype: int64
--------------------------------------------------
0    1
1    2
2    3
5    6
6    7
7    8
dtype: int64
# 如何獲得數字序列【計算的最小值,第25個百分點,中位數,第75個和最大值ser】
state = np.random.RandomState(100)
ser = pd.Series(state.normal(10, 5, 25))
print(ser)

# 解
result=np.percentile(ser, q=[0, 25, 50, 75, 100])
print(result)
0      1.251173
1     11.713402
2     15.765179
3      8.737820
4     14.906604
5     12.571094
6     11.105898
7      4.649783
8      9.052521
9     11.275007
10     7.709865
11    12.175817
12     7.082025
13    14.084235
14    13.363604
15     9.477944
16     7.343598
17    15.148663
18     7.809322
19     4.408409
20    18.094908
21    17.708026
22     8.740604
23     5.787821
24    10.922593
dtype: float64
[ 1.25117263  7.70986507 10.92259345 13.36360403 18.0949083 ]
# 如何獲得系列中唯一項目的頻率計數
ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))
# print(ser)

# 解
print(ser.value_counts())
g    7
e    6
c    5
f    5
h    3
a    2
b    2
dtype: int64
# Keep only the two most frequent values and replace everything else with "Other"
# The seeded generator must be the one that draws the numbers: constructing
# RandomState(100) and then calling np.random.randint leaves the data unseeded.
state = np.random.RandomState(100)
ser = pd.Series(state.randint(1, 5, 12))  # 12 integers from 1 to 4 (upper bound is exclusive)

# Solution
# where() keeps entries equal to one of the two most frequent values and puts
# "Other" everywhere else, returning an object Series instead of assigning
# strings into an integer Series.
top_two = ser.value_counts().index[:2]
ser = ser.where(ser.isin(top_two), "Other")
print(ser)
0     Other
1         1
2         1
3     Other
4         4
5         4
6     Other
7         4
8     Other
9         1
10        4
11        1
dtype: object
# 將系列重塑ser爲具有7行5列的數據框
ser = pd.Series(np.random.randint(1, 10, 35))
# print(ser)

# 解
df=pd.DataFrame(ser.values.reshape(7,5))
print(df)
   0  1  2  3  4
0  7  9  2  3  9
1  4  8  9  9  3
2  7  7  8  6  5
3  4  2  8  4  6
4  1  9  4  2  5
5  8  4  8  3  1
6  4  4  5  2  7
# Find the positions of the values in ser that are divisible by 3 (remainder 0)
ser = pd.Series(np.random.randint(1, 10, 7))
print(ser)

# 解
result=np.argwhere(ser%3==0)
result
0    1
1    6
2    5
3    4
4    9
5    6
6    1
dtype: int32





array([[1],
       [4],
       [5]], dtype=int64)
# From ser, extract the items located at the positions listed in pos
ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 8, 14, 20]

# 解
result=ser.take(pos)
result
0     a
4     e
8     i
14    o
20    u
dtype: object
# How to stack two series vertically and horizontally
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))

# Solution
# Vertical: concat returns a new Series (ser1 itself is never modified), so the
# result has to be captured to be shown. Each input keeps its own index labels.
stacked = pd.concat([ser1, ser2])
print(stacked)

# Horizontal: align the two series side by side as columns 0 and 1
df = pd.concat([ser1, ser2], axis=1)
df
0    0
1    1
2    2
3    3
4    4
dtype: int64
0 1
0 0 a
1 1 b
2 2 c
3 3 d
4 4 e
# 對【ser2系列】的值,查找【ser1系列】對應的下標
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])

# 解
result=[np.where(i==ser1)[0].tolist()[0] for i in ser2]
print(result)
# 解
result=[pd.Index(ser1).get_loc(i) for i in ser2]
result
[5, 4, 0, 8]





[5, 4, 0, 8]
# 計算truth和pred系列的均方誤差
truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10)

np.mean((truth-pred)**2)
0.42318488444073726
# 將中每個單詞的第一個字符更改爲大寫ser
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

# 解
pd.Series([i.title() for i in ser])
0     How
1      To
2    Kick
3    Ass?
dtype: object
# 計算系列中每個單詞的字符數
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

# 解
ser.map(lambda x:len(x))
0    3
1    2
2    4
3    4
dtype: int64
# 計算一系列結果數之間的差異(兩值之間的差值)
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])

# 解
print(ser.tolist())
print(ser.diff().tolist())
[1, 3, 6, 10, 15, 21, 27, 35]
[nan, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0, 8.0]
# 將一系列日期字符串轉換爲時間序列
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303',
                 '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

# 解
pd.to_datetime(ser)
0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]
# Extracting calendar components from parsed dates
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303',
                 '2013/04/04', '2015-11-27', '2019-11-27T12:20'])

# Solution
from dateutil.parser import parse
ser_ts = ser.map(parse)  # normalise the mixed string formats into timestamps

days_of_month = ser_ts.dt.day.tolist()
# .dt.weekofyear is gone from current pandas; the ISO week from each
# timestamp's isocalendar() gives the same numbers on every pandas version.
weeks_of_year = ser_ts.map(lambda ts: ts.isocalendar()[1]).tolist()
days_of_year = ser_ts.dt.dayofyear.tolist()
# .dt.weekday_name was removed in pandas 1.0; day_name() is its replacement.
weekday_names = ser_ts.dt.day_name().tolist()

print("這個月的第幾天: ", days_of_month)  # day of the month
print("這一年的第幾周: ", weeks_of_year)  # week of the year
print("這一年的第幾天: ", days_of_year)  # day of the year
print("這一天的星期幾: ", weekday_names)  # weekday name
這個月的第幾天:  [1, 2, 3, 4, 27, 27]
這一年的第幾周:  [53, 5, 9, 14, 48, 48]
這一年的第幾天:  [1, 33, 63, 94, 331, 331]
這一天的星期幾:  ['Friday', 'Wednesday', 'Saturday', 'Thursday', 'Friday', 'Wednesday']
# 已知年月的數值,現在設置天數爲4
ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])

# 解
from dateutil.parser import parse
ser.map(lambda x: parse('04 ' + x))
0   2010-01-04
1   2011-02-04
2   2012-03-04
dtype: datetime64[ns]
# 從中ser,提取包含至少2個元音的單詞
ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])

# 解
from collections import Counter
mask = ser.map(lambda x: sum([Counter(x.lower()).get(i, 0) 
                              for i in list('aeiou')]) >= 2)
ser[mask]
0     Apple
1    Orange
4     Money
dtype: object
import re
# 從系列中過濾有效的電子郵件
emails = pd.Series(['buying books at amazom.com', '[email protected]',
                    '[email protected]', '[email protected]'])
pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'

# 解
emails.str.findall(pattern, flags=re.IGNORECASE)
0                     []
1    [[email protected]]
2            [[email protected]]
3    [[email protected]]
dtype: object
# Compute the mean of weights for each kind of fruit
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))
weights = pd.Series(np.linspace(1, 10, 10))

# 解
weights.groupby(fruit).mean()
apple     5.428571
banana    7.500000
carrot    2.000000
dtype: float64
# 計算兩個序列之間的歐式距離
p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])

# 解
result=sum((p - q)**2)**.5#方式一
print(result)
result=np.linalg.norm(p-q)#方式二
result
18.16590212458495





18.16590212458495
# 找到一個數值序列中的所有局部最大值(或峯值)
ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])

# 解
dd=np.diff(np.sign(np.diff(ser)))
print(dd)# -2 就是某個點,兩邊的值比它小
peak_locs=np.where(dd==-2)[0]+1
print(peak_locs)
[-2  2  0  0 -2  2 -2]
[1 5 7]
# 用最不頻繁的字符替換字符串中的缺失空格
my_str = 'dbc deb abed gade'
ser = pd.Series(list('dbc deb abed gade'))

# 解
freq = ser.value_counts()#查看數值出現的頻率
print(freq)
least_freq = freq.dropna().index[-1]#獲取頻率最小那個
result="".join(ser.replace(' ', least_freq))
result
d    4
     3
e    3
b    3
a    2
g    1
c    1
dtype: int64





'dbccdebcabedcgade'
# 如何創建一個從2000年1月1日開始和之後的10個週末(星期六)的TimeSeries,
#【並使用隨機數作爲值】
ser = pd.Series(np.random.randint(1,10,10), 
                pd.date_range('2000-01-01', periods=10, freq='W-SAT'))
print(ser)

2000-01-01    7
2000-01-08    7
2000-01-15    3
2000-01-22    1
2000-01-29    4
2000-02-05    8
2000-02-12    7
2000-02-19    8
2000-02-26    3
2000-03-04    4
Freq: W-SAT, dtype: int32
# ser has gaps in both its dates and its values: list every missing date and
# fill each one with the value of the previous available date.
ser = pd.Series([1,10,3,np.nan], index=pd.to_datetime([
                    '2000-01-01', '2000-01-03', '2000-01-06', '2000-01-08']))
print(ser)
result=ser.resample('D').ffill()  # upsample to daily rows and forward-fill from the previous date
print(result)
# ser.resample('D').bfill()  # alternative: back-fill from the next date
# ser.resample('D').bfill().ffill()  # alternative: back-fill first, then forward-fill what is still missing
2000-01-01     1.0
2000-01-03    10.0
2000-01-06     3.0
2000-01-08     NaN
dtype: float64
2000-01-01     1.0
2000-01-02     1.0
2000-01-03    10.0
2000-01-04    10.0
2000-01-05    10.0
2000-01-06     3.0
2000-01-07     3.0
2000-01-08     NaN
Freq: D, dtype: float64
# 從csv文件中的每第n行導入一個數據框
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', chunksize=50)

df2 = pd.concat([chunk.iloc[0] for chunk in df], axis=1)
df2 = df2.transpose()#轉置操作
df2
crim zn indus chas nox rm age dis rad tax ptratio b lstat medv
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98 24.0
50 0.08873 21.0 5.64 0.0 0.439 5.963 45.7 6.8147 4.0 243.0 16.8 395.56 13.45 19.7
100 0.14866 0.0 8.56 0.0 0.520 6.727 79.9 2.7778 5.0 384.0 20.9 394.76 9.42 27.5
150 1.65660 0.0 19.58 0.0 0.871 6.122 97.3 1.6180 5.0 403.0 14.7 372.80 14.10 21.5
200 0.01778 95.0 1.47 0.0 0.403 7.135 13.9 7.6534 3.0 402.0 17.0 384.30 4.45 32.9
250 0.14030 22.0 5.86 0.0 0.431 6.487 13.0 7.3967 7.0 330.0 19.1 396.28 5.90 24.4
300 0.04417 70.0 2.24 0.0 0.400 6.871 47.4 7.8278 5.0 358.0 14.8 390.86 6.07 24.8
350 0.06211 40.0 1.25 0.0 0.429 6.490 44.4 8.7921 1.0 335.0 19.7 396.90 5.98 22.9
400 25.04610 0.0 18.10 0.0 0.693 5.987 100.0 1.5888 24.0 666.0 20.2 396.90 26.77 5.6
450 6.71772 0.0 18.10 0.0 0.713 6.749 92.6 2.3236 24.0 666.0 20.2 0.32 17.44 13.4
500 0.22438 0.0 9.69 0.0 0.585 6.027 79.7 2.4982 6.0 391.0 19.2 396.90 14.33 16.8
# 將csv導入數據框時更改列值
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', 
                converters={'medv': lambda x: 'High' if float(x) > 25 else 'Low'})
df
crim zn indus chas nox rm age dis rad tax ptratio b lstat medv
0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 Low
1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 Low
2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 High
3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 High
4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33 High
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
501 0.06263 0.0 11.93 0 0.573 6.593 69.1 2.4786 1 273 21.0 391.99 9.67 Low
502 0.04527 0.0 11.93 0 0.573 6.120 76.7 2.2875 1 273 21.0 396.90 9.08 Low
503 0.06076 0.0 11.93 0 0.573 6.976 91.0 2.1675 1 273 21.0 396.90 5.64 Low
504 0.10959 0.0 11.93 0 0.573 6.794 89.3 2.3889 1 273 21.0 393.45 6.48 Low
505 0.04741 0.0 11.93 0 0.573 6.030 80.8 2.5050 1 273 21.0 396.90 7.88 Low

506 rows × 14 columns

# 創建一個數據框,將行作爲給定序列的跨步
L = pd.Series(range(15))

def gen_strides(a, stride_len=5, window_len=5):
    """Return fixed-length sliding windows of a 1-D sequence as rows of an array.

    Args:
        a: 1-D array-like (NumPy array, pandas Series, list, ...).
        stride_len: Distance between the starts of consecutive windows;
            must be a positive integer.
        window_len: Number of elements in every window.

    Returns:
        A 2-D ``numpy.ndarray`` with one complete window per row. Only windows
        that fit entirely inside ``a`` are produced; when none fits, the
        result is an empty array of shape ``(0, window_len)``.
    """
    values = np.asarray(a)
    # Clamp at zero: when window_len exceeds the input the formula goes
    # negative, and a negative slice bound would trim starts from the end
    # rather than select none, leaving ragged, truncated windows.
    n_strides = max(0, (values.size - window_len) // stride_len + 1)
    if n_strides == 0:
        return np.empty((0, window_len), dtype=values.dtype)
    starts = np.arange(0, values.size, stride_len)[:n_strides]
    return np.array([values[s:s + window_len] for s in starts])

gen_strides(L, stride_len=2, window_len=4)

array([[ 0,  1,  2,  3],
       [ 2,  3,  4,  5],
       [ 4,  5,  6,  7],
       [ 6,  7,  8,  9],
       [ 8,  9, 10, 11],
       [10, 11, 12, 13]], dtype=int64)
# 僅從csv文件導入指定的列
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv',
                 usecols=['crim', 'medv'])
df.head()
crim medv
0 0.00632 24.0
1 0.02731 21.6
2 0.02729 34.7
3 0.03237 33.4
4 0.06905 36.2
# 獲取數據幀每列的n 行,n列,數據類型和摘要狀態
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
# print(df.shape)#數據的行,列
# print(df.dtypes)#打印每列數據的類型
# print(df.dtypes.value_counts())#每個dtype下有多少個
# print(df.values.tolist())#把數據合成列表
df.describe()#統計信息
Min.Price Price Max.Price MPG.city MPG.highway EngineSize Horsepower RPM Rev.per.mile Fuel.tank.capacity Passengers Length Wheelbase Width Turn.circle Rear.seat.room Luggage.room Weight
count 86.000000 91.000000 88.000000 84.000000 91.000000 91.000000 86.000000 90.000000 87.000000 85.000000 91.000000 89.000000 92.000000 87.000000 88.000000 89.000000 74.000000 86.000000
mean 17.118605 19.616484 21.459091 22.404762 29.065934 2.658242 144.000000 5276.666667 2355.000000 16.683529 5.076923 182.865169 103.956522 69.448276 38.954545 27.853933 13.986486 3104.593023
std 8.828290 9.724280 10.696563 5.841520 5.370293 1.045845 53.455204 605.554811 486.916616 3.375748 1.045953 14.792651 6.856317 3.778023 3.304157 3.018129 3.120824 600.129993
min 6.700000 7.400000 7.900000 15.000000 20.000000 1.000000 55.000000 3800.000000 1320.000000 9.200000 2.000000 141.000000 90.000000 60.000000 32.000000 19.000000 6.000000 1695.000000
25% 10.825000 12.350000 14.575000 18.000000 26.000000 1.800000 100.750000 4800.000000 2017.500000 14.500000 4.000000 174.000000 98.000000 67.000000 36.000000 26.000000 12.000000 2647.500000
50% 14.600000 17.700000 19.150000 21.000000 28.000000 2.300000 140.000000 5200.000000 2360.000000 16.500000 5.000000 181.000000 103.000000 69.000000 39.000000 27.500000 14.000000 3085.000000
75% 20.250000 23.500000 24.825000 25.000000 31.000000 3.250000 170.000000 5787.500000 2565.000000 19.000000 6.000000 192.000000 110.000000 72.000000 42.000000 30.000000 16.000000 3567.500000
max 45.400000 61.900000 80.000000 46.000000 50.000000 5.700000 300.000000 6500.000000 3755.000000 27.000000 8.000000 219.000000 119.000000 78.000000 45.000000 36.000000 22.000000 4105.000000
# Which manufacturer, model and type has the highest Price? What are the row
# and column numbers of the cell that holds the highest Price?
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

# Rows whose Price equals the column maximum, limited to the identifying columns
max_price=df.loc[df.Price == np.max(df.Price), ['Manufacturer', 'Model', 'Type']]
print(max_price)

# Positional (row, column) coordinates of every cell equal to the maximum Price
row, col = np.where(df.values == np.max(df.Price))
print(row,col)

# print(df.iat[row[0], col[0]])  # same lookup through iat
print(df.iloc[row[0], col[0]])  # value at the first matching (row, column) position

print(df.at[row[0], 'Price'])  # Price value at index label row[0]
     Manufacturer Model     Type
58  Mercedes-Benz  300E  Midsize
[58] [4]
61.9
61.9
# 重命名數據框中的特定列
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
# print(df.columns)
df.columns = df.columns.map(lambda x: x.replace('.', '_'))
print(df.columns)
Index(['Manufacturer', 'Model', 'Type', 'Min_Price', 'Price', 'Max_Price',
       'MPG_city', 'MPG_highway', 'AirBags', 'DriveTrain', 'Cylinders',
       'EngineSize', 'Horsepower', 'RPM', 'Rev_per_mile', 'Man_trans_avail',
       'Fuel_tank_capacity', 'Passengers', 'Length', 'Wheelbase', 'Width',
       'Turn_circle', 'Rear_seat_room', 'Luggage_room', 'Weight', 'Origin',
       'Make'],
      dtype='object')
# 檢查數據框是否缺少任何值
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

df.isnull().values.any()
True
# 計算每列中缺失值的數量,哪一列的缺失值最大
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

count=df.apply(lambda x: x.isnull().sum())
print(count)#缺失值的數量
print(count.idxmax())#缺失值最大
# Select a specific column as a DataFrame rather than as a Series
df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))
print(df)

# A list-like selector keeps the result two-dimensional (DataFrame).
frame_selections = [df[['a']], df.loc[:, ['a']], df.iloc[:, [0]]]
for selection in frame_selections:
    print(type(selection))

# A scalar selector collapses the result to a Series. Column 'a' sits at
# position 0, so iloc uses 0 here to match the other three selectors
# (position 1 would silently pick column 'b').
series_selections = [df.a, df['a'], df.loc[:, 'a'], df.iloc[:, 0]]
for selection in series_selections:
    print(type(selection))
    a   b   c   d   e
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))
print(df)#沒有改變之前
# print(df[list('cbade')])#交換兩列的位置(交換a,c兩列)

# 自定義函數交換
def switch_columns(df,col_1=None,col_2=None):
    """Return ``df`` with the positions of columns ``col_1`` and ``col_2`` exchanged.

    The input frame is left as it is; the result comes from re-selecting its
    columns in the new order. A ``ValueError`` is raised by ``list.index``
    when either label is not among the columns.
    """
    labels = df.columns.tolist()
    first_pos = labels.index(col_1)
    second_pos = labels.index(col_2)
    # Map each of the two positions onto the other; all others map to themselves.
    remap = {first_pos: second_pos, second_pos: first_pos}
    reordered = [labels[remap.get(pos, pos)] for pos in range(len(labels))]
    return df[reordered]
df1=switch_columns(df,'a','c')#交換兩列的位置
print(df1)
print(sorted(df.columns,reverse=True))#降序排列

    a   b   c   d   e
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
    c   b   a   d   e
0   2   1   0   3   4
1   7   6   5   8   9
2  12  11  10  13  14
3  17  16  15  18  19
['e', 'd', 'c', 'b', 'a']
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
pd.set_option("display.max_columns",10)
pd.set_option('display.max_rows',10)
df
Manufacturer Model Type Min.Price Price ... Rear.seat.room Luggage.room Weight Origin Make
0 Acura Integra Small 12.9 15.9 ... 26.5 NaN 2705.0 non-USA Acura Integra
1 NaN Legend Midsize 29.2 33.9 ... 30.0 15.0 3560.0 non-USA Acura Legend
2 Audi 90 Compact 25.9 29.1 ... 28.0 14.0 3375.0 non-USA Audi 90
3 Audi 100 Midsize NaN 37.7 ... 31.0 17.0 3405.0 non-USA Audi 100
4 BMW 535i Midsize NaN 30.0 ... 27.0 13.0 3640.0 non-USA BMW 535i
... ... ... ... ... ... ... ... ... ... ... ...
88 Volkswagen Eurovan Van 16.6 19.7 ... 34.0 NaN 3960.0 NaN Volkswagen Eurovan
89 Volkswagen Passat Compact 17.6 20.0 ... 31.5 14.0 2985.0 non-USA Volkswagen Passat
90 Volkswagen Corrado Sporty 22.9 23.3 ... 26.0 15.0 2810.0 non-USA Volkswagen Corrado
91 Volvo 240 Compact 21.8 22.7 ... 29.5 14.0 2985.0 non-USA Volvo 240
92 NaN 850 Midsize 24.8 26.7 ... 30.0 15.0 3245.0 non-USA Volvo 850

93 rows × 27 columns

df = pd.DataFrame(np.random.random(4), columns=['random'])
# 格式化或隱藏熊貓數據框中的科學計數法
print(df.round(4))#顯示後面4個小數
# 將數據框中的所有值格式化爲百分比
out=df.style.format({
    'random':'{0:.2%}'.format,
})
out
   random
0  0.8620
1  0.7903
2  0.0159
3  0.5417
            <tr>
                    <th id="T_75726664_11f2_11ea_b9d2_cc2f7187c201level0_row0" class="row_heading level0 row0" >0</th>
                    <td id="T_75726664_11f2_11ea_b9d2_cc2f7187c201row0_col0" class="data row0 col0" >86.20%</td>
        </tr>
        <tr>
                    <th id="T_75726664_11f2_11ea_b9d2_cc2f7187c201level0_row1" class="row_heading level0 row1" >1</th>
                    <td id="T_75726664_11f2_11ea_b9d2_cc2f7187c201row1_col0" class="data row1 col0" >79.03%</td>
        </tr>
        <tr>
                    <th id="T_75726664_11f2_11ea_b9d2_cc2f7187c201level0_row2" class="row_heading level0 row2" >2</th>
                    <td id="T_75726664_11f2_11ea_b9d2_cc2f7187c201row2_col0" class="data row2 col0" >1.59%</td>
        </tr>
        <tr>
                    <th id="T_75726664_11f2_11ea_b9d2_cc2f7187c201level0_row3" class="row_heading level0 row3" >3</th>
                    <td id="T_75726664_11f2_11ea_b9d2_cc2f7187c201row3_col0" class="data row3 col0" >54.17%</td>
        </tr>
</tbody></table>
random
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

df.iloc[::20, :][['Manufacturer', 'Model', 'Type']]#間隔20取一行
Manufacturer Model Type
0 Acura Integra Small
20 Chrysler LeBaron Compact
40 Honda Prelude Sporty
60 Mercury Cougar Midsize
80 Subaru Loyale Small
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv', 
                 usecols=[0,1,2,3,5])
# df.head()
df[['Manufacturer', 'Model', 'Type']] = df[['Manufacturer', 'Model', 'Type']].fillna('missing')
# df.head()
df.index = df.Manufacturer + '_' + df.Model + '_' + df.Type
print(df.index.is_unique)#通過組合相關列來創建主鍵索引
# df.head()
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randint(1, 30, 30).reshape(10,-1), columns=list('abc'))

print(df['a'])
print(df['a'].argsort())  # positions that would sort column 'a' in ascending order
# Row position of the value at a given rank in column 'a', largest first.
# The ordering is taken from the raw NumPy array: Series.argsort() returns a
# Series, and [k] on a reversed integer-indexed Series is a label lookup, so
# the reversal was silently ignored and an ascending rank came back instead.
rank = 5  # zero-based rank in descending order (0 is the largest value)
order_desc = df['a'].values.argsort()[::-1]
nth_largest_row = order_desc[rank]
nth_largest_row
0    25
1    16
2    12
3     8
4     6
5    17
6    15
7    24
8    16
9    28
Name: a, dtype: int32
0    4
1    3
2    2
3    6
4    1
5    8
6    5
7    7
8    0
9    9
Name: a, dtype: int64





8
ser = pd.Series(np.random.randint(1, 100, 15))

print('ser: ', ser.tolist(), 'mean: ', round(ser.mean()))
# Positions of every value in ser that is greater than the mean.
# NOTE(review): the original note seems to ask only for the position of the
# second such value; this expression returns all of them. Confirm which is intended.
np.argwhere(ser>ser.mean())
ser:  [54, 77, 49, 74, 24, 95, 94, 14, 7, 50, 69, 65, 72, 72, 58] mean:  58.0





array([[ 1],
       [ 3],
       [ 5],
       [ 6],
       [10],
       [11],
       [12],
       [13]], dtype=int64)
# Get the last n rows of a DataFrame whose row sum is greater than 100
df = pd.DataFrame(np.random.randint(10, 40, 60).reshape(-1, 4))

rowsums = df.sum(axis=1)  # total of every row
qualifying_positions = np.flatnonzero(rowsums.values > 100)  # positions of rows over 100
tail_positions = qualifying_positions[-2:]  # the last two of them (positional indexes)
print(tail_positions)
last_two_rows = df.iloc[tail_positions, :]
last_two_rows
[11 14]
0 1 2 3
11 27 32 22 32
14 21 35 37 30
# 將ser較低的5%ile和大於95%ile中的所有值分別替換爲第5個和第95%ile值。
ser = pd.Series(np.logspace(-2, 2, 30))

def cap_outliers(ser, low_perc, high_perc):
    """Return a copy of ``ser`` with values clamped to a quantile range.

    Args:
        ser: Numeric pandas Series.
        low_perc: Lower quantile as a fraction in [0, 1] (e.g. ``.05``).
        high_perc: Upper quantile as a fraction in [0, 1] (e.g. ``.95``).

    Returns:
        A new Series in which values below the lower quantile are replaced by
        that quantile and values above the upper quantile by the upper one.
        The input Series is not modified.
    """
    low, high = ser.quantile([low_perc, high_perc])
    print(low_perc, '%ile: ', low, '|', high_perc, '%ile: ', high)
    # clip() builds a new Series, so the caller's data is no longer
    # overwritten as a side effect of computing the capped values.
    return ser.clip(lower=low, upper=high)

capped_ser = cap_outliers(ser, .05, .95)
print(capped_ser)
0.05 %ile:  0.016049294076965887 | 0.95 %ile:  63.876672220183934
0      0.016049
1      0.016049
2      0.018874
3      0.025929
4      0.035622
5      0.048939
6      0.067234
7      0.092367
8      0.126896
9      0.174333
10     0.239503
11     0.329034
12     0.452035
13     0.621017
14     0.853168
15     1.172102
16     1.610262
17     2.212216
18     3.039195
19     4.175319
20     5.736153
21     7.880463
22    10.826367
23    14.873521
24    20.433597
25    28.072162
26    38.566204
27    52.983169
28    63.876672
29    63.876672
dtype: float64
# Reshape the DataFrame into the largest possible square after dropping the
# values that are not positive
df = pd.DataFrame(np.random.randint(-20, 50, 100).reshape(10,-1))
arr = df[df > 0].values.flatten()  # masking turns every value that is not > 0 into NaN
arr_qualified = arr[~np.isnan(arr)]  # drop the NaN placeholders
print(arr_qualified)
# Side length of the largest square that the remaining values can fill.
# It has to be computed before its first use below; the notebook only ran
# because an earlier execution had left a stale n in memory.
n = int(np.floor(arr_qualified.shape[0]**.5))
# Indexes ordered from the largest value to the smallest. The reversal is
# required so that the n*n values kept are the largest ones; without it the
# slice below keeps the smallest values instead.
top_indexes = np.argsort(arr_qualified)[::-1]
print(top_indexes[:n**2])
# Re-sort the kept indexes so the values stay in their original order, then
# lay them out as an n x n square.
output = np.take(arr_qualified, sorted(top_indexes[:n**2])).reshape(n, -1)
print(output)

[ 6. 29. 48. 22. 14. 10. 49.  9. 18. 42. 31. 42. 16. 35. 45. 10.  2. 27.
 48.  2. 16. 48. 22. 12. 23. 13. 34. 38. 18. 10. 12. 48. 39. 18. 49. 24.
 35. 13. 16. 30. 35. 22. 44. 46.  8. 30.  1.  5. 30.  7. 15. 22.  6. 43.
 47.  8. 32. 21. 46.  5. 20. 39.  9. 17.]
[46 19 16 59 47  0 52 49 44 55 62  7  5 15 29 30 23 25 37  4 50 12 38 20
 63  8 33 28 60 57 22 51  3 41 24 35 17  1 45 39 48 10 56 26 40 13 36 27
 32 61 11  9 53 42 14 43 58 54  2 18 21 31 34  6]
[[ 6. 29. 48. 22. 14. 10. 49.  9.]
 [18. 42. 31. 42. 16. 35. 45. 10.]
 [ 2. 27. 48.  2. 16. 48. 22. 12.]
 [23. 13. 34. 38. 18. 10. 12. 48.]
 [39. 18. 49. 24. 35. 13. 16. 30.]
 [35. 22. 44. 46.  8. 30.  1.  5.]
 [30.  7. 15. 22.  6. 43. 47.  8.]
 [32. 21. 46.  5. 20. 39.  9. 17.]]
df=pd.DataFrame(np.arange(25).reshape(5,-1))
print(df)

# 交換數據幀的兩行
def swap_rows(df,i1,i2):
    """Exchange the rows at positions ``i1`` and ``i2`` of ``df`` in place.

    Returns the same DataFrame object that was passed in, after modification.
    """
    # Snapshot both rows first so that neither assignment reads data the
    # other one has already overwritten.
    second_row = df.iloc[i2, :].copy()
    first_row = df.iloc[i1, :].copy()
    df.iloc[i1, :] = second_row
    df.iloc[i2, :] = first_row
    return df
result=swap_rows(df,1,2)
result
    0   1   2   3   4
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
4  20  21  22  23  24
0 1 2 3 4
0 0 1 2 3 4
1 10 11 12 13 14
2 5 6 7 8 9
3 15 16 17 18 19
4 20 21 22 23 24
df = pd.DataFrame(np.arange(25).reshape(5, -1))
print(df)
df.iloc[::-1, :]#反轉數據框的行
    0   1   2   3   4
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
4  20  21  22  23  24
0 1 2 3 4
4 20 21 22 23 24
3 15 16 17 18 19
2 10 11 12 13 14
1 5 6 7 8 9
0 0 1 2 3 4
df = pd.DataFrame(np.arange(25).reshape(5,-1), columns=list('abcde'))
print(df)
# 獲取'a'數據框中列的一鍵編碼df,並將其附加爲列
result=pd.get_dummies(df['a'])
df_onehot=pd.concat([result,df[list('bcde')]],axis=1)#合併列顯示
df_onehot
    a   b   c   d   e
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
4  20  21  22  23  24
0 5 10 15 20 b c d e
0 1 0 0 0 0 1 2 3 4
1 0 1 0 0 0 6 7 8 9
2 0 0 1 0 0 11 12 13 14
3 0 0 0 1 0 16 17 18 19
4 0 0 0 0 1 21 22 23 24
df = pd.DataFrame(np.random.randint(1,100, 40).reshape(10, -1))
print(df)
print(df.apply(np.argmax, axis=1))#每行的最大值【這裏axis是相反的功能】
print(df.apply(np.argmax, axis=1).value_counts())#計算出現的次數【降序排序】
print(df.apply(np.argmax, axis=1).value_counts().index[0])#獲取最多出現的
    0   1   2   3
0  10  87  19  43
1   5  83  50  80
2  19  24  10  77
3  36  15  95  78
4   8  20  89  48
5  17  17  81  46
6  88  74  52  72
7  91  53  36  61
8  25  53  22  90
9   3  93  86  63
0    1
1    1
2    3
3    2
4    2
5    2
6    0
7    0
8    3
9    1
dtype: int64
2    3
1    3
3    2
0    2
dtype: int64
2
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1), 
                 index=list('abcdefgh'), columns=list('pqrstuvwxy'))
print(df)
print(df.corr())#各類型之間的相關係數DataFrame表格
abs_corrmat = np.abs(df.corr())#區絕對值
print(abs_corrmat)
max_corr = abs_corrmat.apply(lambda x: sorted(x)[-2])#取倒數第二位
print(max_corr)#
print(np.round(max_corr.tolist(), 2))#取小數點後面兩位
    p   q   r   s   t   u   v   w   x   y
a  41  72   5  31  67  26  45  65  21  60
b  15  56  72  91  99  32  38  14  52  36
c   7  92  96  84  26  79  81  12  75  50
d  73  46  42  15  80  76  10  34  45   5
e  15  72  55  14  17  54   9  35  36  18
f  12  73  47  84  85   9  31  67  13  64
g  25  43  56  76  62  43  93  25  53  99
h  80  70  30  68  40  74   2  41   7  47
          p         q         r         s         t         u         v  \
p  1.000000 -0.348616 -0.606492 -0.406070  0.056932  0.461015 -0.543905   
q -0.348616  1.000000  0.213753  0.159442 -0.554585  0.088275  0.023752   
r -0.606492  0.213753  1.000000  0.501096 -0.190063  0.290160  0.440666   
s -0.406070  0.159442  0.501096  1.000000  0.260183 -0.243481  0.505580   
t  0.056932 -0.554585 -0.190063  0.260183  1.000000 -0.568596 -0.009954   
u  0.461015  0.088275  0.290160 -0.243481 -0.568596  1.000000 -0.118254   
v -0.543905  0.023752  0.440666  0.505580 -0.009954 -0.118254  1.000000   
w  0.207508  0.125992 -0.797659 -0.285267  0.192809 -0.562124 -0.358931   
x -0.452943 -0.029441  0.798096  0.187066 -0.122999  0.376021  0.637259   
y -0.294716 -0.043568 -0.043181  0.579817  0.083632 -0.434149  0.729595   

          w         x         y  
p  0.207508 -0.452943 -0.294716  
q  0.125992 -0.029441 -0.043568  
r -0.797659  0.798096 -0.043181  
s -0.285267  0.187066  0.579817  
t  0.192809 -0.122999  0.083632  
u -0.562124  0.376021 -0.434149  
v -0.358931  0.637259  0.729595  
w  1.000000 -0.835494  0.145546  
x -0.835494  1.000000 -0.030812  
y  0.145546 -0.030812  1.000000  
          p         q         r         s         t         u         v  \
p  1.000000  0.348616  0.606492  0.406070  0.056932  0.461015  0.543905   
q  0.348616  1.000000  0.213753  0.159442  0.554585  0.088275  0.023752   
r  0.606492  0.213753  1.000000  0.501096  0.190063  0.290160  0.440666   
s  0.406070  0.159442  0.501096  1.000000  0.260183  0.243481  0.505580   
t  0.056932  0.554585  0.190063  0.260183  1.000000  0.568596  0.009954   
u  0.461015  0.088275  0.290160  0.243481  0.568596  1.000000  0.118254   
v  0.543905  0.023752  0.440666  0.505580  0.009954  0.118254  1.000000   
w  0.207508  0.125992  0.797659  0.285267  0.192809  0.562124  0.358931   
x  0.452943  0.029441  0.798096  0.187066  0.122999  0.376021  0.637259   
y  0.294716  0.043568  0.043181  0.579817  0.083632  0.434149  0.729595   

          w         x         y  
p  0.207508  0.452943  0.294716  
q  0.125992  0.029441  0.043568  
r  0.797659  0.798096  0.043181  
s  0.285267  0.187066  0.579817  
t  0.192809  0.122999  0.083632  
u  0.562124  0.376021  0.434149  
v  0.358931  0.637259  0.729595  
w  1.000000  0.835494  0.145546  
x  0.835494  1.000000  0.030812  
y  0.145546  0.030812  1.000000  
p    0.606492
q    0.554585
r    0.798096
s    0.579817
t    0.568596
u    0.568596
v    0.729595
w    0.835494
x    0.835494
y    0.729595
dtype: float64
[0.61 0.55 0.8  0.58 0.57 0.57 0.73 0.84 0.84 0.73]
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
print(df)
# 獲取df對象中的值【最小值/最大值】
min_by_max = np.min(df, axis=1)/np.max(df, axis=1)
min_by_max
    0   1   2   3   4   5   6   7   8   9
0  85  63  99  34  13  14  64  33  58  16
1  64  45  77  68  19  45  61   2  11  15
2  78  66  76  51  51  52  20  53  35  64
3  68  85   2  81  52  66  14  28  41  34
4  37  40  99  62  57  70  37  15  14  56
5  13  88  12  51  43   1  54  18  70  67
6  55  19  79  43  19   8  52   6  15  77
7  79  93  54  68  78  61  80  33  72  92





0    0.131313
1    0.025974
2    0.256410
3    0.023529
4    0.141414
5    0.011364
6    0.075949
7    0.354839
dtype: float64
# Create a new column 'penultimate' in df holding the second-largest value of each row
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
result=df.apply(lambda x:x.sort_values().unique()[-2],axis=1)
df['penultimate'] = result
df
0 1 2 3 4 5 6 7 8 9 penultimate
0 50 12 77 25 22 97 49 40 27 18 77
1 14 52 78 3 67 5 77 17 43 53 77
2 92 53 10 39 55 34 63 89 60 41 89
3 9 89 66 50 88 4 46 19 87 75 88
4 97 95 75 50 91 60 65 3 24 59 95
5 31 38 4 81 9 1 52 71 84 57 81
6 59 7 19 33 49 40 54 60 48 4 59
7 90 21 77 44 3 50 98 23 84 30 90
# 規範數據框中的所有列
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
print(df)

# df通過減去列均值併除以標準偏差來歸一化所有列
result=df.apply(lambda x:(
    (x - x.mean())/x.std()
).round(2))
result
    0   1   2   3   4   5   6   7   8   9
0  73  77  53  35   9  80  96  47  35  26
1  58  72  39  80  86  57  41  98  31  90
2  45  76  22  27   5  15  78  90  87  92
3  89  84  97  78  29  70  23  95  97  90
4  55  32  83  49  99  63  22  75  44  26
5  74  42  70  49  57  26  88  77   1   5
6  56  29  42  28  75  16  21  11  38  50
7  99  26  74  74  39  50  61   3  23   3
0 1 2 3 4 5 6 7 8 9
0 0.24 0.90 -0.28 -0.79 -1.16 1.31 1.36 -0.40 -0.30 -0.57
1 -0.57 0.70 -0.83 1.24 1.03 0.39 -0.41 0.96 -0.42 1.10
2 -1.28 0.86 -1.51 -1.15 -1.28 -1.28 0.78 0.75 1.32 1.15
3 1.10 1.18 1.47 1.15 -0.59 0.91 -0.99 0.88 1.63 1.10
4 -0.74 -0.92 0.91 -0.16 1.40 0.63 -1.02 0.35 -0.02 -0.57
5 0.29 -0.52 0.40 -0.16 0.20 -0.84 1.10 0.40 -1.35 -1.11
6 -0.68 -1.04 -0.71 -1.10 0.72 -1.24 -1.05 -1.36 -0.20 0.06
7 1.64 -1.16 0.56 0.97 -0.31 0.11 0.23 -1.57 -0.67 -1.17
# 規範數據框中的所有列
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
print(df)

# 排列所有列的範圍df,以使每一列的最小值爲0且最大值爲1。
result = df.apply(lambda x: (
    1 - (x.max() - x)/(x.max() - x.min())
).round(2))
result
    0   1   2   3   4   5   6   7   8   9
0  78   7  59   4  77  81  93  66  39  28
1  60  51  88  19  23  29  70  82  10  24
2   2  80   7  59  72  51  82  28  38  25
3  36  88   3   8  43   7  87  60  28  99
4  29  69  89  84  87  15  95  87  75  54
5  82  78  60  57  15  29  41  93  57  13
6  72  28  63   2  20  25   6  72  71  32
7  60   2  13  87  82  97  41  23  81  16
0 1 2 3 4 5 6 7 8 9
0 0.95 0.06 0.65 0.02 0.86 0.82 0.98 0.61 0.41 0.17
1 0.72 0.57 0.99 0.20 0.11 0.24 0.72 0.84 0.00 0.13
2 0.00 0.91 0.05 0.67 0.79 0.49 0.85 0.07 0.39 0.14
3 0.43 1.00 0.00 0.07 0.39 0.00 0.91 0.53 0.25 1.00
4 0.34 0.78 1.00 0.96 1.00 0.09 1.00 0.91 0.92 0.48
5 1.00 0.88 0.66 0.65 0.00 0.24 0.39 1.00 0.66 0.00
6 0.88 0.30 0.70 0.00 0.07 0.20 0.00 0.70 0.86 0.22
7 0.72 0.00 0.12 1.00 0.93 1.00 0.39 0.00 1.00 0.03
# Compute the correlation coefficient of each row with the row that follows it
df = pd.DataFrame(np.random.randint(1, 100, 80).reshape(8, -1))
print(df)
# Positions of every row except the last one, which has no successor.
leading_rows = range(len(df) - 1)
print(list(leading_rows))

result = [
    df.iloc[pos].corr(df.iloc[pos + 1])
    for pos in leading_rows
]
result
    0   1   2   3   4   5   6   7   8   9
0  27   3  37   9  76  68  91  31  44   7
1  11  15  20  47  33  86  65  47   9  30
2  39   1  72  19  35  42  87  77  55  40
3  60   7   8  28  37  14  17   5   3   7
4  47  99  76  28  77  57  32  57  24  16
5   2  50  95  89  84  46  59  84   1   2
6  78  27  58  67  78   1   7  28  89  20
7  12  86  54  81  20  19  77   1   8  56
[0, 1, 2, 3, 4, 5, 6]





[0.5182965633327684,
 0.2595376913412023,
 -0.23874062518280761,
 0.005261734793477499,
 0.4687394611664755,
 -0.06555011633952691,
 -0.30907671467693215]
# Set both the main diagonal and the anti-diagonal of a 10x10 DataFrame to 0.
df = pd.DataFrame(np.random.randint(1, 100, 100).reshape(10, -1))
rows = len(df)  # number of rows in the DataFrame
for col, mirrored_row in enumerate(reversed(range(rows))):
    df.iat[col, col] = 0           # main diagonal cell of this column
    df.iat[mirrored_row, col] = 0  # anti-diagonal cell of this column
df
0 1 2 3 4 5 6 7 8 9
0 0 65 92 82 10 1 51 71 32 0
1 79 0 11 99 28 68 24 8 0 83
2 34 4 0 35 11 91 83 0 41 29
3 84 72 5 0 65 76 0 25 25 64
4 98 14 2 10 0 0 2 94 40 84
5 75 8 8 27 0 0 23 62 73 95
6 23 43 38 0 36 43 0 7 65 6
7 80 96 0 82 92 79 64 0 61 67
8 29 0 96 96 76 21 94 72 0 4
9 0 26 27 65 95 19 19 1 90 0
# Fetch one specific group of a grouped DataFrame by its key
fruit_names = ['apple', 'banana', 'orange'] * 3
df = pd.DataFrame({
    'col1': fruit_names,
    'col2': np.random.rand(9),
    'col3': np.random.randint(0, 15, 9),
})
print(df)
# Group the rows by the values of 'col1' and show the rows of the 'apple' group
by_fruit = df.groupby('col1')
by_fruit.get_group('apple')
     col1      col2  col3
0   apple  0.703158    12
1  banana  0.535815    13
2  orange  0.177147     8
3   apple  0.159570     2
4  banana  0.411271    10
5  orange  0.279007    11
6   apple  0.576264     4
7  banana  0.578607     9
8  orange  0.242959     6
col1 col2 col3
0 apple 0.703158 12
3 apple 0.159570 2
6 apple 0.576264 4
# Get the n-th largest value of one column when grouping by another column
df = pd.DataFrame({
    'fruit': ['apple', 'banana', 'orange'] * 4,
    'taste': np.random.rand(12),
    'price': np.random.randint(0, 15, 12),
})
# Pick the 'taste' values of the rows whose 'fruit' is 'banana'
is_banana = df['fruit'] == 'banana'
banana = df.loc[is_banana, 'taste']
print(banana)
# Sort descending; the entry at position 1 is the second-largest value
second_best = banana.sort_values(ascending=False).iloc[1]
print("特定結果:", second_best)
df
1     0.209485
4     0.549818
7     0.498802
10    0.006632
Name: taste, dtype: float64
特定結果: 0.4988018517868045
fruit taste price
0 apple 0.510446 7
1 banana 0.209485 1
2 orange 0.632166 1
3 apple 0.865764 4
4 banana 0.549818 9
5 orange 0.744718 5
6 apple 0.069171 0
7 banana 0.498802 14
8 orange 0.011808 2
9 apple 0.103222 13
10 banana 0.006632 6
11 orange 0.017787 13
# Compute a grouped mean on a pandas DataFrame and keep the grouping column
# as a regular column (rather than as the index)
df = pd.DataFrame({
    'fruit': ['apple', 'banana', 'orange'] * 3,
    'rating': np.random.rand(9),
    'price': np.random.randint(0, 15, 9),
})
print(df)
# Average 'price' for each distinct 'fruit'; reset_index() moves the group
# key out of the index and back into a column.
mean_price = df.groupby('fruit')['price'].mean()
out = mean_price.reset_index()
out
    fruit    rating  price
0   apple  0.090672      2
1  banana  0.019506      0
2  orange  0.354463      5
3   apple  0.466694     14
4  banana  0.807733      8
5  orange  0.488868      4
6   apple  0.640913      8
7  banana  0.977691      8
8  orange  0.390033      0
fruit price
0 apple 8.000000
1 banana 5.333333
2 orange 3.000000
# Join two DataFrames on two key columns so that only the common rows remain
df1 = pd.DataFrame({
    'fruit': ['apple', 'banana', 'orange'] * 3,
    'weight': ['high', 'medium', 'low'] * 3,
    'price': np.random.randint(0, 15, 9),
})

df2 = pd.DataFrame({
    'pazham': ['apple', 'orange', 'pine'] * 2,
    'pounds': ['high', 'low'] * 3,
    'price': np.random.randint(0, 15, 6),
})
print(df1)
print(df2)
# The key columns are named differently on each side, so they are paired up
# explicitly; the shared 'price' column is disambiguated through suffixes.
left_keys = ['fruit', 'weight']
right_keys = ['pazham', 'pounds']
df1.merge(
    df2,
    how='inner',
    left_on=left_keys,
    right_on=right_keys,
    suffixes=('_left', '_right'),
)
    fruit  weight  price
0   apple    high     13
1  banana  medium      5
2  orange     low     14
3   apple    high      2
4  banana  medium      2
5  orange     low      6
6   apple    high      7
7  banana  medium      1
8  orange     low      6
   pazham pounds  price
0   apple   high     10
1  orange    low     14
2    pine   high     12
3   apple    low     12
4  orange   high     11
5    pine    low      6
fruit weight price_left pazham pounds price_right
0 apple high 13 apple high 10
1 apple high 2 apple high 10
2 apple high 7 apple high 10
3 orange low 14 orange low 14
4 orange low 6 orange low 14
5 orange low 6 orange low 14
# Get the positions at which the values of two columns match
fruit_pool = ['apple', 'orange', 'banana']
df = pd.DataFrame({
    'fruit1': np.random.choice(fruit_pool, 10),
    'fruit2': np.random.choice(fruit_pool, 10),
})
print(df)
# Boolean mask of the rows where both columns agree; nonzero() yields their positions
same_fruit = (df['fruit1'] == df['fruit2']).values
np.nonzero(same_fruit)[0]
   fruit1  fruit2
0  orange  orange
1  orange   apple
2  orange   apple
3  banana  banana
4   apple  banana
5  orange  orange
6  orange  banana
7  banana  orange
8   apple  banana
9  orange  banana





array([0, 3, 5], dtype=int64)
# Create lag and lead versions of columns in a DataFrame
df = pd.DataFrame(
    np.random.randint(1, 100, 20).reshape(-1, 4),
    columns=['a', 'b', 'c', 'd'],
)

print(df)
# Lag: each row of 'a_lag' receives the value of 'a' from the previous row
df['a_lag'] = df.a.shift()
print(df)
# Lead: each row of 'b_lead' receives the value of 'b' from the next row
df['b_lead'] = df.b.shift(periods=-1)
df
    a   b   c   d
0  31  79  72  32
1   8  18  82  25
2  98  23  41  79
3   2  87  74  76
4  16  89  12  86
    a   b   c   d  a_lag
0  31  79  72  32    NaN
1   8  18  82  25   31.0
2  98  23  41  79    8.0
3   2  87  74  76   98.0
4  16  89  12  86    2.0
a b c d a_lag b_lead
0 31 79 72 32 NaN 18.0
1 8 18 82 25 31.0 23.0
2 98 23 41 79 8.0 87.0
3 2 87 74 76 98.0 89.0
4 16 89 12 86 2.0 NaN
# Get the frequency of every unique value across the whole DataFrame
df = pd.DataFrame(np.random.randint(1, 10, 20).reshape(-1, 4),
                  columns=list('abcd'))
ravel = df.to_numpy().ravel()  # flatten all cell values into a 1-D array
print(ravel)
# Count how often each value occurs. The top-level pd.value_counts() helper
# is deprecated, so wrap the array in a Series and use its method instead.
counts = pd.Series(ravel).value_counts()
counts
[6 1 6 8 7 8 3 4 1 8 7 7 1 1 9 8 5 8 5 5]





8    5
1    4
7    3
5    3
6    2
9    1
4    1
3    1
dtype: int64
# Split one text column into separate columns
df = pd.DataFrame([ "STD, City,State",
                    "33, Kolkata,West Bengal",
                    "44, Chennai,Tamil Nadu",
                    "40, Hyderabad  ,Telengana",
                    "80, Bangalore,Karnataka"],
                  columns=['row'])
# Break every row apart at each ","
df_out = df['row'].str.split(",", expand=True)
# The raw fields carry stray blanks (" City", "Hyderabad  "); strip them so
# that labels and values can be matched exactly, e.g. df_out['City'].
df_out = df_out.apply(lambda col: col.str.strip())

new_header = df_out.iloc[0]      # the first split row holds the column labels
df_out = df_out.iloc[1:].copy()  # the remaining rows are the actual data

# Assign plain labels: handing over the Series itself would also transfer its
# name (0) to the columns axis, where it shows up in the printed header.
df_out.columns = new_header.tolist()
print(df_out)
0 STD          City        State
1  33       Kolkata  West Bengal
2  44       Chennai   Tamil Nadu
3  40   Hyderabad      Telengana
4  80     Bangalore    Karnataka
發佈了47 篇原創文章 · 獲贊 3 · 訪問量 7827
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章