"""Pandas practice script: conditional columns, boolean row filters, argsort ordering.

Reads ``examples.xls`` from the working directory, derives a few columns from
conditions, writes the result to ``example_new.xls``, and then demonstrates
sorting and multi-criteria selection on a small in-memory DataFrame.
"""
import functools

import numpy as np
import pandas as pd

df = pd.read_excel("examples.xls")

# Review of yesterday's material: derive a label column from a condition.
df["level"] = np.where(df.年級 <= 2013, "old", "new")
# NOTE(review): writing the legacy .xls format depends on the installed pandas
# version / Excel writer engine -- confirm it is still supported in the target
# environment before relying on this output.
df.to_excel("example_new.xls")

# Splitting: select the rows that satisfy a condition.
df_new = df.loc[df.年級 > 2013]

# Building criteria: select rows that satisfy several conditions at once
# (also covered yesterday). This rebinds df_new from the selection above.
df_new = df[(df.年級 == 2013) & (df.是否在職生 == 0)]

# Modify an existing column on the rows matching a condition.
df.loc[(df.年級 == 2013) | (df.學習形式代碼 == 1), "註冊狀態"] = 1

# Add a new column whose value depends on a condition.
df["滿足條件"] = np.where((df.年級 == 2013) | (df.學習形式代碼 == 1), "是", "否")

# Ordering rows by a computed key.
df2 = pd.DataFrame(
    {
        'AAA': [4, 5, 6, 7],
        'BBB': [10, 20, 30, 40],
        'CCC': [100, 50, -30, -50],
    }
)
# argsort() returns integer *positions*, not index labels, so the rows are
# picked with the position-based .iloc. With df2's default RangeIndex this
# yields the same rows as .loc, but it stays correct for any other index.
df2_sort = df2.iloc[(df2.AAA - 5.5).abs().argsort().values]
df2_sort2 = df2.iloc[(df2.AAA - 5.5).argsort().values]
a = df2.AAA  # attribute access on a column yields a Series
print(df2_sort2)

# Selecting with multiple criteria.
Crit1 = df2.AAA <= 5.5
Crit2 = df2.BBB == 10
Crit3 = df2.CCC > -40.0
CritList = [Crit1, Crit2, Crit3]
# reduce folds the list left to right: (Crit1 & Crit2) & Crit3.
AllCrit = functools.reduce(lambda x, y: x & y, CritList)
print(df2.loc[AllCrit])
今天的很多知識都是昨天提到過的, 僅增加兩個知識點:
1. 根據某一列排序更快的寫法: df.loc[df.AAA.argsort()]
   # 事實上這就是用 argsort() 函數先生成一個由「排序後的整數位置」組成的 array, 再用它來選取各行。
   # 注意: argsort() 返回的是位置而不是索引標籤, 只有在預設的 RangeIndex 下兩者才一致; 若索引不是預設的, 應改用 df.iloc[...]。
2. 根據多個條件篩選, 更快的寫法: df.loc[functools.reduce(lambda x, y: x & y, CritList)]
   # lambda x, y: x & y 是一個整體, 作爲 reduce 的第一個參數 (function)
   # CritList 作爲第二個參數 (sequence)
   # 對於 reduce 的解釋: 例如 reduce(lambda x, y: x+y, [1, 2, 3, 4, 5]) 計算的是 ((((1+2)+3)+4)+5)
代碼在https://github.com/zhangjipinggom/pandas_learning