In[1]: import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
1. Appending new rows to a DataFrame
# Read the names dataset
In[2]: names = pd.read_csv('data/names.csv')
names
Out[2]:
# Use loc to assign a new row directly
In[3]: new_data_list = ['Aria', 1]
names.loc[4] = new_data_list
names
Out[3]:
# loc also accepts a label when assigning a new row
In[4]: names.loc['five'] = ['Zach', 3]
names
Out[4]:
# A dictionary can also be used to assign a new row
In[5]: names.loc[len(names)] = {'Name':'Zayd', 'Age':2}
names
Out[5]:
In[6]: names
Out[6]:
# Dictionary keys need not follow the column order
In[7]: names.loc[len(names)] = pd.Series({'Age':32, 'Name':'Dean'})
names
Out[7]:
# Appending a dictionary directly raises an error
In[8]: names = pd.read_csv('data/names.csv')
names.append({'Name':'Aria', 'Age':1})
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-8-562aecc73587> in <module>()
      1 # Use append with fresh copy of names
      2 names = pd.read_csv('data/names.csv')
----> 3 names.append({'Name':'Aria', 'Age':1})

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in append(self, other, ignore_index, verify_integrity)
   4515             other = Series(other)
   4516             if other.name is None and not ignore_index:
-> 4517                 raise TypeError('Can only append a Series if ignore_index=True'
   4518                                 ' or if the Series has a name')
   4519

TypeError: Can only append a Series if ignore_index=True or if the Series has a name
# As the error message suggests, pass ignore_index=True
In[9]: names.append({'Name':'Aria', 'Age':1}, ignore_index=True)
Out[9]:
# Set the row index
In[10]: names.index = ['Canada', 'Canada', 'USA', 'USA']
names
Out[10]:
# Append a row; note that ignore_index=True discards the index labels just set
In[11]: names.append({'Name':'Aria', 'Age':1}, ignore_index=True)
Out[11]:
# Create a Series object, using the next row position as its name
In[12]: s = pd.Series({'Name': 'Zach', 'Age': 3}, name=len(names))
s
Out[12]:
Age        3
Name    Zach
Name: 4, dtype: object
# append can join a DataFrame and a Series
In[13]: names.append(s)
Out[13]:
# append can add multiple rows at once when the objects are passed in a list
In[14]: s1 = pd.Series({'Name': 'Zach', 'Age': 3}, name=len(names))
s2 = pd.Series({'Name': 'Zayd', 'Age': 2}, name='USA')
names.append([s1, s2])
Out[14]:
# Read the baseball16 dataset
In[15]: bball_16 = pd.read_csv('data/baseball16.csv')
bball_16.head()
Out[15]:
# Select one row and convert it to a dictionary
In[16]: data_dict = bball_16.iloc[0].to_dict()
print(data_dict)
{'playerID': 'altuvjo01', 'yearID': 2016, 'stint': 1, 'teamID': 'HOU', 'lgID': 'AL', 'G': 161, 'AB': 640, 'R': 108, 'H': 216, '2B': 42, '3B': 5, 'HR': 24, 'RBI': 96.0, 'SB': 30.0, 'CS': 10.0, 'BB': 60, 'SO': 70.0, 'IBB': 11.0, 'HBP': 7.0, 'SH': 3.0, 'SF': 7.0, 'GIDP': 15.0}
# Turn the dictionary into a template: an empty string if the value is a string, otherwise a missing value
In[17]: new_data_dict = {k: '' if isinstance(v, str) else np.nan for k, v in data_dict.items()}
print(new_data_dict)
{'playerID': '', 'yearID': nan, 'stint': nan, 'teamID': '', 'lgID': '', 'G': nan, 'AB': nan, 'R': nan, 'H': nan, '2B': nan, '3B': nan, 'HR': nan, 'RBI': nan, 'SB': nan, 'CS': nan, 'BB': nan, 'SO': nan, 'IBB': nan, 'HBP': nan, 'SH': nan, 'SF': nan, 'GIDP': nan}
More
# Appending a single row to a DataFrame is expensive, so avoid doing it in a loop. Below, one thousand rows of new data are created as a list of Series:
In[18]: random_data = []
for i in range(1000):
    d = dict()
    for k, v in data_dict.items():
        if isinstance(v, str):
            d[k] = np.random.choice(list('abcde'))
        else:
            d[k] = np.random.randint(10)
    random_data.append(pd.Series(d, name=i + len(bball_16)))
random_data[0].head()
Out[18]:
2B    2
3B    6
AB    8
BB    2
CS    0
Name: 16, dtype: object
# Timing the append operation above: 1,000 rows take over 5 seconds
In[19]: %%timeit
bball_16_copy = bball_16.copy()
for row in random_data:
    bball_16_copy = bball_16_copy.append(row)
5.36 s ± 298 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# Passing the whole list to append instead saves a great deal of time
In[20]: %%timeit
bball_16_copy = bball_16.copy()
bball_16_copy = bball_16_copy.append(random_data)
86.2 ms ± 3.71 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
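Note that DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0. Below is a minimal sketch of the modern pd.concat equivalent of the bulk append above, assuming the bball_16 and random_data objects already defined:

# Sketch: pd.concat replaces the removed DataFrame.append.
# pd.DataFrame(random_data) turns the list of Series into rows,
# using each Series' name as its row label
new_rows = pd.DataFrame(random_data)
bball_16_copy = pd.concat([bball_16, new_rows])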
2. Concatenating multiple DataFrames
# Read the stocks_2016 and stocks_2017 datasets, using Symbol as the row index
In[21]: stocks_2016 = pd.read_csv('data/stocks_2016.csv', index_col='Symbol')
stocks_2017 = pd.read_csv('data/stocks_2017.csv', index_col='Symbol')
In[22]: stocks_2016
Out[22]:
In[23]: stocks_2017
Out[23]:
# Put the two DataFrames in a list and combine them with pandas' concat function
In[24]: s_list = [stocks_2016, stocks_2017]
pd.concat(s_list)
Out[24]:
# The keys parameter labels each DataFrame; the labels become the outermost level of the resulting MultiIndex. The names parameter names each index level
In[25]: pd.concat(s_list, keys=['2016', '2017'], names=['Year', 'Symbol'])
Out[25]:
# Concatenation can also be horizontal: set the axis parameter to 'columns' or 1
In[26]: pd.concat(s_list, keys=['2016', '2017'], axis='columns', names=['Year', None])
Out[26]:
# concat defaults to an outer join, keeping all rows of every DataFrame. Setting the join parameter gives an inner join:
In[27]: pd.concat(s_list, join='inner', keys=['2016', '2017'],
                  axis='columns', names=['Year', None])
Out[27]:
More
# append is a heavily simplified version of concat; internally it simply calls concat. The second example of this section, pd.concat(s_list), can also be written as:
In[28]: stocks_2016.append(stocks_2017)
Out[28]:
# The following code cells are not in the original book
In[29]: stocks_2015 = stocks_2016.copy()
In[30]: stocks_2017
Out[30]:
3. Comparing Trump's and Obama's approval ratings
# pandas' read_html function can scrape tabular data from a web page
In[31]: base_url = 'http://www.presidency.ucsb.edu/data/popularity.php?pres={}'
trump_url = base_url.format(45)
df_list = pd.read_html(trump_url)
len(df_list)
Out[31]: 14
# 14 tables were returned as DataFrames; take the first one
In[32]: df0 = df_list[0]
df0.shape
Out[32]: (324, 1906)
In[33]: df0.head(7)
Out[33]:
# Use the match parameter to select only tables containing a given string
In[34]: df_list = pd.read_html(trump_url, match='Start Date')
len(df_list)
Out[34]: 3
# After inspecting the page's element attributes, narrow the match further with the attrs parameter
In[35]: df_list = pd.read_html(trump_url, match='Start Date', attrs={'align':'center'})
len(df_list)
Out[35]: 1
# Check the shape of the DataFrame
In[36]: trump = df_list[0]
trump.shape
Out[36]: (265, 19)
In[37]: trump.head(8)
Out[37]:
# skiprows skips the given rows, header sets which row supplies the column names, and parse_dates parses the start and end dates
In[38]: df_list = pd.read_html(trump_url, match='Start Date',
                               attrs={'align':'center'},
                               header=0, skiprows=[0,1,2,3,5],
                               parse_dates=['Start Date', 'End Date'])
trump = df_list[0]
trump.head()
Out[38]:
# Drop the columns in which every value is missing
In[39]: trump = trump.dropna(axis=1, how='all')
trump.head()
Out[39]:
# Count the missing values in each column
In[40]: trump.isnull().sum()
Out[40]:
President         258
Start Date          0
End Date            0
Approving           0
Disapproving        0
unsure/no data      0
dtype: int64
# Forward-fill the missing values
In[41]: trump = trump.ffill()
trump.head()
Out[41]:
# Confirm the data types
In[42]: trump.dtypes
Out[42]:
President                 object
Start Date        datetime64[ns]
End Date          datetime64[ns]
Approving                  int64
Disapproving               int64
unsure/no data             int64
dtype: object
# Wrap the preceding steps in a function that fetches the data for any president
In[43]: def get_pres_appr(pres_num):
    base_url = 'http://www.presidency.ucsb.edu/data/popularity.php?pres={}'
    pres_url = base_url.format(pres_num)
    df_list = pd.read_html(pres_url, match='Start Date',
                           attrs={'align':'center'},
                           header=0, skiprows=[0,1,2,3,5],
                           parse_dates=['Start Date', 'End Date'])
    pres = df_list[0].copy()
    pres = pres.dropna(axis=1, how='all')
    pres['President'] = pres['President'].ffill()
    return pres.sort_values('End Date').reset_index(drop=True)
# The argument is the president's number; Obama is 44
In[44]: obama = get_pres_appr(44)
obama.head()
Out[44]:
# Fetch the data for the five most recent presidents and show the first three rows for each
In[45]: pres_41_45 = pd.concat([get_pres_appr(x) for x in range(41,46)],
                               ignore_index=True)
pres_41_45.groupby('President').head(3)
Out[45]:
# Check whether any end date has more than one approval rating
In[46]: pres_41_45['End Date'].value_counts().head(8)
Out[46]:
1990-03-11    2
1990-08-12    2
1990-08-26    2
2013-10-10    2
1999-02-09    2
1992-11-22    2
1990-05-22    2
2005-01-05    1
Name: End Date, dtype: int64
# Drop the duplicates
In[47]: pres_41_45 = pres_41_45.drop_duplicates(subset='End Date')
In[48]: pres_41_45.shape
Out[48]: (3695, 6)
# Some simple statistics on the data
In[49]: pres_41_45['President'].value_counts()
Out[49]:
Barack Obama          2786
George W. Bush         270
Donald J. Trump        259
William J. Clinton     227
George Bush            153
Name: President, dtype: int64
In[50]: pres_41_45.groupby('President', sort=False).median().round(1)
Out[50]:
# Plot each president's approval rating over time
In[51]: from matplotlib import cm
fig, ax = plt.subplots(figsize=(16,6))
styles = ['-.', '-', ':', '-', ':']
colors = [.9, .3, .7, .3, .9]
groups = pres_41_45.groupby('President', sort=False)
for style, color, (pres, df) in zip(styles, colors, groups):
    df.plot('End Date', 'Approving', ax=ax, label=pres, style=style,
            color=cm.Greys(color), title='Presidential Approval Rating')
# The plot above strings the presidencies end to end; the ratings can also be plotted against days in office
In[52]: days_func = lambda x: x - x.iloc[0]
pres_41_45['Days in Office'] = pres_41_45.groupby('President') \
                                         ['End Date'] \
                                         .transform(days_func)
# Equivalent one-liner
In[82]: pres_41_45['Days in Office'] = pres_41_45.groupby('President')['End Date'] \
                                                 .transform(lambda x: x - x.iloc[0])
pres_41_45.groupby('President').head(3)
Out[82]:
# Check the data types
In[83]: pres_41_45.dtypes
Out[83]:
President                  object
Start Date         datetime64[ns]
End Date           datetime64[ns]
Approving                   int64
Disapproving                int64
unsure/no data              int64
Days in Office    timedelta64[ns]
dtype: object
# Days in Office has type timedelta64[ns], with nanosecond precision; convert it to an integer number of days
In[86]: pres_41_45['Days in Office'] = pres_41_45['Days in Office'].dt.days
pres_41_45['Days in Office'].head()
Out[86]:
0     0
1    32
2    35
3    43
4    46
Name: Days in Office, dtype: int64
# Pivot the data so that each president's approval ratings form their own column
In[87]: pres_pivot = pres_41_45.pivot(index='Days in Office',
                                      columns='President',
                                      values='Approving')
pres_pivot.head()
Out[87]:
# Plot only Trump's and Obama's ratings
In[88]: plot_kwargs = dict(figsize=(16,6), color=cm.gray([.3, .7]),
                           style=['-', '--'], title='Approval Rating')
pres_pivot.loc[:250, ['Donald J. Trump', 'Barack Obama']].ffill().plot(**plot_kwargs)
Out[88]: <matplotlib.axes._subplots.AxesSubplot at 0x1152254a8>
More
# A rolling average smooths the curves; here a 90-day window is used, and the on parameter indicates the column from which the rolling window is computed
In[89]: pres_rm = pres_41_45.groupby('President', sort=False) \
                            .rolling('90D', on='End Date')['Approving'] \
                            .mean()
pres_rm.head()
Out[89]:
President    End Date
George Bush  1989-01-26    51.000000
             1989-02-27    55.500000
             1989-03-02    57.666667
             1989-03-10    58.750000
             1989-03-13    58.200000
Name: Approving, dtype: float64
# Adjust the rows and columns, then plot
In[90]: styles = ['-.', '-', ':', '-', ':']
colors = [.9, .3, .7, .3, .9]
color = cm.Greys(colors)
title = '90 Day Approval Rating Rolling Average'
plot_kwargs = dict(figsize=(16,6), style=styles, color=color, title=title)
correct_col_order = pres_41_45.President.unique()
pres_rm.unstack('President')[correct_col_order].plot(**plot_kwargs)
Out[90]: <matplotlib.axes._subplots.AxesSubplot at 0x1162d0780>
4. Differences between concat, join, and merge
concat:
- A pandas function
- Combines two or more pandas objects vertically or horizontally
- Aligns only on the index
- Errors whenever duplicate values appear in the index
- Defaults to an outer join (an inner join is also available)
join:
- A DataFrame method
- Combines two or more pandas objects horizontally only
- Aligns the calling DataFrame's column(s) or index with the other objects' index (never their columns)
- Handles duplicate values on the joining columns/index by producing a Cartesian product
- Defaults to a left join (inner, outer, and right joins are also available)
merge:
- A DataFrame method
- Combines exactly two DataFrames horizontally
- Aligns the calling DataFrame's column(s) or index with the other DataFrame's column(s) or index
- Handles duplicate values on the joining columns/index by producing a Cartesian product (see the sketch after this list)
- Defaults to an inner join (left, outer, and right joins are also available)
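A minimal sketch of the Cartesian-product behavior described above, using hypothetical toy DataFrames rather than the book's datasets:

# 'a' appears twice on each side, so merge pairs every combination:
# 2 x 2 = 4 rows for 'a'; 'b' has no match and is dropped by the
# default inner join
left = pd.DataFrame({'key': ['a', 'a', 'b'], 'x': [1, 2, 3]})
right = pd.DataFrame({'key': ['a', 'a'], 'y': [10, 20]})
left.merge(right, on='key')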
# A user-defined display_frames function that takes a list of DataFrames and displays them on a single row:
In[91]: from IPython.display import display_html
years = 2016, 2017, 2018
stock_tables = [pd.read_csv('data/stocks_{}.csv'.format(year), index_col='Symbol')
                for year in years]

def display_frames(frames, num_spaces=0):
    t_style = '<table style="display: inline;"'
    tables_html = [df.to_html().replace('<table', t_style) for df in frames]
    space = '&nbsp;' * num_spaces
    display_html(space.join(tables_html), raw=True)

display_frames(stock_tables, 30)
stocks_2016, stocks_2017, stocks_2018 = stock_tables
# concat is the only one of the three that can combine DataFrames vertically
In[92]: pd.concat(stock_tables, keys=[2016, 2017, 2018])
Out[92]:
# concat can also combine DataFrames horizontally
In[93]: pd.concat(dict(zip(years, stock_tables)), axis='columns')
Out[93]:
# Combine the DataFrames with join; when column names overlap, lsuffix or rsuffix must be set to tell them apart
In[94]: stocks_2016.join(stocks_2017, lsuffix='_2016', rsuffix='_2017', how='outer')
Out[94]:
In[95]: stocks_2016
Out[95]:
# To reproduce the earlier concat result, pass a list of DataFrames to join
In[96]: other = [stocks_2017.add_suffix('_2017'), stocks_2018.add_suffix('_2018')]
stocks_2016.add_suffix('_2016').join(other, how='outer')
Out[96]:
# Verify that the two approaches give the same result
In[97]: stock_join = stocks_2016.add_suffix('_2016').join(other, how='outer')
stock_concat = pd.concat(dict(zip(years, stock_tables)), axis='columns')
In[98]: stock_concat.columns = stock_concat.columns.get_level_values(1) + '_' + \
                               stock_concat.columns.get_level_values(0).astype(str)
In[99]: stock_concat
Out[99]:
# Reproduce the same result with merge, chaining two outer merges on the index
In[100]: step1 = stocks_2016.merge(stocks_2017, left_index=True, right_index=True,
                                   how='outer', suffixes=('_2016', '_2017'))
stock_merge = step1.merge(stocks_2018.add_suffix('_2018'),
                          left_index=True, right_index=True, how='outer')
stock_concat.equals(stock_merge)
Out[100]: True
# Look at the small food_prices and food_transactions datasets
In[101]: names = ['prices', 'transactions']
food_tables = [pd.read_csv('data/food_{}.csv'.format(name)) for name in names]
food_prices, food_transactions = food_tables
display_frames(food_tables, 30)
# Merge food_transactions and food_prices on the item and store keys
In[102]: food_transactions.merge(food_prices, on=['item', 'store'])
Out[102]:
# Because steak appears twice in each table, the merge produces a Cartesian product, yielding four steak rows; because coconut has no matching price, it is missing from the result
# Below, only the 2017 prices are merged
In[103]: food_transactions.merge(food_prices.query('Date == 2017'), how='left')
Out[103]:
# To reproduce this with join, the joining columns of food_prices must first be moved into its row index
In[104]: food_prices_join = food_prices.query('Date == 2017').set_index(['item', 'store'])
food_prices_join
Out[104]:
# join aligns only on the row index of the passed DataFrame, but it can align on either the index or the columns of the calling DataFrame;
# to align on columns, pass them to the on parameter
In[105]: food_transactions.join(food_prices_join, on=['item', 'store'])
Out[105]:
# To use concat, move the item and store columns into the row index of both DataFrames. However, the duplicated index values cause an error
In[106]: pd.concat([food_transactions.set_index(['item', 'store']),
                    food_prices.set_index(['item', 'store'])], axis='columns')
---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
<ipython-input-106-8aa3223bf3d1> in <module>()
      1 pd.concat([food_transactions.set_index(['item', 'store']),
----> 2            food_prices.set_index(['item', 'store'])], axis='columns')

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/reshape/concat.py in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, copy)
    205                        verify_integrity=verify_integrity,
    206                        copy=copy)
--> 207     return op.get_result()
    208
    209

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/reshape/concat.py in get_result(self)
    399                 obj_labels = mgr.axes[ax]
    400                 if not new_labels.equals(obj_labels):
--> 401                     indexers[ax] = obj_labels.reindex(new_labels)[1]
    402
    403             mgrs_indexers.append((obj._data, indexers))

/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexes/multi.py in reindex(self, target, method, level, limit, tolerance)
   1861                                                   tolerance=tolerance)
   1862             else:
-> 1863                 raise Exception("cannot handle a non-unique multi-index!")
   1864
   1865         if not isinstance(target, MultiIndex):

Exception: cannot handle a non-unique multi-index!
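As a small hedged check (not in the original), the index's is_unique attribute shows why concat fails here: alignment along axis='columns' requires unique indexes.

# Inspect uniqueness of the MultiIndexes; concat cannot align
# non-unique indexes along axis='columns', so at least one of
# these is expected to be False
food_transactions.set_index(['item', 'store']).index.is_unique
food_prices.set_index(['item', 'store']).index.is_unique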
# The glob function of the glob module iterates over the files in a folder, producing a list of filename strings that can be passed directly to read_csv
In[107]: import glob
df_list = []
for filename in glob.glob('data/gas prices/*.csv'):
    df_list.append(pd.read_csv(filename, index_col='Week', parse_dates=['Week']))
gas = pd.concat(df_list, axis='columns')
gas.head()
Out[107]:
5. Connecting to SQL databases
# Before reading the chinook database, create a SQLAlchemy engine
In[108]: from sqlalchemy import create_engine
engine = create_engine('sqlite:///data/chinook.db')
In[109]: tracks = pd.read_sql_table('tracks', engine)
tracks.head()
Out[109]:
# read_sql_table reads an entire table: the first argument is the table name, the second is the engine
In[110]: genres = pd.read_sql_table('genres', engine)
genres.head()
Out[110]:
# Find the average song length of each genre
In[111]: genre_track = genres.merge(tracks[['GenreId', 'Milliseconds']],
                                    on='GenreId', how='left') \
                             .drop('GenreId', axis='columns')
genre_track.head()
Out[111]:
# Convert the Milliseconds column to the timedelta data type
In[112]: genre_time = genre_track.groupby('Name')['Milliseconds'].mean()
pd.to_timedelta(genre_time, unit='ms').dt.floor('s').sort_values()
Out[112]:
Name
Rock And Roll        00:02:14
Opera                00:02:54
Hip Hop/Rap          00:02:58
Easy Listening       00:03:09
Bossa Nova           00:03:39
R&B/Soul             00:03:40
World                00:03:44
Pop                  00:03:49
Latin                00:03:52
Alternative & Punk   00:03:54
Soundtrack           00:04:04
Reggae               00:04:07
Alternative          00:04:24
Blues                00:04:30
Rock                 00:04:43
Jazz                 00:04:51
Classical            00:04:53
Heavy Metal          00:04:57
Electronica/Dance    00:05:02
Metal                00:05:09
Comedy               00:26:25
TV Shows             00:35:45
Drama                00:42:55
Science Fiction      00:43:45
Sci Fi & Fantasy     00:48:31
Name: Milliseconds, dtype: timedelta64[ns]
# Find the total amount each customer has spent
In[113]: cust = pd.read_sql_table('customers', engine,
                                  columns=['CustomerId', 'FirstName', 'LastName'])
invoice = pd.read_sql_table('invoices', engine,
                            columns=['InvoiceId', 'CustomerId'])
ii = pd.read_sql_table('invoice_items', engine,
                       columns=['InvoiceId', 'UnitPrice', 'Quantity'])
In[114]: cust_inv = cust.merge(invoice, on='CustomerId') \
                        .merge(ii, on='InvoiceId')
cust_inv.head()
Out[114]:
# Now multiply the quantity by the unit price to find each customer's total spend
In[115]: total = cust_inv['Quantity'] * cust_inv['UnitPrice']
cols = ['CustomerId', 'FirstName', 'LastName']
cust_inv.assign(Total=total).groupby(cols)['Total'] \
        .sum() \
        .sort_values(ascending=False).head()
Out[115]:
More
# read_sql_query runs a SQL query directly
In[116]: pd.read_sql_query('select * from tracks limit 5', engine)
Out[116]:
# A long SQL string can also be passed to read_sql_query
In[117]: sql_string1 = '''
select
    Name,
    time(avg(Milliseconds) / 1000, 'unixepoch') as avg_time
from (
    select
        g.Name,
        t.Milliseconds
    from
        genres as g
    join
        tracks as t on
        g.genreid == t.genreid
)
group by Name
order by avg_time
'''
pd.read_sql_query(sql_string1, engine)
Out[117]:
In[118]: sql_string2 = '''
select
    c.customerid,
    c.FirstName,
    c.LastName,
    sum(ii.quantity * ii.unitprice) as Total
from
    customers as c
join
    invoices as i on
    c.customerid = i.customerid
join
    invoice_items as ii on
    i.invoiceid = ii.invoiceid
group by
    c.customerid, c.FirstName, c.LastName
order by
    Total desc
'''
pd.read_sql_query(sql_string2, engine)
Out[118]: