groupby 選擇列可以放在幾個不同的位置,寫法等價,但效率是不同的(先選列再聚合可避免對無關列做計算):
# Four placements of the column selection around groupby. Results agree
# (the 3rd/4th forms also carry the key column), but selecting the column(s)
# before aggregating avoids summing columns that are thrown away afterwards.
p_prs_23.groupby('NAT_AN')['is_failed'].sum()
p_prs_23[['NAT_AN','is_failed']].groupby('NAT_AN').sum()
p_prs_23.groupby('NAT_AN')[['NAT_AN','is_failed']].sum()
p_prs_23.groupby('NAT_AN').sum().reset_index()[['NAT_AN','is_failed']]
以下為第 9 章(分組運算與聚合)代碼:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
# p265
# Demo frame: two string key columns, two random numeric columns.
df=pd.DataFrame({'key1':['a','a','b','b','a'],
'key2':['one','two','one','two','one'],
'data1':np.random.randn(5),
'data2':np.random.randn(5)})
# Group one Series by another Series' values.
grouped = df['data1'].groupby(df['key1'])
grouped.mean()
# Two keys -> hierarchically indexed result; unstack pivots it to a table.
means = df['data1'].groupby([df['key1'],df['key2']]).mean()
means.unstack()
# Group keys can be any equal-length arrays, not just columns of df.
states = np.array(['ohio','california','california','ohio','ohio'])
years = np.array([2005,2005,2006,2005,2006])
df['data1'].groupby([states,years]).mean()
# Passing column names groups the whole DataFrame (non-numeric cols dropped).
df.groupby('key1').mean()
df.groupby(['key1','key2']).mean()
df.groupby(['key1','key2']).size()
# A GroupBy is iterable, yielding (key, sub-frame) pairs.
# NOTE: Python 2 print statements; indentation was lost in this transcript.
for name,group in df.groupby('key1'):
print name
print group
for (k1,k2),group in df.groupby(['key1','key2']):
print k1,k2
print group
# Materialize the groups into a dict: key -> sub-frame.
pieces = dict(list(df.groupby('key1')))
pieces['b']
df.dtypes
# Group the COLUMNS (axis=1) by their dtype.
grouped = df.groupby(df.dtypes,axis=1)
dict(list(grouped))
# Equivalent ways to select a column before/after grouping;
# single brackets give a Series, double brackets a one-column DataFrame.
df.groupby('key1')['data1'].mean()
df['data1'].groupby(df['key1']).mean()
df.groupby('key1')[['data1']].mean()
df[['data1']].groupby(df['key1']).mean()
df.groupby(['key1','key2'])[['data1']].mean()
s_group = df.groupby(['key1','key2'])['data2']
s_group.mean()
# Grouping with a dict / Series (column -> group label); unused keys ('f') are fine.
people=pd.DataFrame(np.random.randn(5,5),
columns=['a','b','c','d','e'],
index=['joe','steve','wes','jim','travis'])
people.ix[2:3,['b','c']]=np.nan
mapping = {'a':'red','b':'red','c':'blue','d':'blue','e':'red','f':'orange'}
by_column = people.groupby(mapping,axis=1)
by_column.sum()
map_series=pd.Series(mapping)
people.groupby(map_series,axis=1).count()
# Grouping with a function applied to each index label (here: name length).
people.groupby(len).sum()
key_list=['one','one','one','two','two']
people.groupby([len,key_list]).min()
# Grouping by one level of a hierarchical column index.
columns = pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],[1,3,5,1,3]],names=['cty','tensor'])
hief_df = pd.DataFrame(np.random.randn(4,5), columns=columns)
hief_df.groupby(level='cty',axis=1).count()
grouped=df.groupby('key1')
# quantile(0) / quantile(1) are the per-group min / max.
grouped['data1'].quantile(0.9)
grouped['data1'].quantile(0)
grouped['data1'].quantile(1)
def peak_to_peak(arr):
    """Spread of the values: maximum minus minimum."""
    hi = arr.max()
    lo = arr.min()
    return hi - lo
grouped.agg(peak_to_peak)
grouped.describe()
# Tipping data: add a tip-percentage column, then analyze by sex/smoker.
tips = pd.read_csv('g:/tips.csv')
tips['tip_pct']=tips['tip']/tips['total_bill']
tips[:6]
grouped = tips.groupby(['sex','smoker'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')
# agg accepts function names, callables, and (output_name, func) tuples.
grouped_pct.agg(['mean','std',peak_to_peak])
grouped_pct.agg([('foo','mean'),('bar',np.std)])
functions=['count','mean','max']
result = grouped['tip_pct','total_bill'].agg(functions)
result = grouped[['tip_pct','total_bill']].agg(functions) # same as the previous line
result['tip_pct']
ftuples = [('durchschnitt','mean'),('abweichung',np.var)]
grouped['tip_pct','total_bill'].agg(ftuples)
# Different functions per column via a dict.
grouped.agg({'tip':np.max,'size':'sum'})
grouped.agg({'tip_pct':['min','max','mean','std'],'size':'sum'})
# Two ways to get the group keys back as ordinary columns.
tips.groupby(['sex','smoker'], as_index=False).mean()
tips.groupby(['sex','smoker']).mean().reset_index()
# Broadcast per-group means back onto the original rows via merge.
k1_means = df.groupby('key1').mean().add_prefix('mean_')
pd.merge(df,k1_means,left_on='key1',right_index=True)
key = ['one','two','one','two','one']
people.groupby(key).mean()
people.groupby(key).transform(np.mean)
people.groupby(key).transform().mean() # × transform requires a function argument
def demean(arr):
    """Center the values on zero by subtracting their mean."""
    center = arr.mean()
    return arr - center
# transform(demean) subtracts each group's own mean, so the re-grouped means are ~0.
demeaned = people.groupby(key).transform(demean)
demeaned.groupby(key).mean()
def top(df, n=5, column='tip_pct'):
    """Return the n rows of df with the largest values in `column`.

    Sorts ascending (pandas default) and takes the last n rows, so the
    result is ordered ascending with the largest value in the final row.
    """
    # sort_values(by=...) replaces the removed sort_index(by=...) form.
    return df.sort_values(by=column)[-n:]
top(tips, n=6)
# apply runs top on each group and concatenates the pieces.
tips.groupby('smoker').apply(top)
tips.groupby(['smoker','day']).apply(top,n=1,column='total_bill') # ×
result=tips.groupby('smoker')['tip_pct'].describe()
result.unstack('smoker')
f=lambda x:x.describe()
grouped.apply(f)
# group_keys=False omits the group key level from the result index.
tips.groupby('smoker',group_keys=False).apply(top)
# Bucket analysis: cut data1 into 4 equal-width intervals.
frame=pd.DataFrame({'data1':np.random.randn(1000),'data2':np.random.randn(1000)})
factor=pd.cut(frame.data1,4)
factor[:10]
def get_stats(group):
    """Summary dict (min/max/count/mean) for one group."""
    return {
        'min': group.min(),
        'max': group.max(),
        'count': group.count(),
        'mean': group.mean(),
    }
# Group data2 by the data1 bins: one stats row per bucket.
grouped=frame.data2.groupby(factor)
grouped.apply(get_stats).unstack()
# qcut with labels=False gives equal-size bucket numbers instead of intervals.
grouping = pd.qcut(frame.data1,10,labels=False)
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()
# Filling missing values: overall mean vs per-group mean.
s=pd.Series(np.random.randn(6))
s[::2]=np.nan
s.fillna(s.mean())
states=['ohio','new york','vermont','florida','oregon','nevada','california','idaho']
group_key=['east']*4+['west']*4
data=pd.Series(np.random.randn(8),index=states)
data[['vermont','nevada','idaho']]=np.nan
data.groupby(group_key).mean()
fill_mean = lambda g:g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)
def xx(g):
    """Fill NAs in the group with the group's own mean (named form of fill_mean)."""
    group_mean = g.mean()
    return g.fillna(group_mean)
data.groupby(group_key).apply(xx)
# Fill each region's NAs with a preset value, looked up via the group's name.
fill_values={'east':0.5,'west':-1}
fill_func=lambda g:g.fillna(fill_values[g.name])
data.groupby(group_key).apply(fill_func)
def yy(g):
    """Fill NAs with the preset region value keyed by the group's name (named form of fill_func)."""
    fill = fill_values[g.name]
    return g.fillna(fill)
data.groupby(group_key).apply(yy)
# Build a 52-card deck as a Series: index = card names, values = card values.
# NOTE: range(...) + list concatenation is Python 2 (range returns a list there).
suits=['h','s','c','d']
card_val=(range(1,11)+[10]*3)*4
base_names=['a']+range(2,11)+['j','q','k']
cards=[]
for suit in suits:
cards.extend(str(num)+suit for num in base_names)
# Wrong rewrite of the loop above: extend over a string adds single characters.
cards=[]
for suit in suits:
for num in base_names:
cards.extend(str(num)+suit) # ×
deck=pd.Series(card_val,index=cards)
def draw(deck, n=5):
    """Draw n cards at random (without replacement) from the deck Series."""
    shuffled_positions = np.random.permutation(len(deck))
    return deck.take(shuffled_positions[:n])
draw(deck)
# Group by suit (last character of the card name) and draw 2 cards per suit.
get_suit=lambda card: card[-1]
deck.groupby(get_suit).apply(draw,n=2)
deck.groupby(get_suit, group_keys=False).apply(draw,n=2)
# Group-wise weighted average.
df=pd.DataFrame({'category':['a']*4+['b']*4,'data':np.random.randn(8),'weights':np.random.randn(8)})
grouped=df.groupby('category')
get_wavg=lambda g: np.average(g['data'],weights=g['weights'])
grouped.apply(get_wavg)
# Yearly correlation of each stock's returns with SPX.
close_px = pd.read_csv('g:/stock_px.csv',parse_dates=True,index_col=0)
close_px[-4:]
rets = close_px.pct_change().dropna()
spx_corr = lambda x: x.corrwith(x['SPX'])
by_year = rets.groupby(lambda x: x.year) # a function key is applied to each index label
by_year.apply(spx_corr)
by_year = rets.groupby(rets.index.year) # works 2.
by_year.apply(spx_corr)
by_year.apply(corrwith(rets.SPX)) # ×
by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))
by_year.apply(rets['AAPL'].corr(rets['MSFT'])) # × apply needs a callable, not a value
import statsmodels.api as sm
def regress(data, yvar, xvars):
    """OLS-regress data[yvar] on data[xvars] plus an intercept.

    Returns the fitted parameters (a Series indexed by xvars + 'intercept').
    """
    y = data[yvar]
    # Copy so adding the intercept column does not write into a view of the
    # caller's frame (the original triggered SettingWithCopyWarning).
    x = data[xvars].copy()
    x['intercept'] = 1.
    result = sm.OLS(y, x).fit()
    return result.params
by_year.apply(regress,'AAPL',['SPX'])
# pivot_table defaults to mean aggregation - equivalent to groupby + mean.
tips.pivot_table(index=['sex','smoker'])
tips.groupby(['sex','smoker']).mean()
tips.pivot_table(['tip_pct','size'],index=['sex','day'],columns='smoker')
tips.pivot_table(['tip_pct','size'],index=['sex','day'],columns='smoker',margins=True)
tips.pivot_table('tip_pct',index=['sex','smoker'],columns='day',aggfunc='count',margins=True)
tips.pivot_table('size',index=['time','sex','smoker'],columns='day',aggfunc=sum,fill_value=0)
# crosstab: frequency table of two (or more) factors.
data=pd.DataFrame({'sample':[1,2,3,4,5,6,7,8,9,10],'gender':['f','m','f','m','m','m','f','f','m','f'],
'handedness':['r','l','r','r','l','r','r','l','r','r']})
pd.crosstab(data.gender,data.handedness,margins=True)
pd.crosstab([tips.time,tips.day],tips.smoker,margins=True)
# FEC campaign-contribution data set.
fec=pd.read_csv('g:/p_all.csv')
fec.ix[123456]
unique_cands=fec.cand_nm.unique()
unique_cands[2]
# Map candidate names to party labels.
parties={'Bachmann, Michelle':'r','Romney, Mitt':'r','Obama, Barack':'d'}
fec.cand_nm[123456:123461]
fec.cand_nm[123456:123461].map(parties)
fec['party']=fec.cand_nm.map(parties)
fec.party.value_counts()
fec.party.counts() # ×
# Keep only positive contributions, then only the two main candidates.
(fec.contb_receipt_amt>0).value_counts()
fec=fec[fec.contb_receipt_amt>0]
fec_mrbo=fec[fec.cand_nm.isin(['Obama, Barack','Romney, Mitt'])]
fec_mrbo=fec[fec.cand_nm.isin(('Obama, Barack','Romney, Mitt'))]
fec_mrbo=fec[fec.cand_nm in ['Obama, Barack','Romney, Mitt']] # ×
fec_mrbo=fec[fec.cand_nm.str.isin('Obama, Barack','Romney, Mitt')] # ×
fec_mrbo=fec[fec.cand_nm.str in('Obama, Barack','Romney, Mitt')] # ×
fec.contbr_occupation.value_counts()[:10]
# Normalize free-text occupation values; unmapped values pass through via dict.get.
occ_mapping={
'INFORMATION REQUESTED PER BEST EFFORTS':'not provided',
'INFORMATION REQUESTED':'not provided',
'INFORMATION REQUESTED (BEST EFFORTS)':'not provided',
'C.E.O.':'CEO'
}
f=lambda x: occ_mapping.get(x,x)
fec.contbr_occupation=fec.contbr_occupation.map(f)
# Same normalization for employer values.
emp_mapping={
'INFORMATION REQUESTED PER BEST EFFORTS':'not provided',
'INFORMATION REQUESTED':'not provided',
'SELF':'SELF-EMPLOYED',
'SELF EMPLOYED':'SELF-EMPLOYED',
}
f=lambda x: emp_mapping.get(x,x)
fec.contbr_employer=fec.contbr_employer.map(f)
# Contribution totals by occupation and party; plot occupations over $2M.
by_occupation=fec.pivot_table('contb_receipt_amt',index='contbr_occupation',columns='party',aggfunc='sum')
over_2mm=by_occupation[by_occupation.sum(1)>2000000]
over_2mm.plot(kind='barh')
def get_top_amounts(group, key, n=5):
    """Total contb_receipt_amt by `key` within `group`; return the n largest totals.

    The result is a Series sorted in descending order of total amount.
    """
    totals = group.groupby(key)['contb_receipt_amt'].sum()
    # Series.order() was removed from pandas; sort_values is the replacement.
    return totals.sort_values(ascending=False)[:n]
grouped=fec_mrbo.groupby('cand_nm')
grouped.apply(get_top_amounts,'contbr_occupation',n=7)
grouped.apply(get_top_amounts,'contbr_employer',n=7)
# Bucket contributions by size, then compare the two candidates per bucket.
bins=np.array([0,1,10,100,1000,10000,100000,1000000,10000000])
labels=pd.cut(fec_mrbo.contb_receipt_amt,bins)
grouped=fec_mrbo.groupby(['cand_nm',labels])
grouped.size().unstack(0)
bucket_sums=grouped.contb_receipt_amt.sum().unstack(0)
# Normalize each bucket row to sum to 1; drop the two biggest buckets for the plot.
normed_sum=bucket_sums.div(bucket_sums.sum(axis=1),axis=0)
normed_sum[:-2].plot(kind='barh',stacked=True)
# Totals by candidate and state, then each state's split between the two.
grouped=fec_mrbo.groupby(['cand_nm','contbr_st'])
totals=grouped.contb_receipt_amt.sum().unstack(0).fillna(0)
totals[:10]
percent=totals.div(totals.sum(axis=1),axis=0)
percent=totals.div(totals.sum(1),axis=0)
percent[:10]
from mpl_toolkits.basemap import Basemap,cm
from matplotlib import rcParams
from matplotlib.collections import LineCollection
import matplotlib.pyplot as plt
from shapelib import ShapeFile
import dbflib
# p255
# Haiti earthquake Ushahidi data: keep rows inside the lat/lon bounding box
# that have a non-null CATEGORY.
data=pd.read_csv('g:/haiti.csv')
data.head()
data.describe()
data=data[(data.LATITUDE>18)&(data.LATITUDE<20)&(data.LONGITUDE>-75)&(data.LONGITUDE<-70)&data.CATEGORY.notnull()]
def to_cat_list(catstr):
    """Split a comma-delimited category string into stripped, non-empty entries."""
    return [piece.strip() for piece in catstr.split(',') if piece.strip()]
def get_all_categories(cat_series):
    """Sorted union of every category entry appearing anywhere in cat_series."""
    per_row = (set(to_cat_list(entry)) for entry in cat_series)
    return sorted(set.union(*per_row))
def get_english(cat):
    """Split 'code. name' into (code, name); for bilingual 'fr | en' names keep the part after '|'."""
    code, names = cat.split('.')
    if '|' not in names:
        return code, names
    return code, names.split('|')[1]
all_cats=get_all_categories(data.CATEGORY)
# Dict of category code -> English name, e.g. english_mapping['2a'].
english_mapping=dict(get_english(x) for x in all_cats)
english_mapping['2a']
def get_code(seq):
    """Extract the code before the first '.' from each non-empty entry in seq."""
    codes = []
    for entry in seq:
        if entry:
            codes.append(entry.split('.')[0])
    return codes
all_codes=get_code(all_cats)
all_cats.str.split('.')[0] # × 'list' object has no attribute 'str'
# Indicator (dummy) matrix: one column per category code, 1 where the row has it.
code_index=pd.Index(np.unique(all_codes))
dummy_frame=pd.DataFrame(np.zeros((len(data),len(code_index))),index=data.index,columns=code_index)
dummy_frame.ix[:,:6]
# Set the matching code columns to 1 for each row (indentation lost in transcript).
for row, cat in zip(data.index, data.CATEGORY):
codes=get_code(to_cat_list(cat))
dummy_frame.ix[row,codes]=1
data=data.join(dummy_frame.add_prefix('category_'))
data.ix[:,10:15]
def basic_haiti_map(ax=None, lllat=17.25, urlat=20.25, lllon=-75, urlon=-71):
    """Create a stereographic Basemap over the Haiti bounding box with coast, state and country lines drawn."""
    bmap = Basemap(ax=ax, projection='stere',
                   lon_0=(urlon + lllon) / 2, lat_0=(urlat + lllat) / 2,
                   llcrnrlat=lllat, urcrnrlat=urlat,
                   llcrnrlon=lllon, urcrnrlon=urlon,
                   resolution='f')
    bmap.drawcoastlines()
    bmap.drawstates()
    bmap.drawcountries()
    return bmap
# Plot four category maps on a 2x2 grid of axes.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
fig.subplots_adjust(hspace=0.05, wspace=0.05)
to_plot = ['2a', '1', '3c', '7a']
lllat = 17.25; urlat = 20.25; lllon = -75; urlon = -71
for code, ax in zip(to_plot, axes.flat):
    m = basic_haiti_map(ax, lllat=lllat, urlat=urlat, lllon=lllon, urlon=urlon)
    cat_data = data[data['category_%s' % code] == 1]
    # Project each (lon, lat) pair one at a time and collect the map coords.
    add_v = pd.DataFrame()
    for i, j in zip(cat_data.LONGITUDE, cat_data.LATITUDE):
        x, y = m(i, j)
        add_v = add_v.append(pd.DataFrame([x, y]).T)
    m.plot(add_v[0], add_v[1], 'k.', alpha=0.5)
    # Bug fix: title should show this code's English name, not the whole dict.
    ax.set_title('%s:%s' % (code, english_mapping[code]))
# Repeat of the point-by-point projection fragment from above (transcript duplicate).
add_v=pd.DataFrame()
for i,j in zip(cat_data.LONGITUDE,cat_data.LATITUDE) :
x,y=m(i,j)
add_v=add_v.append(pd.DataFrame([x,y]).T)
m.plot(add_v[0],add_v[1],'k.',alpha=0.5)
# Simpler variant: project the whole coordinate Series in one Basemap call.
for code, ax in zip(to_plot,axes.flat):
m=basic_haiti_map(ax,lllat=lllat,urlat=urlat,lllon=lllon,urlon=urlon)
cat_data=data[data['category_%s' % code]==1]
x,y=m(cat_data.LONGITUDE,cat_data.LATITUDE)
m.plot(x,y,'k.',alpha=0.5)
# NOTE(review): this formats the entire dict into the title; likely should be english_mapping[code].
ax.set_title('%s:%s' % (code, english_mapping))