Python for Data Analysis, Chapter 9

Starting with merge (Chapter 7), you can say goodbye to MySQL's long-running joins on large tables; with .loc (DataFrame selection) and .str (vectorized string methods), there is no more need for MySQL UPDATE; and with groupby (Chapter 9), you can largely drop MySQL's slow SELECT as well. MySQL is then used only to read the raw data and write the final results, while every intermediate data-cleaning step happens in Python.
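
A minimal sketch of that division of labor, assuming a SQLAlchemy engine with pymysql; the connection string, the table names (raw_orders, raw_users, clean_orders) and the column names are all hypothetical:

import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://user:pwd@localhost/db')  # hypothetical DSN

# MySQL only supplies the raw tables...
orders = pd.read_sql('SELECT * FROM raw_orders', engine)
users  = pd.read_sql('SELECT * FROM raw_users', engine)

# ...while join / update / aggregate all happen in pandas
merged = pd.merge(orders, users, on='user_id', how='left')  # instead of a SQL JOIN
merged.loc[merged['amount'] < 0, 'amount'] = 0              # instead of UPDATE
merged['city'] = merged['city'].str.strip().str.upper()     # vectorized string cleanup
summary = merged.groupby('city')['amount'].sum()            # instead of SELECT ... GROUP BY

# write only the final result back
summary.reset_index().to_sql('clean_orders', engine, if_exists='replace', index=False)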

There are several places where you can select columns around a groupby, and their efficiency naturally differs: restricting to the needed columns before aggregating (the first two forms) avoids summing columns you don't care about, while aggregating the whole frame and selecting afterwards (the last form) does the most work. The four variants, followed by a timing sketch:

p_prs_23.groupby('NAT_AN')['is_failed'].sum()

p_prs_23[['NAT_AN','is_failed']].groupby('NAT_AN').sum()

p_prs_23.groupby('NAT_AN')[['NAT_AN','is_failed']].sum()

p_prs_23.groupby('NAT_AN').sum().reset_index()[['NAT_AN','is_failed']]
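
A quick way to measure the difference yourself; a minimal sketch where df is a synthetic stand-in for p_prs_23, and the relative timings will depend on your pandas version and data:

import timeit
import numpy as np
import pandas as pd

n = 1000000  # synthetic stand-in: one group key, the flag, plus extra numeric columns
df = pd.DataFrame({'NAT_AN': np.random.randint(0, 100, n),
                   'is_failed': np.random.randint(0, 2, n),
                   'extra1': np.random.randn(n),
                   'extra2': np.random.randn(n)})

stmts = ["df.groupby('NAT_AN')['is_failed'].sum()",
         "df[['NAT_AN','is_failed']].groupby('NAT_AN').sum()",
         "df.groupby('NAT_AN')[['NAT_AN','is_failed']].sum()",
         "df.groupby('NAT_AN').sum().reset_index()[['NAT_AN','is_failed']]"]
for stmt in stmts:
    # run as a script so 'from __main__ import df' can see the frame
    t = timeit.timeit(stmt, setup='from __main__ import df', number=10)
    print('%7.3fs  %s' % (t, stmt))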

Chapter 9 code:

# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np

# p265
df=pd.DataFrame({'key1':['a','a','b','b','a'],
                 'key2':['one','two','one','two','one'],
                 'data1':np.random.randn(5),
                 'data2':np.random.randn(5)})
grouped = df['data1'].groupby(df['key1'])
grouped.mean()
means = df['data1'].groupby([df['key1'],df['key2']]).mean()

means.unstack()
states = np.array(['ohio','california','california','ohio','ohio'])
years  = np.array([2005,2005,2006,2005,2006])
df['data1'].groupby([states,years]).mean()
df.groupby('key1').mean()
df.groupby(['key1','key2']).mean()
df.groupby(['key1','key2']).size()

for name, group in df.groupby('key1'):
    print(name)
    print(group)
for (k1, k2), group in df.groupby(['key1','key2']):
    print(k1, k2)
    print(group)

pieces = dict(list(df.groupby('key1')))
pieces['b']
df.dtypes
grouped = df.groupby(df.dtypes,axis=1)
dict(list(grouped))
df.groupby('key1')['data1'].mean()
df['data1'].groupby(df['key1']).mean()
df.groupby('key1')[['data1']].mean()
df[['data1']].groupby(df['key1']).mean()

df.groupby(['key1','key2'])[['data1']].mean()
s_group = df.groupby(['key1','key2'])['data2']
s_group.mean()
people=pd.DataFrame(np.random.randn(5,5),
                    columns=['a','b','c','d','e'],
                    index=['joe','steve','wes','jim','travis'])
people.ix[2:3,['b','c']]=np.nan # add a few NA values

# unused keys in the mapping (here 'f') are fine; they are simply ignored
mapping = {'a':'red','b':'red','c':'blue','d':'blue','e':'red','f':'orange'}
by_column = people.groupby(mapping,axis=1)
by_column.sum()
map_series=pd.Series(mapping)
people.groupby(map_series,axis=1).count()
people.groupby(len).sum()

key_list=['one','one','one','two','two']
people.groupby([len,key_list]).min()
columns = pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],[1,3,5,1,3]],names=['cty','tenor'])
hier_df = pd.DataFrame(np.random.randn(4,5), columns=columns)
hier_df.groupby(level='cty',axis=1).count()

grouped=df.groupby('key1')
grouped['data1'].quantile(0.9)
grouped['data1'].quantile(0)
grouped['data1'].quantile(1)
def peak_to_peak(arr):
    return arr.max()-arr.min()
grouped.agg(peak_to_peak)
grouped.describe()

tips = pd.read_csv('g:/tips.csv')
tips['tip_pct']=tips['tip']/tips['total_bill']

tips[:6]
grouped = tips.groupby(['sex','smoker'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')
grouped_pct.agg(['mean','std',peak_to_peak])

grouped_pct.agg([('foo','mean'),('bar',np.std)])
functions=['count','mean','max']
result = grouped['tip_pct','total_bill'].agg(functions)
result = grouped[['tip_pct','total_bill']].agg(functions) # same as the previous line
result['tip_pct']
ftuples = [('durchschnitt','mean'),('abweichung',np.var)]
grouped['tip_pct','total_bill'].agg(ftuples)

grouped.agg({'tip':np.max,'size':'sum'})
grouped.agg({'tip_pct':['min','max','mean','std'],'size':'sum'})
tips.groupby(['sex','smoker'], as_index=False).mean()
tips.groupby(['sex','smoker']).mean().reset_index() # same result as as_index=False above

k1_means = df.groupby('key1').mean().add_prefix('mean_')
pd.merge(df,k1_means,left_on='key1',right_index=True)
key = ['one','two','one','two','one']
people.groupby(key).mean()
people.groupby(key).transform(np.mean)
people.groupby(key).transform().mean() # × — transform() requires a function argument

def demean(arr):
    return arr-arr.mean()
demeaned = people.groupby(key).transform(demean)
demeaned.groupby(key).mean()
def top(df,n=5,column='tip_pct'):
    return df.sort_index(by=column)[-n:] # ascending by default, so the last n rows are the largest

top(tips, n=6)
tips.groupby('smoker').apply(top)
tips.groupby(['smoker','day']).apply(top,n=1,column='total_bill') # ×

result=tips.groupby('smoker')['tip_pct'].describe()
result.unstack('smoker')
f=lambda x:x.describe()
grouped.apply(f)
tips.groupby('smoker',group_keys=False).apply(top)

frame=pd.DataFrame({'data1':np.random.randn(1000),'data2':np.random.randn(1000)})
factor=pd.cut(frame.data1,4)
factor[:10]
def get_stats(group):
    return {'min':group.min(),'max':group.max(),'count':group.count(),'mean':group.mean()}
grouped=frame.data2.groupby(factor)
grouped.apply(get_stats).unstack()

grouping = pd.qcut(frame.data1,10,labels=False)
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()
s=pd.Series(np.random.randn(6))
s[::2]=np.nan
s.fillna(s.mean())

states=['ohio','new york','vermont','florida','oregon','nevada','california','idaho']
group_key=['east']*4+['west']*4
data=pd.Series(np.random.randn(8),index=states)
data[['vermont','nevada','idaho']]=np.nan
data.groupby(group_key).mean()
fill_mean = lambda g:g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)
def xx(g):
    return g.fillna(g.mean())
data.groupby(group_key).apply(xx)
fill_values={'east':0.5,'west':-1}

fill_func=lambda g:g.fillna(fill_values[g.name])
data.groupby(group_key).apply(fill_func)
def yy(g):
    return g.fillna(fill_values[g.name])
data.groupby(group_key).apply(yy)
suits=['h','s','c','d']
card_val=(list(range(1,11))+[10]*3)*4
base_names=['a']+list(range(2,11))+['j','q','k']
cards=[]
for suit in suits:
    cards.extend(str(num)+suit for num in base_names)
# × — the variant below is wrong: extend() on a string adds it character by
# character (append would be needed), and it would clobber the list built above
# cards=[]
# for suit in suits:
#     for num in base_names:
#         cards.extend(str(num)+suit)
deck=pd.Series(card_val,index=cards)

def draw(deck,n=5):
    return deck.take(np.random.permutation(len(deck))[:n])
draw(deck)
get_suit=lambda card: card[-1]
deck.groupby(get_suit).apply(draw,n=2)
deck.groupby(get_suit, group_keys=False).apply(draw,n=2)
df=pd.DataFrame({'category':['a']*4+['b']*4,'data':np.random.randn(8),'weights':np.random.rand(8)}) # weights should be non-negative, hence rand not randn

grouped=df.groupby('category')
get_wavg=lambda g: np.average(g['data'],weights=g['weights'])
grouped.apply(get_wavg)
close_px = pd.read_csv('g:/stock_px.csv',parse_dates=True,index_col=0)
close_px[-4:]
rets = close_px.pct_change().dropna()

spx_corr = lambda x: x.corrwith(x['SPX'])
by_year = rets.groupby(lambda x: x.year) # why does this work? a function passed to groupby is called on each index label (a Timestamp), so .year is available
by_year.apply(spx_corr)
by_year = rets.groupby(rets.index.year) # also works: group directly by the array of index years
by_year.apply(spx_corr)
by_year.apply(corrwith(rets.SPX)) # × — corrwith is a method, not a free function; NameError
by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))
by_year.apply(rets['AAPL'].corr(rets['MSFT'])) # × — apply needs a callable, not an already-computed scalar
import statsmodels.api as sm
def regress(data,yvar,xvars):
    y=data[yvar]
    x=data[xvars]
    x['intercept']=1.
    result=sm.OLS(y,x).fit()
    return result.params
by_year.apply(regress,'AAPL',['SPX'])

tips.pivot_table(index=['sex','smoker'])
tips.groupby(['sex','smoker']).mean()
tips.pivot_table(['tip_pct','size'],index=['sex','day'],columns='smoker')

tips.pivot_table(['tip_pct','size'],index=['sex','day'],columns='smoker',margins=True)
tips.pivot_table('tip_pct',index=['sex','smoker'],columns='day',aggfunc='count',margins=True)
tips.pivot_table('size',index=['time','sex','smoker'],columns='day',aggfunc=sum,fill_value=0)

data=pd.DataFrame({'sample':[1,2,3,4,5,6,7,8,9,10],
                   'gender':['f','m','f','m','m','m','f','f','m','f'],
                   'handedness':['r','l','r','r','l','r','r','l','r','r']})
pd.crosstab(data.gender,data.handedness,margins=True)
pd.crosstab([tips.time,tips.day],tips.smoker,margins=True)

fec=pd.read_csv('g:/p_all.csv')
fec.ix[123456]
unique_cands=fec.cand_nm.unique()
unique_cands[2]
parties={'Bachmann, Michelle':'r','Romney, Mitt':'r','Obama, Barack':'d'}

fec.cand_nm[123456:123461]
fec.cand_nm[123456:123461].map(parties)
fec['party']=fec.cand_nm.map(parties)
fec.party.value_counts()
fec.party.counts() # × — there is no counts() method; value_counts() is the right name
(fec.contb_receipt_amt>0).value_counts()
fec=fec[fec.contb_receipt_amt>0]
fec_mrbo=fec[fec.cand_nm.isin(['Obama, Barack','Romney, Mitt'])]
fec_mrbo=fec[fec.cand_nm.isin(('Obama, Barack','Romney, Mitt'))]
fec_mrbo=fec[fec.cand_nm in ['Obama, Barack','Romney, Mitt']] # × — Python's in compares the whole Series and raises "truth value of a Series is ambiguous"
fec_mrbo=fec[fec.cand_nm.str.isin('Obama, Barack','Romney, Mitt')] # × — isin is a Series method, not part of the .str accessor
fec_mrbo=fec[fec.cand_nm.str in('Obama, Barack','Romney, Mitt')] # ×

fec.contbr_occupation.value_counts()[:10]
occ_mapping={
'INFORMATION REQUESTED PER BEST EFFORTS':'not provided',
'INFORMATION REQUESTED':'not provided',
'INFORMATION REQUESTED (BEST EFFORTS)':'not provided',
'C.E.O.':'CEO'
}
f=lambda x: occ_mapping.get(x,x)
fec.contbr_occupation=fec.contbr_occupation.map(f)
emp_mapping={
'INFORMATION REQUESTED PER BEST EFFORTS':'not provided',
'INFORMATION REQUESTED':'not provided',
'SELF':'SELF-EMPLOYED',
'SELF EMPLOYED':'SELF-EMPLOYED',
}
f=lambda x: emp_mapping.get(x,x)
fec.contbr_employer=fec.contbr_employer.map(f)
by_occupation=fec.pivot_table('contb_receipt_amt',index='contbr_occupation',columns='party',aggfunc='sum')

over_2mm=by_occupation[by_occupation.sum(1)>2000000]
over_2mm.plot(kind='barh')

def get_top_amounts(group,key,n=5):
    totals= group.groupby(key)['contb_receipt_amt'].sum()
    return totals.order(ascending=False)[:n]
grouped=fec_mrbo.groupby('cand_nm')
grouped.apply(get_top_amounts,'contbr_occupation',n=7)
grouped.apply(get_top_amounts,'contbr_employer',n=7)

bins=np.array([0,1,10,100,1000,10000,100000,1000000,10000000])
labels=pd.cut(fec_mrbo.contb_receipt_amt,bins)
grouped=fec_mrbo.groupby(['cand_nm',labels])
grouped.size().unstack(0)
bucket_sums=grouped.contb_receipt_amt.sum().unstack(0)

normed_sum=bucket_sums.div(bucket_sums.sum(axis=1),axis=0)
normed_sum[:-2].plot(kind='barh',stacked=True)
grouped=fec_mrbo.groupby(['cand_nm','contbr_st'])

totals=grouped.contb_receipt_amt.sum().unstack(0).fillna(0)
totals[:10]
percent=totals.div(totals.sum(axis=1),axis=0)
percent=totals.div(totals.sum(1),axis=0) # same as the line above, positional axis
percent[:10]
from mpl_toolkits.basemap import Basemap,cm

from matplotlib import rcParams
from matplotlib.collections import LineCollection
import matplotlib.pyplot as plt
from shapelib import ShapeFile
import dbflib

# p255
data=pd.read_csv('g:/haiti.csv')
data.head()

data.describe()
data=data[(data.LATITUDE>18)&(data.LATITUDE<20)
          &(data.LONGITUDE>-75)&(data.LONGITUDE<-70)
          &data.CATEGORY.notnull()]
def to_cat_list(catstr):
    stripped=(x.strip() for x in catstr.split(','))
    return [x for x in stripped if x]
def get_all_categories(cat_series):
    cat_sets=(set(to_cat_list(x)) for x in cat_series)
    return sorted(set.union(*cat_sets))
def get_english(cat):
    code,names=cat.split('.')
    if '|' in names:
        names=names.split('|')[1]
    return code, names

all_cats=get_all_categories(data.CATEGORY)
english_mapping=dict(get_english(x) for x in all_cats)        
english_mapping['2a'] 
def get_code(seq):
    return [x.split('.')[0] for x in seq if x ] 
all_codes=get_code(all_cats)
all_cats.str.split('.')[0] # × 'list' object has no attribute 'str'
code_index=pd.Index(np.unique(all_codes))
dummy_frame=pd.DataFrame(np.zeros((len(data),len(code_index))),index=data.index,columns=code_index)
dummy_frame.ix[:,:6]
for row, cat in zip(data.index, data.CATEGORY):
    codes=get_code(to_cat_list(cat))
    dummy_frame.ix[row,codes]=1
data=data.join(dummy_frame.add_prefix('category_'))    

data.ix[:,10:15]
def basic_haiti_map(ax=None,lllat=17.25,urlat=20.25,lllon=-75,urlon=-71):
    m=Basemap(ax=ax, projection='stere',lon_0=(urlon+lllon)/2,lat_0=(urlat+lllat)/2,
              llcrnrlat=lllat,urcrnrlat=urlat,llcrnrlon=lllon,urcrnrlon=urlon,resolution='f')
    m.drawcoastlines()
    m.drawstates()
    m.drawcountries()
    return m
fig,axes=plt.subplots(nrows=2,ncols=2,figsize=(12,10))
fig.subplots_adjust(hspace=0.05,wspace=0.05)
to_plot=['2a','1','3c','7a']
lllat=17.25;urlat=20.25;lllon=-75;urlon=-71
for code, ax in zip(to_plot,axes.flat):
    m=basic_haiti_map(ax,lllat=lllat,urlat=urlat,lllon=lllon,urlon=urlon)
    cat_data=data[data['category_%s' % code]==1]
    # x,y=m(cat_data.LONGITUDE,cat_data.LATITUDE)
    # workaround: convert the coordinates point by point instead of passing the Series to m()
    add_v=pd.DataFrame()
    for i,j in zip(cat_data.LONGITUDE,cat_data.LATITUDE):
        x,y=m(i,j)
        add_v=add_v.append(pd.DataFrame([x,y]).T)
    # m.plot(x,y,'k.',alpha=0.5)
    m.plot(add_v[0],add_v[1],'k.',alpha=0.5)
    ax.set_title('%s: %s' % (code, english_mapping[code]))
   

# the book's original version: pass the coordinate arrays to m() directly
for code, ax in zip(to_plot,axes.flat):
    m=basic_haiti_map(ax,lllat=lllat,urlat=urlat,lllon=lllon,urlon=urlon)
    cat_data=data[data['category_%s' % code]==1]
    x,y=m(cat_data.LONGITUDE,cat_data.LATITUDE)
    m.plot(x,y,'k.',alpha=0.5)
    ax.set_title('%s: %s' % (code, english_mapping[code]))
