Apply Operations To Groups In Pandas

# -*- coding: utf-8 -*-
"""
Created on Thu Jul 26 17:31:03 2018

@author: Administrator
"""

"""
Apply Operations To Groups In Pandas
"""
import pandas as pd
# Sample roster: three regiments, each with a 1st and a 2nd company of two
# soldiers, plus every soldier's score before and after the test.
raw_data = dict(
    regiment=['Nighthawks'] * 4 + ['Dragoons'] * 4 + ['Scouts'] * 4,
    company=['1st', '1st', '2nd', '2nd'] * 3,
    name=[
        'Miller', 'Jacobson', 'Ali', 'Milner',
        'Cooze', 'Jacon', 'Ryaner', 'Sone',
        'Sloan', 'Piger', 'Riani', 'Ali',
    ],
    preTestScore=[4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
    postTestScore=[25, 94, 57, 62, 70, 25, 94, 57, 62, 70, 62, 70],
)
# The column order is spelled out explicitly so the layout of the frame
# never depends on the ordering of the dict's keys.
df = pd.DataFrame(
    raw_data,
    columns=['regiment', 'company', 'name', 'preTestScore', 'postTestScore'],
)

# Group the preTestScore column by regiment. Selecting the column from the
# grouped frame gives a SeriesGroupBy keyed on 'regiment'.
groupby_regiment = df.groupby('regiment')['preTestScore']
print(groupby_regiment)
#<pandas.core.groupby.SeriesGroupBy object at 0x0000000011C920F0>
# What gets printed is only the GroupBy object itself. It has not computed
# anything yet apart from some intermediate data about the group key
# df['regiment']; it simply holds everything needed to apply an operation
# to each of the groups later.

# Materialise the groups as a list of (regiment, scores) pairs to look at them:
print(list(groupby_regiment))
#[('Dragoons', 4     3
#5     4
#6    24
#7    31
#Name: preTestScore, dtype: int64), ('Nighthawks', 0     4
#1    24
#2    31
#3     2
#Name: preTestScore, dtype: int64), ('Scouts', 8     2
#9     3
#10    2
#11    3
#Name: preTestScore, dtype: int64)]

# Descriptive statistics, one row per regiment:
print(groupby_regiment.describe())
#            count   mean        std  min   25%   50%    75%   max
#regiment                                                         
#Dragoons      4.0  15.50  14.153916  3.0  3.75  14.0  25.75  31.0
#Nighthawks    4.0  15.25  14.453950  2.0  3.50  14.0  25.75  31.0
#Scouts        4.0   2.50   0.577350  2.0  2.00   2.5   3.00   3.0

# Average preTestScore of each regiment:
print(groupby_regiment.mean())
#regiment
#Dragoons      15.50
#Nighthawks    15.25
#Scouts         2.50
#Name: preTestScore, dtype: float64

# Average preTestScore for every (regiment, company) pair:
gb_r_c = df.groupby(['regiment', 'company'])['preTestScore']
pre_means = gb_r_c.mean()
print(pre_means)
#regiment    company
#Dragoons    1st         3.5
#            2nd        27.5
#Nighthawks  1st        14.0
#            2nd        16.5
#Scouts      1st         2.5
#            2nd         2.5
#Name: preTestScore, dtype: float64

# The same means without hierarchical indexing: unstack() moves the inner
# 'company' level of the index into the columns.
pre_means_table = pre_means.unstack()
print(pre_means_table)
#company      1st   2nd
#regiment              
#Dragoons     3.5  27.5
#Nighthawks  14.0  16.5
#Scouts       2.5   2.5

# Unstacking the table's remaining index level gives a Series again, this
# time indexed by (company, regiment).
print(pre_means_table.unstack(level=0))
# The table's index has a single level, so the default level=-1 picks the
# same level and pre_means_table.unstack() prints the same thing.
#company  regiment  
#1st      Dragoons       3.5
#         Nighthawks    14.0
#         Scouts         2.5
#2nd      Dragoons      27.5
#         Nighthawks    16.5
#         Scouts         2.5
#dtype: float64

# Group the entire dataframe by regiment and company:
df_gb_rc = df.groupby(['regiment', 'company'])
# Average only the numeric score columns. The frame still carries the text
# column 'name'; pandas >= 2.0 no longer drops such columns silently and
# raises TypeError unless numeric_only=True is requested explicitly.
print(df_gb_rc.mean(numeric_only=True))
#                    preTestScore  postTestScore
#regiment   company                             
#Dragoons   1st               3.5           47.5
#           2nd              27.5           75.5
#Nighthawks 1st              14.0           59.5
#           2nd              16.5           59.5
#Scouts     1st               2.5           66.0
#           2nd               2.5           66.0

# Number of observations in each regiment and company:
print(df_gb_rc.size())
#regiment    company
#Dragoons    1st        2
#            2nd        2
#Nighthawks  1st        2
#            2nd        2
#Scouts      1st        2
#            2nd        2
#dtype: int64

# Iterate an operation over the groups. Looping over a GroupBy yields one
# (group key, sub-frame) pair per regiment.
for name, group in df.groupby('regiment'):
    # The regiment's name, followed by two blank lines.
    print(name, end='\n\n\n')
    # The rows belonging to that regiment, followed by two blank lines.
    print(group, end='\n\n\n')
#Dragoons
#
#
#   regiment company    name  preTestScore  postTestScore
#4  Dragoons     1st   Cooze             3             70
#5  Dragoons     1st   Jacon             4             25
#6  Dragoons     2nd  Ryaner            24             94
#7  Dragoons     2nd    Sone            31             57
#
#
#Nighthawks
#
#
#     regiment company      name  preTestScore  postTestScore
#0  Nighthawks     1st    Miller             4             25
#1  Nighthawks     1st  Jacobson            24             94
#2  Nighthawks     2nd       Ali            31             57
#3  Nighthawks     2nd    Milner             2             62
#
#
#Scouts
#
#
#   regiment company   name  preTestScore  postTestScore
#8    Scouts     1st  Sloan             2             62
#9    Scouts     1st  Piger             3             70
#10   Scouts     2nd  Riani             2             62
#11   Scouts     2nd    Ali             3             70
    
# Per-regiment means with a 'mean_' prefix added to every column name.
# numeric_only=True restricts the average to the two score columns: the text
# columns 'company' and 'name' cannot be averaged, and pandas >= 2.0 raises
# TypeError for them instead of dropping them silently.
print(df.groupby('regiment').mean(numeric_only=True).add_prefix('mean_'))
#            mean_preTestScore  mean_postTestScore
#regiment                                         
#Dragoons                15.50                61.5
#Nighthawks              15.25                59.5
#Scouts                   2.50                66.0

def get_stats(group):
    """Summarise one group as a dict of its min, max, count and mean.

    ``group`` is any object exposing ``min()``, ``max()``, ``count()`` and
    ``mean()`` methods (in this script, the Series holding one group's
    values). The returned dict is keyed 'min', 'max', 'count', 'mean', in
    that order.
    """
    stat_names = ('min', 'max', 'count', 'mean')
    return {stat: getattr(group, stat)() for stat in stat_names}
# Bin edges and the label given to each of the four resulting score bands:
bins = [0, 25, 50, 75, 100]
group_names = ['Low', 'Okay', 'Good', 'Great']
# Add a 'categories' column holding the band each postTestScore falls into
# (a score of exactly 25 lands in 'Low', as the listing below shows).
df['categories'] = pd.cut(df['postTestScore'], bins, labels=group_names)
# print(df) at this point gives:
#      regiment company      name  preTestScore  postTestScore categories
#0   Nighthawks     1st    Miller             4             25        Low
#1   Nighthawks     1st  Jacobson            24             94      Great
#2   Nighthawks     2nd       Ali            31             57       Good
#3   Nighthawks     2nd    Milner             2             62       Good
#4     Dragoons     1st     Cooze             3             70       Good
#5     Dragoons     1st     Jacon             4             25        Low
#6     Dragoons     2nd    Ryaner            24             94      Great
#7     Dragoons     2nd      Sone            31             57       Good
#8       Scouts     1st     Sloan             2             62       Good
#9       Scouts     1st     Piger             3             70       Good
#10      Scouts     2nd     Riani             2             62       Good
#11      Scouts     2nd       Ali             3             70       Good

# Run get_stats on the postTestScore values of every category. Because
# get_stats returns a dict, apply() produces one long Series indexed by
# (category, statistic); unstack() pivots the statistic level into columns,
# which is the one-row-per-category table shown below.
print(df['postTestScore'].groupby(df['categories']).apply(get_stats).unstack())
#            count   max   mean   min
#categories                          
#Low           2.0  25.0  25.00  25.0
#Okay          0.0   NaN    NaN   NaN
#Good          8.0  70.0  63.75  57.0
#Great         2.0  94.0  94.00  94.0
# NOTE(review): the column order, and how the empty 'Okay' band is reported,
# may differ between pandas versions - confirm against the installed one.

""" Group by columns, i.e. axies=1. """
# Split the frame's columns into one sub-frame per dtype.
# DataFrame.groupby(..., axis=1) is deprecated in recent pandas releases, so
# the column names are bucketed by dtype explicitly instead of grouping along
# the column axis.
columns_by_dtype = {}
for column, dtype in df.dtypes.items():
    columns_by_dtype.setdefault(dtype, []).append(column)
# Print (dtype, sub-frame) pairs, ordered by dtype name so the order is stable.
print([(dtype, df[columns])
       for dtype, columns in sorted(columns_by_dtype.items(),
                                    key=lambda item: str(item[0]))])
# The listing below shows the int64 and object groups. Since the 'categories'
# column was added to df earlier in this script, an extra pair for its
# categorical dtype is printed as well (presumably first, as 'category' sorts
# before 'int64' - verify on the installed pandas).
#[(dtype('int64'),     preTestScore  postTestScore
#0              4             25
#1             24             94
#2             31             57
#3              2             62
#4              3             70
#5              4             25
#6             24             94
#7             31             57
#8              2             62
#9              3             70
#10             2             62
#11             3             70), (dtype('O'),       regiment company      name
#0   Nighthawks     1st    Miller
#1   Nighthawks     1st  Jacobson
#2   Nighthawks     2nd       Ali
#3   Nighthawks     2nd    Milner
#4     Dragoons     1st     Cooze
#5     Dragoons     1st     Jacon
#6     Dragoons     2nd    Ryaner
#7     Dragoons     2nd      Sone
#8       Scouts     1st     Sloan
#9       Scouts     1st     Piger
#10      Scouts     2nd     Riani
#11      Scouts     2nd       Ali)]

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章