# -*- coding: utf-8 -*-
"""
Created on Thu Jul 26 17:31:03 2018
@author: Administrator
"""
"""
Apply Operations To Groups In Pandas
"""
import pandas as pd
# Sample data: one row per soldier, with regiment, company, name and two
# test scores.
raw_data = {
    'regiment': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks',
                 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons',
                 'Scouts', 'Scouts', 'Scouts', 'Scouts'],
    'company': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd',
                '1st', '1st', '2nd', '2nd'],
    'name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze', 'Jacon',
             'Ryaner', 'Sone', 'Sloan', 'Piger', 'Riani', 'Ali'],
    'preTestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
    'postTestScore': [25, 94, 57, 62, 70, 25, 94, 57, 62, 70, 62, 70],
}
# Because a dict does not guarantee ordering on every Python version, pass
# the ``columns`` argument explicitly to fix the column sequence.
df = pd.DataFrame(
    raw_data,
    columns=['regiment', 'company', 'name', 'preTestScore', 'postTestScore'],
)

# Create a groupby variable that groups preTestScores by regiment:
groupby_regiment = df['preTestScore'].groupby(df['regiment'])
print(groupby_regiment)
# <pandas.core.groupby.SeriesGroupBy object at 0x0000000011C920F0>
# (the exact module path and address in this repr vary by pandas version/run)
# This grouped variable is now a GroupBy object. It has not actually
# computed anything yet except for some intermediate data about the group
# key df['regiment']. The idea is that this object has all of the
# information needed to then apply some operation to each of the groups.

# View a groupby object:
print(list(groupby_regiment))
# [('Dragoons', 4     3
# 5     4
# 6    24
# 7    31
# Name: preTestScore, dtype: int64), ('Nighthawks', 0     4
# 1    24
# 2    31
# 3     2
# Name: preTestScore, dtype: int64), ('Scouts', 8     2
# 9     3
# 10    2
# 11    3
# Name: preTestScore, dtype: int64)]

# Descriptive statistics by group:
print(groupby_regiment.describe())
#             count   mean        std  min   25%   50%    75%   max
# regiment
# Dragoons      4.0  15.50  14.153916  3.0  3.75  14.0  25.75  31.0
# Nighthawks    4.0  15.25  14.453950  2.0  3.50  14.0  25.75  31.0
# Scouts        4.0   2.50   0.577350  2.0  2.00   2.5   3.00   3.0

# Mean of each regiment's preTestScore:
print(groupby_regiment.mean())
# regiment
# Dragoons      15.50
# Nighthawks    15.25
# Scouts         2.50
# Name: preTestScore, dtype: float64

# Mean preTestScores grouped by regiment and company:
gb_r_c = df['preTestScore'].groupby([df['regiment'], df['company']])
print(gb_r_c.mean())
# regiment    company
# Dragoons    1st         3.5
#             2nd        27.5
# Nighthawks  1st        14.0
#             2nd        16.5
# Scouts      1st         2.5
#             2nd         2.5
# Name: preTestScore, dtype: float64

# Mean preTestScores grouped by regiment and company
# without hierarchical indexing:
print(gb_r_c.mean().unstack())
# company      1st   2nd
# regiment
# Dragoons     3.5  27.5
# Nighthawks  14.0  16.5
# Scouts       2.5   2.5

print(gb_r_c.mean().unstack().unstack(level=0))
# or print(gb_r_c.mean().unstack().unstack())  # default level=-1
# company  regiment
# 1st      Dragoons       3.5
#          Nighthawks    14.0
#          Scouts         2.5
# 2nd      Dragoons      27.5
#          Nighthawks    16.5
#          Scouts         2.5
# dtype: float64

# Group the entire dataframe by regiment and company:
df_gb_rc = df.groupby(['regiment', 'company'])
# numeric_only=True: the frame still holds the string column 'name', and
# pandas >= 2.0 raises TypeError when asked to average non-numeric columns.
print(df_gb_rc.mean(numeric_only=True))
#                     preTestScore  postTestScore
# regiment   company
# Dragoons   1st               3.5           47.5
#            2nd              27.5           75.5
# Nighthawks 1st              14.0           59.5
#            2nd              16.5           59.5
# Scouts     1st               2.5           66.0
#            2nd               2.5           66.0

# Number of observations in each regiment and company:
print(df_gb_rc.size())
# regiment    company
# Dragoons    1st        2
#             2nd        2
# Nighthawks  1st        2
#             2nd        2
# Scouts      1st        2
#             2nd        2
# dtype: int64

# Iterate an operation over groups:
# group the dataframe by regiment, and for each regiment,
for name, group in df.groupby('regiment'):
    # print the name of the regiment
    print(name)
    print('\n')
    # print the data of that regiment
    print(group)
    print('\n')
# Dragoons
#
#
#    regiment company    name  preTestScore  postTestScore
# 4  Dragoons     1st   Cooze             3             70
# 5  Dragoons     1st   Jacon             4             25
# 6  Dragoons     2nd  Ryaner            24             94
# 7  Dragoons     2nd    Sone            31             57
#
#
# Nighthawks
#
#
#      regiment company      name  preTestScore  postTestScore
# 0  Nighthawks     1st    Miller             4             25
# 1  Nighthawks     1st  Jacobson            24             94
# 2  Nighthawks     2nd       Ali            31             57
# 3  Nighthawks     2nd    Milner             2             62
#
#
# Scouts
#
#
#    regiment company   name  preTestScore  postTestScore
# 8    Scouts     1st  Sloan             2             62
# 9    Scouts     1st  Piger             3             70
# 10   Scouts     2nd  Riani             2             62
# 11   Scouts     2nd    Ali             3             70

# Add a prefix to the column names of the per-regiment means
# (numeric_only=True skips the string columns 'company' and 'name'):
print(df.groupby('regiment').mean(numeric_only=True).add_prefix('mean_'))
#             mean_preTestScore  mean_postTestScore
# regiment
# Dragoons                15.50                61.5
# Nighthawks              15.25                59.5
# Scouts                   2.50                66.0
# Create a function to get the stats of a group:
def get_stats(group):
    """Return summary statistics of one group as a dict.

    Args:
        group: An object exposing ``min()``, ``max()``, ``count()`` and
            ``mean()`` (for example a pandas Series handed over by
            ``GroupBy.apply``).

    Returns:
        A dict with the keys ``'min'``, ``'max'``, ``'count'`` and
        ``'mean'``, each holding the result of the same-named method.
    """
    return {
        'min': group.min(),
        'max': group.max(),
        'count': group.count(),
        'mean': group.mean(),
    }
# Create bins and bin up postTestScore by those bins:
bins = [0, 25, 50, 75, 100]
group_names = ['Low', 'Okay', 'Good', 'Great']
# Create a new categorical column that labels every postTestScore with the
# bin it falls into, using ``bins`` as edges and ``group_names`` as labels.
df['categories'] = pd.cut(df['postTestScore'], bins, labels=group_names)
# print(df)
#       regiment company      name  preTestScore  postTestScore categories
# 0   Nighthawks     1st    Miller             4             25        Low
# 1   Nighthawks     1st  Jacobson            24             94      Great
# 2   Nighthawks     2nd       Ali            31             57       Good
# 3   Nighthawks     2nd    Milner             2             62       Good
# 4     Dragoons     1st     Cooze             3             70       Good
# 5     Dragoons     1st     Jacon             4             25        Low
# 6     Dragoons     2nd    Ryaner            24             94      Great
# 7     Dragoons     2nd      Sone            31             57       Good
# 8       Scouts     1st     Sloan             2             62       Good
# 9       Scouts     1st     Piger             3             70       Good
# 10      Scouts     2nd     Riani             2             62       Good
# 11      Scouts     2nd       Ali             3             70       Good

# Statistics of postTestScore per bin.
# * observed=False is spelled out so the empty 'Okay' bin stays in the
#   result and newer pandas does not warn about the changing default.
# * get_stats returns a dict, so apply() yields a Series indexed by
#   (category, statistic); unstack() pivots it into the table shown below.
stats_by_category = (
    df['postTestScore']
    .groupby(df['categories'], observed=False)
    .apply(get_stats)
    .unstack()
)
print(stats_by_category)
# (column order may differ between pandas versions)
#             count   max   mean   min
# categories
# Low           2.0  25.0  25.00  25.0
# Okay          0.0   NaN    NaN   NaN
# Good          8.0  70.0  63.75  57.0
# Great         2.0  94.0  94.00  94.0

# Group by columns (i.e. along axis=1): collect the columns sharing a dtype.
# ``df.groupby(df.dtypes, axis=1)`` relies on the ``axis`` argument, which is
# deprecated since pandas 2.1 and slated for removal, so the columns are
# partitioned by dtype explicitly instead.
columns_by_dtype = {}
for column, dtype in df.dtypes.items():
    columns_by_dtype.setdefault(dtype, []).append(column)
print([(dtype, df[columns]) for dtype, columns in columns_by_dtype.items()])
# Groups are listed in order of first appearance of each dtype. Sample output
# for the string and integer groups (dtype reprs vary by pandas version); the
# 'categories' column forms one more group of its own (category dtype).
# [(dtype('O'),       regiment company      name
# 0   Nighthawks     1st    Miller
# 1   Nighthawks     1st  Jacobson
# 2   Nighthawks     2nd       Ali
# 3   Nighthawks     2nd    Milner
# 4     Dragoons     1st     Cooze
# 5     Dragoons     1st     Jacon
# 6     Dragoons     2nd    Ryaner
# 7     Dragoons     2nd      Sone
# 8       Scouts     1st     Sloan
# 9       Scouts     1st     Piger
# 10      Scouts     2nd     Riani
# 11      Scouts     2nd       Ali), (dtype('int64'),     preTestScore  postTestScore
# 0              4             25
# 1             24             94
# 2             31             57
# 3              2             62
# 4              3             70
# 5              4             25
# 6             24             94
# 7             31             57
# 8              2             62
# 9              3             70
# 10             2             62
# 11             3             70), ...]