目錄:
一. 數據讀取與預處理
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline
# Use seaborn's 'poster' plotting context with enlarged fonts for the figures below.
sns.set_context('poster', font_scale = 1.3)
# Load the gzip-compressed red-card dataset from the working directory.
df = pd.read_csv('redcard.csv.gz', compression = 'gzip')
# Report (rows, columns) and preview the first rows.
print(df.shape)
df.head()
(146028, 28)
# Keep the full list of column names for later reference.
all_columns = df.columns.tolist()
# df.info()
# df.dtypes
# Summary statistics, transposed so that each column of df becomes a row.
df.describe().T
# Number of distinct values in each of the two player identifier columns.
len(df.playerShort.unique())
len(df.player.unique())
2053
2034
# Mean height taken over every row of df.
print(df.height.mean())
# Mean of the per-group mean heights, grouping first by playerShort and then by player.
print(np.mean(df.groupby('playerShort').height.mean()))
np.mean(df.groupby('player').height.mean())
181.93593798236887
181.74372848007872
181.7593471266707
二. 數據板塊切分
2.1 創建子表並快速測試唯一性的輔助函數
# Grouping key and the columns extracted into the players sub-table.
player_index = 'playerShort'
player_cols = ['birthday', 'height', 'weight', 'position', 'photoID', 'rater1', 'rater2']
def get_subgroup(dataframe, g_index, g_columns):
    """Build a sub-table with one row per ``g_index`` group.

    Args:
        dataframe: Source DataFrame.
        g_index: Column name, or list of column names, to group by.
        g_columns: Columns to carry into the sub-table.

    Returns:
        A DataFrame indexed by ``g_index`` holding, for each group, the
        maximum of every column in ``g_columns``.

    A warning is printed when any group has more than one distinct value in
    any of ``g_columns``, because taking the maximum then discards
    information.
    """
    grouped = dataframe.groupby(g_index)
    distinct_counts = grouped.agg({col: 'nunique' for col in g_columns})
    # Warn if ANY column of ANY group is non-unique. Masking followed by
    # dropna() would only catch groups where every column is non-unique.
    if (distinct_counts > 1).any().any():
        print("Warning: you probably assumed this had all unique values but it doesn't.")
    return grouped.agg({col: 'max' for col in g_columns})
# One row per playerShort, carrying the player-level columns.
players = get_subgroup(df, player_index, player_cols)
players.head()
2.2 Players子表
def save_subgroup(dataframe, g_index, subgroup_name, prefix = 'raw_'):
    """Write a sub-table to a gzip-compressed CSV and check the round trip.

    Args:
        dataframe: Sub-table to save; its index is written to the file.
        g_index: Column name(s) to use as the index when reading the file back.
        subgroup_name: Middle part of the output file name.
        prefix: Leading part of the output file name.

    The file is written to ``<prefix><subgroup_name>.csv.gz`` in the current
    working directory, then read back and compared with ``dataframe``. The
    outcome of the comparison is printed; nothing is returned.
    """
    save_subgroup_filename = ''.join([prefix, subgroup_name, '.csv.gz'])
    dataframe.to_csv(save_subgroup_filename, compression='gzip', encoding='UTF-8')
    # Read the file straight back to confirm nothing was lost on the way out.
    test_df = pd.read_csv(save_subgroup_filename, compression='gzip',
                          index_col=g_index, encoding='UTF-8')
    if dataframe.equals(test_df):
        print('Test-passed: we recover the equivalent subgroup dataframe.')
    else:
        print('Warning -- equivalence test!!! Double-check.')
# Persist the players sub-table (default 'raw_' prefix) and run the round-trip check.
save_subgroup(players, player_index, 'players')
Test-passed: we recover the equivalent subgroup dataframe.
2.3 Clubs子表
# Clubs sub-table: one row per club with its leagueCountry.
club_index = 'club'
club_cols = ['leagueCountry']
clubs = get_subgroup(df, club_index, club_cols)
clubs.head()
# Number of clubs per league country.
clubs['leagueCountry'].value_counts()
save_subgroup(clubs, club_index, 'clubs')
2.4 Referees子表
# Referees sub-table: one row per refNum with its refCountry.
referee_index = 'refNum'
referee_cols = ['refCountry']
referees = get_subgroup(df, referee_index, referee_cols)
referees.head()
# Number of distinct referee countries, then the table shape.
print(referees.refCountry.nunique())
print(referees.shape)
save_subgroup(referees, referee_index, 'referees')
161
(3147, 1)
Test-passed: we recover the equivalent subgroup dataframe.
2.5 refCountry子表
# Countries sub-table: one row per refCountry with the country-level columns.
country_index = 'refCountry'
country_cols = ['Alpha_3', 'meanIAT', 'nIAT', 'seIAT', 'meanExp', 'nExp', 'seExp']
countries = get_subgroup(df, country_index, country_cols)
# Expose the Alpha_3 column under the name countryName.
countries = countries.rename(columns = {'Alpha_3' : 'countryName'})
print(countries.shape)
countries.head()
(161, 7)
# Persist the countries sub-table and run the round-trip check.
save_subgroup(countries, country_index, 'countries')
2.6 紅牌Dyads子表
# Dyads sub-table: one row per (refNum, playerShort) pair with the game and card columns.
dyad_index = ['refNum', 'playerShort']
dyad_cols = ['games','victories','ties','defeats','goals','yellowCards','yellowReds','redCards']
dyads = get_subgroup(df, dyad_index, dyad_cols)
print(dyads.shape)
dyads.head()
(146028, 8)
# Largest redCards value recorded for a single referee-player pair.
print(dyads.redCards.max())
# Persist the dyads sub-table and run the round-trip check.
save_subgroup(dyads, dyad_index, 'dyads')
2
Test-passed: we recover the equivalent subgroup dataframe.
三. 缺失值
def load_subgroup(filename, index_col=(0,)):
    """Load a gzip-compressed CSV sub-table.

    Args:
        filename: Path of the ``.csv.gz`` file to read.
        index_col: Column position(s) to use as the index, forwarded to
            ``pandas.read_csv``. Defaults to the first column.

    Returns:
        The DataFrame read from ``filename``.
    """
    # The default is an immutable tuple so no state is shared between calls;
    # it is converted to the list form before being handed to pandas.
    if isinstance(index_col, tuple):
        index_col = list(index_col)
    return pd.read_csv(filename, compression='gzip', index_col=index_col)
# Reload the players sub-table from the file written by save_subgroup.
players = load_subgroup('raw_players.csv.gz')
# Report the shape of the reloaded table (the original referenced an
# undefined name `player` here).
print(players.shape)
players.head()
(2053, 7)
3.1 missingno庫
import missingno as msno
# msno.bar(players, figsize = (16, 6))
# Draw missingno's matrix and heatmap views of the players table.
msno.matrix(players, figsize = (16, 6))
msno.heatmap(players, figsize = (16, 6))
3.2 查看數據缺失值比例python模板
def missing_value_table(df):
    """Summarise missing values per column.

    Args:
        df: DataFrame to inspect.

    Returns:
        A DataFrame indexed by column name with the columns
        'Missing Values' (count of nulls) and '% of Total Values'
        (percentage of rows that are null, rounded to 2 decimals). Only
        columns with at least one null are included, sorted by percentage
        in descending order.

    A short summary of how many columns contain nulls is also printed.
    """
    mis_val = df.isnull().sum()
    row_count = len(df)
    if row_count:
        mis_val_percent = 100 * mis_val / row_count
    else:
        # An empty frame has no missing values; avoid the 0/0 -> NaN result.
        mis_val_percent = mis_val.astype(float)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1,
                              keys=['Missing Values', '% of Total Values'])
    # Filter on the null count rather than the percentage so NaN percentages
    # can never be mistaken for "has missing values".
    has_missing = mis_val_table['Missing Values'] > 0
    mis_val_table_ren_columns = (
        mis_val_table[has_missing]
        .sort_values('% of Total Values', ascending=False)
        .round(2)
    )
    print('Your selected dataframe has {} columns.\nThere are {} columns that have missing values.'.format(df.shape[1], mis_val_table_ren_columns.shape[0]))
    return mis_val_table_ren_columns
# Per-column missing-value summary for the players table.
missing_value_table(players)
Your selected dataframe has 7 columns.
There are 6 columns that have missing values.
3.3 pandas查看缺失值
# Count the players, then how many lack a rating from rater1, from rater2,
# and from both raters at once.
print('All players: ', players.shape[0])
print('rater1 nulls: ', players.rater1.isnull().sum())
print('rater2 nulls: ', players.rater2.isnull().sum())
print('Both nulls: ', (players.rater1.isnull() & players.rater2.isnull()).sum())
All players: 2053
rater1 nulls: 468
rater2 nulls: 468
Both nulls: 468
3.4 剔除缺失值
# Keep only players that have a rater1 score. The explicit copy makes the
# result an independent frame, so the columns assigned to `players` later are
# not written to a slice of the loaded table.
players = players[players.rater1.notnull()].copy()
print(players.shape[0])
msno.matrix(players, figsize = (16, 6))
1585
3.5 創造新特徵
rater1與rater2相似度極高
# Cross-tabulation of the two raters' scores.
pd.crosstab(players.rater1, players.rater2)
# Render the same cross-tabulation as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    pd.crosstab(players.rater1, players.rater2),
    annot=True,
    fmt='d',
    annot_kws={'size': 20},
    cmap='Blues',
    ax=ax,
)
ax.set_title('Correlation between Rater1 and Rater2\n', fontsize=22)
ax.tick_params(labelsize=16)
ax.set_xlabel('rater2', fontsize=18)
ax.set_ylabel('rater1', fontsize=18)
fig.tight_layout()
根據rater1和rater2創造新的特徵
# New feature: the row-wise mean of the two raters' scores.
players['skintone'] = players[['rater1', 'rater2']].mean(axis = 1)
players.head()
四. 特徵可視化展示
# Skin tone: histogram of the skintone feature (no KDE curve).
plt.figure(figsize = (8, 6))
sns.distplot(players.skintone, kde = False)
# Positions: horizontal bar chart of how many players hold each position,
# with missing positions counted as their own bar.
fig, ax = plt.subplots(figsize = (12, 8))
players.position.value_counts(dropna=False, ascending=True).plot(kind='barh', ax=ax)
ax.set_ylabel('Position')
# Axis label spelling corrected (was 'Coounts').
ax.set_xlabel('Counts')
fig.tight_layout()
# Collapse the detailed positions into four coarse groups.
fullback = ['Center Back', 'Left Fullback', 'Right Fullback']
midfield = ['Right Midfielder', 'Center Midfielder', 'Left Midfielder',
            'Defensive Midfielder', 'Attacking Midfielder']
forward = ['Left Winger', 'Right Winger', 'Center Forward']
keeper = ['Goalkeeper']
position_groups = {
    'Fullback': fullback,
    'Midfield': midfield,
    'Forward': forward,
    'Keeper': keeper,
}
# Invert to a position -> group lookup so every group is written to the same
# 'position_agg' column. The original wrote fullbacks to a misspelled
# 'positon_agg' column, leaving them missing from 'position_agg'.
position_to_group = {
    position: group
    for group, members in position_groups.items()
    for position in members
}
# Positions that are missing or not listed above map to NaN.
players['position_agg'] = players['position'].map(position_to_group)
# Switch matplotlib to the 'ggplot' style and plot how many players fall in
# each aggregated position, counting missing values as their own bar.
plt.style.use('ggplot')
plt.figure(figsize = (12, 6))
players['position_agg'].value_counts(dropna = False, ascending = True).plot(kind = 'barh')
plt.xlabel('Count', fontsize = 22)
plt.ylabel('position_agg', fontsize = 22)
五. 多特徵關係
5.1 pandas.plotting.scatter_matrix
# Pairwise scatter plots of height, weight and skintone, with histograms on the diagonal.
fig, ax = plt.subplots(figsize = (10, 10))
pd.plotting.scatter_matrix(players[['height', 'weight', 'skintone']], diagonal = 'hist', ax = ax)
5.2 seaborn.PairGrid
# Pair grid of height, weight and skintone: scatter plots above and below the
# diagonal, histograms on it.
grid = sns.PairGrid(players[['height', 'weight', 'skintone']], height = 4)
grid.map_upper(plt.scatter, s = 20)
# The histogram belongs on the diagonal; the original mapped it onto the upper
# triangle a second time, leaving the diagonal empty.
grid.map_diag(plt.hist, edgecolor = 'black')
grid.map_lower(plt.scatter, color = 'darkred')
5.3 seaborn.pairplot
# Same three variables via seaborn's one-call pairplot, with histograms on the diagonal.
sns.pairplot(players[['height', 'weight', 'skintone']], height = 4, diag_kind = 'hist',
plot_kws = {'alpha': 0.6, 's': 50})
5.4 seaborn.regplot
# Regression plot of height against weight.
plt.figure(figsize = (8, 6))
# x and y are passed by keyword: newer seaborn releases accept only `data`
# positionally.
sns.regplot(x = 'weight', y = 'height', data = players, scatter_kws = {'s': 30})
# Labels follow the plotted variables (weight on x, height on y); the original
# had them swapped.
plt.xlabel('Weight [kg]')
plt.ylabel('Height [cm]')
5.5 連續值變量切分
# Labels for the quantile bins of weight and height.
weight_categories = ["vlow_weight","low_weight","mid_weight","high_weight","vhigh_weight"]
height_categories = ["vlow_height","low_height","mid_height","high_height","vhigh_height"]
# Cut weight and height into as many quantile bins as there are labels.
players['weightclass'] = pd.qcut(
    players['weight'],
    q=len(weight_categories),
    labels=weight_categories,
)
players['heightclass'] = pd.qcut(
    players['height'],
    q=len(height_categories),
    labels=height_categories,
)
# Skintone is cut into three quantile bins that keep their interval labels.
players['skintoneclass'] = pd.qcut(players['skintone'], q=3)
六. 報表可視化分析
Pandas profiling
import pandas_profiling
# Build a pandas_profiling report over the whole players table.
pandas_profiling.ProfileReport(players)
七. 特徵組合與特徵選擇
7.1 創造年齡特徵
# Parse the birthday strings, which are in day.month.year format.
players['birth_date'] = pd.to_datetime(players.birthday, format = '%d.%m.%Y')
players.birth_date.head()
# Age in years on 2013-01-01, counting 365.25 days per year.
players['age_years'] = (pd.to_datetime('2013-01-01') - players['birth_date']).dt.days / 365.25
players['age_years']
7.2 選擇有用的特徵
# Columns kept in the cleaned players table; the commented-out names are the
# columns deliberately left out.
players_cleaned_variables = [#'birthday',
'height',
'weight',
# 'position',
# 'photoID',
# 'rater1',
# 'rater2',
'skintone',
'position_agg',
'weightclass',
'heightclass',
'skintoneclass',
# 'birth_date',
'age_years']
# Profile the selected columns, then write them to a gzip-compressed CSV.
pandas_profiling.ProfileReport(players[players_cleaned_variables])
players[players_cleaned_variables].to_csv('cleaned_players.csv.gz', compression = 'gzip')
八. 紅牌與人種膚色關係
8.1 根據球員新子表過濾其它子表
# Reload the saved dyads table, using its first two columns as the index.
agg_dyads = pd.read_csv('raw_dyads.csv.gz', compression = 'gzip', index_col = [0, 1])
print(agg_dyads.shape)
agg_dyads.head(8)
(146028, 8)
# Row count after re-indexing by playerShort (the original line was missing
# its closing parenthesis).
print(len(agg_dyads.reset_index().set_index('playerShort')))
# Check that games equals victories + ties + defeats in every row.
all(agg_dyads['games'] == agg_dyads.victories + agg_dyads.ties + agg_dyads.defeats)
146028
True
# totalRedCards is the sum of the yellowReds and redCards columns; redCards is
# then renamed so the two measures are not confused.
agg_dyads['totalRedCards'] = agg_dyads['yellowReds'] + agg_dyads['redCards']
agg_dyads.rename(columns = {'redCards': 'strictRedCards'}, inplace = True)
agg_dyads.head()
# clean_players was used below without ever having been assigned at this
# point; load the cleaned players table that was written to disk earlier.
clean_players = load_subgroup('cleaned_players.csv.gz')
# Flatten the (refNum, playerShort) index once and reuse it for both steps.
dyads_flat = agg_dyads.reset_index()
# Attach each player's cleaned attributes to every dyad of that player.
player_dyad = clean_players.merge(dyads_flat.set_index('playerShort'),
                                  left_index = True, right_index = True)
print(player_dyad.shape)
player_dyad.head()
# Keep only the dyads whose player appears in the cleaned players table.
clean_dyads = dyads_flat[dyads_flat.playerShort.isin(set(clean_players.index))
                         ].set_index(['refNum', 'playerShort'])
print(clean_dyads.shape)
clean_dyads.head()
爲了正確地處理數據,我們必須將數據展開,讓每一場比賽各佔一行。
# Expand every referee-player dyad into one row per game. Of a dyad's rows,
# the first `totalRedCards` get redcard = 1 and the remaining ones get 0.
dyad_records = clean_dyads.reset_index()[['refNum', 'playerShort', 'games', 'totalRedCards']]
tidy_rows = []
# itertuples avoids building (and mutating) a Series for every row, as
# iterrows does.
for ref, player, games_played, red_cards in dyad_records.itertuples(index=False, name=None):
    tidy_rows.extend(
        [ref, player, 1 if game_number < red_cards else 0]
        for game_number in range(games_played)
    )
tidy_dyads = pd.DataFrame(
    tidy_rows, columns=['refNum', 'playerShort', 'redcard']
).set_index(['refNum', 'playerShort'])
# Total red cards and total games; the number of rows in tidy_dyads should
# equal the total games.
print(tidy_dyads.redcard.sum())
print(clean_dyads.games.sum())
print(tidy_dyads.shape)
tidy_dyads.head()
3092
373067
(373067, 1)
tidy_dyads.redcard.sum()
# 3092
# Keep only the referees whose refNum appears in tidy_dyads.
clean_referees = referees.reset_index()[referees.reset_index().refNum.isin(tidy_dyads.reset_index().refNum.unique())].set_index('refNum')
clean_referees.shape, referees.shape
# ((2978, 1), (3147, 1))
# Keep only the countries that the remaining referees refer to.
clean_countries = countries.reset_index()[countries.reset_index().refCountry.isin(clean_referees.refCountry.unique())].set_index('refCountry')
clean_countries.shape, countries.shape
# ((160, 7), (161, 7))
# Write the per-game table to a gzip-compressed CSV.
tidy_dyads.to_csv('cleaned_dyads.csv.gz', compression = 'gzip')
8.2 球員獲得紅牌及裁判給紅牌分佈
# Reload the cleaned per-game dyads and the cleaned players table.
tidy_dyads = pd.read_csv('cleaned_dyads.csv.gz', compression = 'gzip', index_col = [0,1])
clean_players = load_subgroup('cleaned_players.csv.gz')
# Red cards summed over index level 0 (refNum), largest first. The original
# line was missing the opening brace of the rename mapping.
tidy_dyads.groupby(level=0).sum().sort_values('redcard', ascending=False).rename(columns={'redcard':'total redcards given'}).head(8)
# Red cards summed over index level 1 (playerShort), largest first. The
# original called the non-existent method sort_value.
tidy_dyads.groupby(level=1).sum().sort_values('redcard', ascending=False).rename(columns={'redcard':'total redcards received'}).head()
# Number of rows (games) per referee and per player.
total_ref_games = tidy_dyads.groupby(level=0).size().sort_values(ascending = False)
total_player_games = tidy_dyads.groupby(level=1).size().sort_values(ascending=False)
# Red-card totals per referee and per player.
total_ref_given = tidy_dyads.groupby(level=0).sum().sort_values(ascending=False, by='redcard')
total_player_received = tidy_dyads.groupby(level=1).sum().sort_values(ascending=False, by='redcard')
# Histograms (no KDE curve) of the red-card totals per player and per referee.
sns.set_style('darkgrid')
plt.figure(figsize = (8, 6))
sns.distplot(total_player_received, kde = False)
plt.figure(figsize = (8, 6))
sns.distplot(total_ref_given, kde = False)
8.3 紅牌與膚色的關係
# Join each per-game row with the attributes of its player, matching on the
# playerShort index.
player_ref_game = tidy_dyads.reset_index().set_index('playerShort'
).merge(clean_players, left_index = True, right_index = True)
print(player_ref_game.shape)
player_ref_game.head()
(373067, 10)
# Bootstrap: draw 100 samples of 10,000 rows each (with replacement) and take
# the per-skintone mean of the numeric columns in every sample.
# numeric_only=True is required because player_ref_game also holds
# non-numeric columns, which recent pandas versions refuse to average.
bootstrap = pd.concat([
    player_ref_game.sample(replace = True, n = 10000)
    .groupby('skintone')
    .mean(numeric_only = True)
    for _ in range(100)
])
print(bootstrap.shape)
bootstrap.head()
(900, 5)
# LOWESS-smoothed plot of the bootstrapped mean redcard value against skintone.
plt.figure(figsize = (8, 6))
# x is passed by keyword: newer seaborn releases accept only `data`
# positionally, so a positional x clashes with the data= argument.
ax = sns.regplot(x = bootstrap.index.values, y = 'redcard', data = bootstrap, lowess = True,
                 scatter_kws = {'alpha': 0.4, 's': 10}, x_jitter = 0.125 / 4)
ax.set_xlabel('Skintone')