pandas清洗Kobe數據

清洗Kobe數據

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np


# Load the full shot log. Forward slashes keep this relative path valid on
# every platform and avoid the invalid "\d" / "\k" escape sequences that the
# backslash spelling relied on.
raw = pd.read_csv('./data/kobe_data.csv')
print(raw.shape)
# `kobe` keeps only the rows whose shot_made_flag is present (not null).
kobe = raw[pd.notnull(raw['shot_made_flag'])]
print(kobe.shape)

# Side-by-side scatter plots of the two coordinate pairs in `kobe`.
alpha = 0.01  # transparency of each plotted point
plt.figure(figsize=(10, 10))

# Left panel: loc_x / loc_y
plt.subplot(121)
plt.scatter(kobe.loc_x, kobe.loc_y, c='blue', alpha=alpha)
plt.title('loc_x and loc_y')

# Right panel: lon / lat (title now names the plotted column, `lon`)
plt.subplot(122)
plt.scatter(kobe.lon, kobe.lat, c='green', alpha=alpha)
plt.title('lon and lat')

# Distance: Euclidean norm of (loc_x, loc_y)
raw['dist'] = np.sqrt(raw['loc_x']**2 + raw['loc_y']**2)

# Angle: arctan(loc_y / loc_x) in radians where loc_x != 0, pi/2 otherwise.
# The column is created as float (pre-filled with the loc_x == 0 value) and the
# remaining rows are written through a single .loc call; chained assignment
# such as raw['angle'][mask] = ... may write to a temporary copy and is
# silently ignored under pandas Copy-on-Write.
loc_x_zero = raw['loc_x'] == 0
raw['angle'] = np.pi / 2
raw.loc[~loc_x_zero, 'angle'] = np.arctan(
    raw.loc[~loc_x_zero, 'loc_y'] / raw.loc[~loc_x_zero, 'loc_x']
)

# Remaining time as one number: minutes_remaining * 60 + seconds_remaining
raw['remaining_time'] = raw['minutes_remaining']*60 + raw['seconds_remaining']

# Print the distinct values (or value counts) of several columns of `kobe`.
print(kobe['action_type'].unique())
print(kobe['combined_shot_type'].value_counts())
print(kobe['shot_type'].unique())
print(kobe['season'].unique())

# Season format conversion: keep the piece after the first '-' as an integer.
# NOTE(review): presumably the labels look like "<start>-<end>" - confirm.
raw['season'] = raw['season'].map(lambda label: int(label.split('-')[1]))
print(raw['season'].unique())

# Two-column frame holding the matchup and opponent columns of `kobe`.
mat = pd.DataFrame(dict(matchup=kobe['matchup'], opponent=kobe['opponent']))

# Scatter the computed `dist` column against the provided `shot_distance`.
plt.figure(figsize=(5, 5))
plt.scatter(raw['dist'], raw['shot_distance'], c='b')
plt.title('dis and shot_distance')

# Row count per shot_zone_area value, then the number of distinct groups.
print(kobe['shot_zone_area'].value_counts())
gs = kobe.groupby('shot_zone_area')
print(len(gs))

plt.figure(figsize=(20, 10))


def scatter_plot_by_category(feat):
    """Scatter the shot locations in ``kobe``, one colour per value of *feat*.

    The module-level ``kobe`` frame is grouped by the column named *feat*;
    each group's ``loc_x`` / ``loc_y`` points are drawn on the current axes
    with a colour taken from an evenly spaced sample of the rainbow colormap.

    Args:
        feat: Name of the ``kobe`` column to group by.
    """
    alpha = 0.02
    groups = kobe.groupby(feat)
    palette = cm.rainbow(np.linspace(0, 1, len(groups)))
    # Iterating a groupby yields (key, sub-frame) pairs; only the frame is used.
    for (_, shots), colour in zip(groups, palette):
        plt.scatter(shots.loc_x, shots.loc_y, color=colour, alpha=alpha)
        
# One panel per zone feature in a 1x3 grid. Each panel is titled with the
# feature it actually shows (previously all three read 'shot_zone_basic').
plt.subplot(131)
scatter_plot_by_category('shot_zone_basic')
plt.title('shot_zone_basic')

plt.subplot(132)
scatter_plot_by_category('shot_zone_area')
plt.title('shot_zone_area')

plt.subplot(133)
scatter_plot_by_category('shot_zone_range')
plt.title('shot_zone_range')

# Columns removed from `raw` before encoding.
drops = ['shot_id', 'team_id', 'team_name', 'shot_zone_area',
         'shot_zone_basic', 'shot_zone_range', 'lat', 'lon',
         'matchup', 'shot_distance', 'minutes_remaining',
         'seconds_remaining', 'loc_x', 'loc_y', 'game_date',
         'game_id', 'game_event_id']
# Drop them in one call; `axis` must be passed by keyword because pandas 2.0
# no longer accepts it positionally.
raw = raw.drop(drops, axis=1)
    
# Show the value counts of combined_shot_type, then the first two rows of its
# one-hot (dummy) encoding as a preview.
print(raw['combined_shot_type'].value_counts())
print(
    pd.get_dummies(raw['combined_shot_type'], prefix='combined_shot_type').head(2)
)

# Replace each categorical column with its one-hot (dummy) columns, which are
# appended on the right and prefixed with the source column name.
categorical = ['action_type', 'combined_shot_type', 'shot_type',
               'opponent', 'period', 'season']
for var in categorical:
    dummies = pd.get_dummies(raw[var], prefix=var)
    # `axis` is passed by keyword: pandas 2.0 rejects it as a positional
    # argument for both concat and drop.
    raw = pd.concat([raw.drop(var, axis=1), dummies], axis=1)
    
# Training set: rows where shot_made_flag is present. The flag becomes the
# label and is removed from the feature frame.
train_kobe = raw[pd.notnull(raw['shot_made_flag'])]
train_label = train_kobe['shot_made_flag']
train_kobe = train_kobe.drop('shot_made_flag', axis=1)

# Test set: rows where shot_made_flag is missing.
test_kobe = raw[pd.isnull(raw['shot_made_flag'])]
# NOTE(review): `test_label` receives the test rows WITHOUT the flag column
# (i.e. features, not labels), while `test_kobe` still carries the all-null
# flag column. This looks like it was meant to be
# `test_kobe = test_kobe.drop(...)`; the bindings are kept as they were so
# existing users of either name are unaffected - confirm and rename.
test_label = test_kobe.drop('shot_made_flag', axis=1)
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章