科比生涯數據集分析與預測

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
# Read the full shot log into a DataFrame, then show its dimensions and
# the first five rows.
filename = 'data.csv'
raw = pd.read_csv(filename)
print(raw.shape)
raw.head(n=5)
(30697, 25)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
action_type combined_shot_type game_event_id game_id lat loc_x loc_y lon minutes_remaining period shot_type shot_zone_area shot_zone_basic shot_zone_range team_id team_name game_date matchup opponent shot_id
0 Jump Shot Jump Shot 10 20000012 33.9723 167 72 -118.1028 10 1 2PT Field Goal Right Side(R) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR 1
1 Jump Shot Jump Shot 12 20000012 34.0443 -157 0 -118.4268 10 1 2PT Field Goal Left Side(L) Mid-Range 8-16 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR 2
2 Jump Shot Jump Shot 35 20000012 33.9093 -101 135 -118.3708 7 1 2PT Field Goal Left Side Center(LC) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR 3
3 Jump Shot Jump Shot 43 20000012 33.8693 138 175 -118.1318 6 1 2PT Field Goal Right Side Center(RC) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR 4
4 Driving Dunk Shot Dunk 155 20000012 34.0443 0 0 -118.2698 6 2 2PT Field Goal Center(C) Restricted Area Less Than 8 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR 5

5 rows × 25 columns

pd.notnull(raw['shot_made_flag'])
0 False 1 True 2 True 3 True 4 True 5 True 6 True 7 False 8 True 9 True 10 True 11 True 12 True 13 True 14 True 15 True 16 False 17 True 18 True 19 False 20 True 21 True 22 True 23 True 24 True 25 True 26 True 27 True 28 True 29 True … 30667 True 30668 False 30669 True 30670 True 30671 True 30672 True 30673 True 30674 True 30675 True 30676 True 30677 True 30678 True 30679 True 30680 False 30681 True 30682 False 30683 True 30684 True 30685 True 30686 False 30687 True 30688 True 30689 True 30690 True 30691 True 30692 True 30693 False 30694 True 30695 True 30696 True Name: shot_made_flag, Length: 30697, dtype: bool
# Keep only the rows whose shot outcome is recorded (shot_made_flag is
# not NaN); the rows with a missing flag are excluded from exploration.
has_label = raw['shot_made_flag'].notnull()
kebo = raw[has_label]
print(kebo.shape)
kebo.head()
(25697, 25)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
action_type combined_shot_type game_event_id game_id lat loc_x loc_y lon minutes_remaining period shot_type shot_zone_area shot_zone_basic shot_zone_range team_id team_name game_date matchup opponent shot_id
1 Jump Shot Jump Shot 12 20000012 34.0443 -157 0 -118.4268 10 1 2PT Field Goal Left Side(L) Mid-Range 8-16 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR 2
2 Jump Shot Jump Shot 35 20000012 33.9093 -101 135 -118.3708 7 1 2PT Field Goal Left Side Center(LC) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR 3
3 Jump Shot Jump Shot 43 20000012 33.8693 138 175 -118.1318 6 1 2PT Field Goal Right Side Center(RC) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR 4
4 Driving Dunk Shot Dunk 155 20000012 34.0443 0 0 -118.2698 6 2 2PT Field Goal Center(C) Restricted Area Less Than 8 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR 5
5 Jump Shot Jump Shot 244 20000012 34.0553 -145 -11 -118.4148 9 3 2PT Field Goal Left Side(L) Mid-Range 8-16 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR 6

5 rows × 25 columns

# Plot the two coordinate pairs side by side: (loc_x, loc_y) on the left
# and (lon, lat) on the right.
alpha = 0.02  # very low opacity so that dense regions stand out
plt.figure(figsize=(15, 15))

plt.subplot(121)
# Single-letter colour codes must be lowercase; recent Matplotlib
# releases reject 'R' / 'B'.
plt.scatter(kebo.loc_x, kebo.loc_y, color='r', alpha=alpha)
plt.title('loc_x and loc_y')

plt.subplot(122)
plt.scatter(kebo.lon, kebo.lat, color='b', alpha=alpha)
plt.title('lat and lon')
Text(0.5,1,’lat and lon’) ![png](output_4_1.png)
# Euclidean distance of every shot from the origin of the
# (loc_x, loc_y) coordinate system.
squared_distance = raw['loc_x'] ** 2 + raw['loc_y'] ** 2
raw['dist'] = np.sqrt(squared_distance)
raw.head()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
action_type combined_shot_type game_event_id game_id lat loc_x loc_y lon minutes_remaining period shot_zone_area shot_zone_basic shot_zone_range team_id team_name game_date matchup opponent shot_id dist
0 Jump Shot Jump Shot 10 20000012 33.9723 167 72 -118.1028 10 1 Right Side(R) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR 1 181.859836
1 Jump Shot Jump Shot 12 20000012 34.0443 -157 0 -118.4268 10 1 Left Side(L) Mid-Range 8-16 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR 2 157.000000
2 Jump Shot Jump Shot 35 20000012 33.9093 -101 135 -118.3708 7 1 Left Side Center(LC) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR 3 168.600119
3 Jump Shot Jump Shot 43 20000012 33.8693 138 175 -118.1318 6 1 Right Side Center(RC) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR 4 222.865430
4 Driving Dunk Shot Dunk 155 20000012 34.0443 0 0 -118.2698 6 2 Center(C) Restricted Area Less Than 8 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR 5 0.000000

5 rows × 26 columns

# Polar angle of each shot: arctan(loc_y / loc_x).
# Rows with loc_x == 0 would divide by zero, so they are set to pi/2.
loc_x_zero = raw['loc_x'] == 0
# Start from a float column so the angles are not truncated to integers.
raw['angle'] = 0.0
# `~loc_x_zero` is the boolean negation of the mask.
# Use .loc so the assignment writes into `raw` itself rather than into a
# temporary copy (which is what chained indexing may do).
raw.loc[~loc_x_zero, 'angle'] = np.arctan(
    raw.loc[~loc_x_zero, 'loc_y'] / raw.loc[~loc_x_zero, 'loc_x']
)
raw.loc[loc_x_zero, 'angle'] = np.pi / 2
/home/heres/.conda/envs/GPU_test/lib/python3.6/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy “”” /home/heres/.conda/envs/GPU_test/lib/python3.6/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
# Collapse the minutes/seconds pair into one "seconds remaining" column.
raw['remaining_time'] = raw['seconds_remaining'] + 60 * raw['minutes_remaining']
# Show the distinct values present in each shot-description column.
for column in ('action_type', 'combined_shot_type', 'shot_type'):
    print(kebo[column].unique())
print(kebo['shot_type'].value_counts())
[‘Jump Shot’ ‘Driving Dunk Shot’ ‘Layup Shot’ ‘Running Jump Shot’ ‘Reverse Dunk Shot’ ‘Slam Dunk Shot’ ‘Driving Layup Shot’ ‘Turnaround Jump Shot’ ‘Reverse Layup Shot’ ‘Tip Shot’ ‘Running Hook Shot’ ‘Alley Oop Dunk Shot’ ‘Dunk Shot’ ‘Alley Oop Layup shot’ ‘Running Dunk Shot’ ‘Driving Finger Roll Shot’ ‘Running Layup Shot’ ‘Finger Roll Shot’ ‘Fadeaway Jump Shot’ ‘Follow Up Dunk Shot’ ‘Hook Shot’ ‘Turnaround Hook Shot’ ‘Jump Hook Shot’ ‘Running Finger Roll Shot’ ‘Jump Bank Shot’ ‘Turnaround Finger Roll Shot’ ‘Hook Bank Shot’ ‘Driving Hook Shot’ ‘Running Tip Shot’ ‘Running Reverse Layup Shot’ ‘Driving Finger Roll Layup Shot’ ‘Fadeaway Bank shot’ ‘Pullup Jump shot’ ‘Finger Roll Layup Shot’ ‘Turnaround Fadeaway shot’ ‘Driving Reverse Layup Shot’ ‘Driving Slam Dunk Shot’ ‘Step Back Jump shot’ ‘Turnaround Bank shot’ ‘Reverse Slam Dunk Shot’ ‘Floating Jump shot’ ‘Putback Slam Dunk Shot’ ‘Running Bank shot’ ‘Driving Bank shot’ ‘Driving Jump shot’ ‘Putback Layup Shot’ ‘Putback Dunk Shot’ ‘Running Finger Roll Layup Shot’ ‘Pullup Bank shot’ ‘Running Slam Dunk Shot’ ‘Cutting Layup Shot’ ‘Driving Floating Jump Shot’ ‘Running Pull-Up Jump Shot’ ‘Tip Layup Shot’ ‘Driving Floating Bank Jump Shot’] [‘Jump Shot’ ‘Dunk’ ‘Layup’ ‘Tip Shot’ ‘Hook Shot’ ‘Bank Shot’] [‘2PT Field Goal’ ‘3PT Field Goal’] 2PT Field Goal 20285 3PT Field Goal 5412 Name: shot_type, dtype: int64
raw['season'].unique()
array([‘2000-01’, ‘2001-02’, ‘2002-03’, ‘2003-04’, ‘2004-05’, ‘2005-06’, ‘2006-07’, ‘2007-08’, ‘2008-09’, ‘2009-10’, ‘2010-11’, ‘2011-12’, ‘2012-13’, ‘2013-14’, ‘2014-15’, ‘2015-16’, ‘1996-97’, ‘1997-98’, ‘1998-99’, ‘1999-00’], dtype=object)
# Replace each season label such as '2000-01' with the integer that
# follows the hyphen (here 1).
raw['season'] = [int(label.split('-')[1]) for label in raw['season']]
raw['season'].unique()
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 97, 98, 99, 0])
# Print the distinct values of the two team columns.
for column in ('team_id', 'team_name'):
    print(kebo[column].unique())
[1610612747] [‘Los Angeles Lakers’]
pd.DataFrame({'matchup':kebo.matchup,'opponent':kebo.opponent})[0:5]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
matchup opponent
1 LAL @ POR POR
2 LAL @ POR POR
3 LAL @ POR POR
4 LAL @ POR POR
5 LAL @ POR POR
# Scatter the computed `dist` column against the dataset's own
# `shot_distance` column.
plt.figure(figsize=(5, 5))
plt.scatter(raw['dist'], raw['shot_distance'], color='blue')
plt.title('dist and shot_distance')
Text(0.5,1,’dist and shot_distance’) ![png](output_13_1.png)
# Small toy frame used below to demonstrate get_dummies / groupby.
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'c'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
key1 key2 data1 data2
0 a one -1.208815 0.858278
1 a two 0.189239 0.536879
2 b one -1.188808 0.405909
3 b two -0.231954 -0.137537
4 c one 0.358366 -1.643352
# DataFrame.drop(label, axis=1) removes a column; without `axis` it
# removes a row (e.g. df.drop(4) drops the row labelled 4).
# Append the one-hot encoding of key1 as extra columns. `axis` is passed
# by keyword because newer pandas no longer accepts it positionally.
df = pd.concat([df, pd.get_dummies(df['key1'], prefix='key1')], axis=1)
# Preview the frame without the original column. The result is not
# assigned, so `df` itself still contains key1.
df.drop('key1', axis=1)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
key2 data1 data2 key1_a key1_b key1_c
0 one -1.208815 0.858278 1 0 0
1 two 0.189239 0.536879 1 0 0
2 one -1.188808 0.405909 0 1 0
3 two -0.231954 -0.137537 0 1 0
4 one 0.358366 -1.643352 0 0 1
# Iterating a GroupBy yields (group key, sub-frame) tuples.
ss = df.groupby('key1')
# First pass: print only the sub-frame of each group.
for _, group_frame in ss:
    print(group_frame)

# Second pass: print the whole (key, sub-frame) tuple.
for key_and_frame in ss:
    print(key_and_frame)
key1 key2 data1 data2 0 a one 1.084443 -0.806561 1 a two -0.191386 2.743989 key1 key2 data1 data2 2 b one 0.302269 -0.560596 3 b two -0.213348 0.962007 key1 key2 data1 data2 4 c one -0.893631 -0.193283 (‘a’, key1 key2 data1 data2 0 a one 1.084443 -0.806561 1 a two -0.191386 2.743989) (‘b’, key1 key2 data1 data2 2 b one 0.302269 -0.560596 3 b two -0.213348 0.962007) (‘c’, key1 key2 data1 data2 4 c one -0.893631 -0.193283)
df['data2'].groupby(df['key1']).sum()
key1 a -2.167849 b -1.954537 Name: data2, dtype: float64
import matplotlib.cm as cm
plt.figure(figsize=(20,10))

def scatter_plot_by_category(feat, alpha=0.1):
    """Scatter the shots in `kebo`, one colour per value of column `feat`.

    The points are drawn on the current matplotlib axes at
    (loc_x, loc_y). Colours are taken evenly from the rainbow colormap,
    one per group.

    Args:
        feat: name of the `kebo` column to group by.
        alpha: opacity of the points (defaults to 0.1).
    """
    gs = kebo.groupby(feat)
    # One RGBA colour per group, spread evenly over the colormap.
    cs = cm.rainbow(np.linspace(0, 1, len(gs)))
    for (_, group), c in zip(gs, cs):
        plt.scatter(group.loc_x, group.loc_y, color=c, alpha=alpha)

# Three panels, one per zone column, each coloured by that column's values.
plt.subplot(131)
scatter_plot_by_category('shot_zone_area')
plt.title('shot_zone_area')

plt.subplot(132)
scatter_plot_by_category('shot_zone_basic')
plt.title('shot_zone_basic')

plt.subplot(133)
scatter_plot_by_category('shot_zone_range')
plt.title('shot_zone_range')
Text(0.5,1,’shot_zone_range’) ![png](output_18_1.png)
# One-hot encoding: show how often each combined_shot_type occurs, then
# preview the first two rows of its indicator columns.
shot_type_counts = raw['combined_shot_type'].value_counts()
print(shot_type_counts)
pd.get_dummies(raw['combined_shot_type'], prefix='combined_shot_type').head(2)
Jump Shot 23485 Layup 5448 Dunk 1286 Tip Shot 184 Hook Shot 153 Bank Shot 141 Name: combined_shot_type, dtype: int64
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
combined_shot_type_Bank Shot combined_shot_type_Dunk combined_shot_type_Hook Shot combined_shot_type_Jump Shot combined_shot_type_Layup combined_shot_type_Tip Shot
0 0 0 0 1 0 0
1 0 0 0 1 0 0
from sklearn import preprocessing

# Fit a OneHotEncoder on four 3-feature rows, then encode a second set
# of rows and display the dense result.
fit_rows = [[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]
query_rows = [[0, 0, 2], [1, 1, 0], [0, 2, 1], [1, 0, 2]]
enc = preprocessing.OneHotEncoder()
enc.fit(fit_rows)
enc.transform(query_rows).toarray()
array([[1., 0., 1., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 1., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 1., 0., 1., 0., 0.],
       [0., 1., 1., 0., 0., 0., 0., 1., 0.]])
# Reload the data and drop the columns that are not used as model inputs.
# NOTE: re-reading the CSV replaces `raw`, so any columns added to the
# previous `raw` (dist, angle, remaining_time, integer season) are not
# present from here on.
filename = 'data.csv'
raw = pd.read_csv(filename)
drops = ['shot_id', 'team_id', 'team_name', 'shot_zone_area', 'shot_zone_range', 'shot_zone_basic',
         'matchup', 'lon', 'lat', 'seconds_remaining', 'minutes_remaining',
         'shot_distance', 'loc_x', 'loc_y', 'game_event_id', 'game_id', 'game_date']
# Drop everything in one call; `axis` is given by keyword because newer
# pandas no longer accepts it positionally.
raw = raw.drop(drops, axis=1)
print(raw['combined_shot_type'].value_counts())
pd.get_dummies(raw['combined_shot_type'], prefix='combined_shot_type')[0:2]
# Build the modelling dataset: replace every categorical column with its
# one-hot indicator columns, then split the rows by label availability.
categorical_vars = ['action_type', 'combined_shot_type', 'shot_type', 'opponent', 'period', 'season']
for var in categorical_vars:
    # `axis` is passed by keyword; newer pandas rejects it positionally.
    raw = pd.concat([raw, pd.get_dummies(raw[var], prefix=var)], axis=1)
    raw = raw.drop(var, axis=1)

# Rows with a known outcome form the training set ...
train_kobe = raw[pd.notnull(raw['shot_made_flag'])]
train_label = train_kobe['shot_made_flag']
train_kobe = train_kobe.drop('shot_made_flag', axis=1)

# ... and rows with a missing outcome are the ones left to predict.
test_kobe = raw[pd.isnull(raw['shot_made_flag'])]
test_kobe = test_kobe.drop('shot_made_flag', axis=1)
#建一個模型
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix,log_loss
import time

import numpy  as np
# Geometric progression: five values evenly spaced on a log scale
# between 10**0 and 10**2, printed as floats and then truncated to ints.
log_spaced = np.logspace(0, 2, num=5)
print(log_spaced)
range_m = log_spaced.astype(int)
range_m
Jump Shot    23485
Layup         5448
Dunk          1286
Tip Shot       184
Hook Shot      153
Bank Shot      141
Name: combined_shot_type, dtype: int64
[  1.           3.16227766  10.          31.6227766  100.        ]





array([  1,   3,  10,  31, 100])
# `sklearn.cross_validation` was removed from scikit-learn; KFold now
# lives in `sklearn.model_selection` and is used through `.split()`.
from sklearn import model_selection

# Grid-search n_estimators with 10-fold cross-validation, scoring each
# candidate by its mean log loss (lower is better).
print('find best n_estimators for RandomForestClassfier...')
n_folds = 10
min_score = 100000
best_n = 0
scores_n = []
range_n = np.linspace(1, 100, num=10).astype(int)
for n in range_n:
    print('the number of trees: {0}'.format(n))
    t1 = time.time()

    rfc_score = 0.
    # The model being evaluated for this candidate.
    rfc = RandomForestClassifier(n_estimators=n)
    folds = model_selection.KFold(n_splits=n_folds, shuffle=True)
    for train_k, test_k in folds.split(train_kobe):
        rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])
        # log_loss is defined on predicted probabilities; feeding it hard
        # 0/1 predictions penalises every mistake with the clipping
        # maximum and yields meaningless scores.
        pred = rfc.predict_proba(train_kobe.iloc[test_k])
        rfc_score += log_loss(train_label.iloc[test_k], pred,
                              labels=rfc.classes_) / n_folds
    scores_n.append(rfc_score)
    print(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_n = n
    t2 = time.time()
    print('Done processing {0} trees ({1:.3f}sec)'.format(n, t2 - t1))
print(best_n, min_score)
find best n_estimators for RandomForestClassfier...
the number of trees: 1
13.919386979412682
Done processing 1 trees (0.726sec)
the number of trees: 12
13.389844271562911
Done processing 12 trees (6.008sec)
the number of trees: 23
13.180159610972076
Done processing 23 trees (11.043sec)
the number of trees: 34
13.219150771746676
Done processing 34 trees (18.443sec)
the number of trees: 45
13.043065195199537
Done processing 45 trees (22.152sec)
the number of trees: 56
13.057856481381911
Done processing 56 trees (27.636sec)
the number of trees: 67
13.055167436843206
Done processing 67 trees (32.039sec)
the number of trees: 78
13.051153915112355
Done processing 78 trees (38.372sec)
the number of trees: 89
13.048438999530031
Done processing 89 trees (44.605sec)
the number of trees: 100
13.114318256666222
Done processing 100 trees (49.886sec)
45 13.043065195199537
# KFold from `sklearn.model_selection` (the old `cross_validation`
# module was removed from scikit-learn); referenced by qualified name so
# this cell does not depend on which `KFold` is bound globally.
from sklearn import model_selection

# Grid-search max_depth (with n_estimators fixed to best_n) using
# 10-fold cross-validation and mean log loss (lower is better).
print('Finding best max_depth for RandomForestClassifier...')
n_folds = 10
min_score = 100000
best_m = 0
scores_m = []
range_m = np.logspace(0, 2, num=3).astype(int)
for m in range_m:
    print("the max depth : {0}".format(m))
    t1 = time.time()

    rfc_score = 0.
    rfc = RandomForestClassifier(max_depth=m, n_estimators=best_n)
    folds = model_selection.KFold(n_splits=n_folds, shuffle=True)
    for train_k, test_k in folds.split(train_kobe):
        rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])
        # Score on class probabilities: log loss is not meaningful for
        # hard 0/1 predictions.
        pred = rfc.predict_proba(train_kobe.iloc[test_k])
        rfc_score += log_loss(train_label.iloc[test_k], pred,
                              labels=rfc.classes_) / n_folds
    scores_m.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_m = m

    t2 = time.time()
    # `m` is a depth, not a tree count, so the message says so.
    print('Done processing max depth {0} ({1:.3f}sec)'.format(m, t2 - t1))
print(best_m, min_score)
Finding best max_depth for RandomForestClassifier...
the max depth : 1
Done processing 1 trees (2.389sec)
the max depth : 10
Done processing 10 trees (6.044sec)
the max depth : 100
Done processing 100 trees (21.773sec)
10 11.06186210729279
# Cross-validation score curves: left panel versus number of trees,
# right panel versus maximum depth.
plt.figure(figsize=(10, 5))

plt.subplot(1, 2, 1)
plt.plot(range_n, scores_n)
plt.xlabel('number of trees')
plt.ylabel('score')

plt.subplot(1, 2, 2)
plt.plot(range_m, scores_m)
plt.ylabel('score')
plt.xlabel('max depth')
Text(0.5,0,'max depth')

圖：交叉驗證分數隨樹的數量（左）與最大深度（右）的變化

# Fit the tuned forest on all labelled rows and report the share of
# those same rows it predicts correctly (i.e. accuracy on the data the
# model was trained on).
model = RandomForestClassifier(n_estimators=best_n, max_depth=best_m)
model.fit(train_kobe, train_label)
pred = model.predict(train_kobe)
count = sum(1 for actual, guessed in zip(train_label, pred) if actual == guessed)
print("acc is {0}".format(count/len(train_label)))
acc is 0.6850215978518893
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows, fit on the remaining 70%, and
# report accuracy on the held-out part.
X_train, X_test, y_train, y_test = train_test_split(
    train_kobe, train_label, test_size=0.3, random_state=0)
model = RandomForestClassifier(n_estimators=best_n, max_depth=best_m)
model.fit(X_train, y_train)
pre = model.predict(X_test)

# Compare against `pre`, the predictions for X_test. Any other
# prediction array would not be aligned with y_test.
count = sum(1 for actual, guessed in zip(y_test, pre) if actual == guessed)
print("acc is {0}".format(count / len(pre)))
acc is 0.524254215304799
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章