import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
# Load the shot log from a CSV file in the current working directory.
filename = 'data.csv'
raw = pd.read_csv(filename)
# Report (rows, columns) and preview the first five rows.
print(raw.shape)
raw.head()
(30697, 25)
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
action_type |
combined_shot_type |
game_event_id |
game_id |
lat |
loc_x |
loc_y |
lon |
minutes_remaining |
period |
… |
shot_type |
shot_zone_area |
shot_zone_basic |
shot_zone_range |
team_id |
team_name |
game_date |
matchup |
opponent |
shot_id |
0 |
Jump Shot |
Jump Shot |
10 |
20000012 |
33.9723 |
167 |
72 |
-118.1028 |
10 |
1 |
… |
2PT Field Goal |
Right Side(R) |
Mid-Range |
16-24 ft. |
1610612747 |
Los Angeles Lakers |
2000-10-31 |
LAL @ POR |
POR |
1 |
1 |
Jump Shot |
Jump Shot |
12 |
20000012 |
34.0443 |
-157 |
0 |
-118.4268 |
10 |
1 |
… |
2PT Field Goal |
Left Side(L) |
Mid-Range |
8-16 ft. |
1610612747 |
Los Angeles Lakers |
2000-10-31 |
LAL @ POR |
POR |
2 |
2 |
Jump Shot |
Jump Shot |
35 |
20000012 |
33.9093 |
-101 |
135 |
-118.3708 |
7 |
1 |
… |
2PT Field Goal |
Left Side Center(LC) |
Mid-Range |
16-24 ft. |
1610612747 |
Los Angeles Lakers |
2000-10-31 |
LAL @ POR |
POR |
3 |
3 |
Jump Shot |
Jump Shot |
43 |
20000012 |
33.8693 |
138 |
175 |
-118.1318 |
6 |
1 |
… |
2PT Field Goal |
Right Side Center(RC) |
Mid-Range |
16-24 ft. |
1610612747 |
Los Angeles Lakers |
2000-10-31 |
LAL @ POR |
POR |
4 |
4 |
Driving Dunk Shot |
Dunk |
155 |
20000012 |
34.0443 |
0 |
0 |
-118.2698 |
6 |
2 |
… |
2PT Field Goal |
Center(C) |
Restricted Area |
Less Than 8 ft. |
1610612747 |
Los Angeles Lakers |
2000-10-31 |
LAL @ POR |
POR |
5 |
5 rows × 25 columns
# Boolean mask over the rows: True where shot_made_flag holds a value,
# False where it is missing.
pd.notnull(raw['shot_made_flag'])
0 False
1 True
2 True
3 True
4 True
5 True
6 True
7 False
8 True
9 True
10 True
11 True
12 True
13 True
14 True
15 True
16 False
17 True
18 True
19 False
20 True
21 True
22 True
23 True
24 True
25 True
26 True
27 True
28 True
29 True
…
30667 True
30668 False
30669 True
30670 True
30671 True
30672 True
30673 True
30674 True
30675 True
30676 True
30677 True
30678 True
30679 True
30680 False
30681 True
30682 False
30683 True
30684 True
30685 True
30686 False
30687 True
30688 True
30689 True
30690 True
30691 True
30692 True
30693 False
30694 True
30695 True
30696 True
Name: shot_made_flag, Length: 30697, dtype: bool
# Keep only the rows whose shot_made_flag is present; these are the rows
# used for the exploratory plots below.
has_label = raw['shot_made_flag'].notnull()
kebo = raw[has_label]
print(kebo.shape)
kebo.head()
(25697, 25)
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
action_type |
combined_shot_type |
game_event_id |
game_id |
lat |
loc_x |
loc_y |
lon |
minutes_remaining |
period |
… |
shot_type |
shot_zone_area |
shot_zone_basic |
shot_zone_range |
team_id |
team_name |
game_date |
matchup |
opponent |
shot_id |
1 |
Jump Shot |
Jump Shot |
12 |
20000012 |
34.0443 |
-157 |
0 |
-118.4268 |
10 |
1 |
… |
2PT Field Goal |
Left Side(L) |
Mid-Range |
8-16 ft. |
1610612747 |
Los Angeles Lakers |
2000-10-31 |
LAL @ POR |
POR |
2 |
2 |
Jump Shot |
Jump Shot |
35 |
20000012 |
33.9093 |
-101 |
135 |
-118.3708 |
7 |
1 |
… |
2PT Field Goal |
Left Side Center(LC) |
Mid-Range |
16-24 ft. |
1610612747 |
Los Angeles Lakers |
2000-10-31 |
LAL @ POR |
POR |
3 |
3 |
Jump Shot |
Jump Shot |
43 |
20000012 |
33.8693 |
138 |
175 |
-118.1318 |
6 |
1 |
… |
2PT Field Goal |
Right Side Center(RC) |
Mid-Range |
16-24 ft. |
1610612747 |
Los Angeles Lakers |
2000-10-31 |
LAL @ POR |
POR |
4 |
4 |
Driving Dunk Shot |
Dunk |
155 |
20000012 |
34.0443 |
0 |
0 |
-118.2698 |
6 |
2 |
… |
2PT Field Goal |
Center(C) |
Restricted Area |
Less Than 8 ft. |
1610612747 |
Los Angeles Lakers |
2000-10-31 |
LAL @ POR |
POR |
5 |
5 |
Jump Shot |
Jump Shot |
244 |
20000012 |
34.0553 |
-145 |
-11 |
-118.4148 |
9 |
3 |
… |
2PT Field Goal |
Left Side(L) |
Mid-Range |
8-16 ft. |
1610612747 |
Los Angeles Lakers |
2000-10-31 |
LAL @ POR |
POR |
6 |
5 rows × 25 columns
# Side-by-side scatter plots of the two coordinate pairs in the data
# (loc_x/loc_y and lon/lat) for the labelled shots.
alpha = 0.02  # very low opacity so that dense regions stand out
plt.figure(figsize=(15, 15))

# Left panel: court coordinates.
# Single-letter colour codes must be lowercase; uppercase 'R'/'B' is
# rejected by current Matplotlib releases.
plt.subplot(121)
plt.scatter(kebo.loc_x, kebo.loc_y, color='r', alpha=alpha)
plt.title('loc_x and loc_y')

# Right panel: longitude / latitude.
plt.subplot(122)
plt.scatter(kebo.lon, kebo.lat, color='b', alpha=alpha)
plt.title('lat and lon')
Text(0.5,1,’lat and lon’)
![png](output_4_1.png)
# Euclidean distance of each shot from the (0, 0) origin of the
# loc_x / loc_y coordinate system.
raw['dist'] = np.sqrt(raw['loc_x']**2+raw['loc_y']**2)
raw.head()
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
action_type |
combined_shot_type |
game_event_id |
game_id |
lat |
loc_x |
loc_y |
lon |
minutes_remaining |
period |
… |
shot_zone_area |
shot_zone_basic |
shot_zone_range |
team_id |
team_name |
game_date |
matchup |
opponent |
shot_id |
dist |
0 |
Jump Shot |
Jump Shot |
10 |
20000012 |
33.9723 |
167 |
72 |
-118.1028 |
10 |
1 |
… |
Right Side(R) |
Mid-Range |
16-24 ft. |
1610612747 |
Los Angeles Lakers |
2000-10-31 |
LAL @ POR |
POR |
1 |
181.859836 |
1 |
Jump Shot |
Jump Shot |
12 |
20000012 |
34.0443 |
-157 |
0 |
-118.4268 |
10 |
1 |
… |
Left Side(L) |
Mid-Range |
8-16 ft. |
1610612747 |
Los Angeles Lakers |
2000-10-31 |
LAL @ POR |
POR |
2 |
157.000000 |
2 |
Jump Shot |
Jump Shot |
35 |
20000012 |
33.9093 |
-101 |
135 |
-118.3708 |
7 |
1 |
… |
Left Side Center(LC) |
Mid-Range |
16-24 ft. |
1610612747 |
Los Angeles Lakers |
2000-10-31 |
LAL @ POR |
POR |
3 |
168.600119 |
3 |
Jump Shot |
Jump Shot |
43 |
20000012 |
33.8693 |
138 |
175 |
-118.1318 |
6 |
1 |
… |
Right Side Center(RC) |
Mid-Range |
16-24 ft. |
1610612747 |
Los Angeles Lakers |
2000-10-31 |
LAL @ POR |
POR |
4 |
222.865430 |
4 |
Driving Dunk Shot |
Dunk |
155 |
20000012 |
34.0443 |
0 |
0 |
-118.2698 |
6 |
2 |
… |
Center(C) |
Restricted Area |
Less Than 8 ft. |
1610612747 |
Los Angeles Lakers |
2000-10-31 |
LAL @ POR |
POR |
5 |
0.000000 |
5 rows × 26 columns
# Angle of each shot relative to the x axis: arctan(loc_y / loc_x).
# Rows with loc_x == 0 would divide by zero, so they are fixed at pi/2.
loc_x_zero = raw['loc_x'] == 0

# Start with a float column holding the loc_x == 0 value everywhere, then
# overwrite the remaining rows. Using .loc on the frame itself (instead of
# chained raw['angle'][mask] = ...) guarantees the write reaches `raw`
# and avoids SettingWithCopyWarning.
raw['angle'] = np.pi / 2
raw.loc[~loc_x_zero, 'angle'] = np.arctan(
    raw.loc[~loc_x_zero, 'loc_y'] / raw.loc[~loc_x_zero, 'loc_x']
)
/home/heres/.conda/envs/GPU_test/lib/python3.6/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
“””
/home/heres/.conda/envs/GPU_test/lib/python3.6/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
# Collapse the minutes/seconds clock columns into a single number of seconds.
raw['remaining_time'] = raw['minutes_remaining']*60+raw['seconds_remaining']
# Inspect the distinct values of the shot-description columns, then the
# number of rows per shot_type value.
print(kebo.action_type.unique())
print(kebo.combined_shot_type.unique())
print(kebo['shot_type'].unique())
print(kebo.shot_type.value_counts())
[‘Jump Shot’ ‘Driving Dunk Shot’ ‘Layup Shot’ ‘Running Jump Shot’
‘Reverse Dunk Shot’ ‘Slam Dunk Shot’ ‘Driving Layup Shot’
‘Turnaround Jump Shot’ ‘Reverse Layup Shot’ ‘Tip Shot’
‘Running Hook Shot’ ‘Alley Oop Dunk Shot’ ‘Dunk Shot’
‘Alley Oop Layup shot’ ‘Running Dunk Shot’ ‘Driving Finger Roll Shot’
‘Running Layup Shot’ ‘Finger Roll Shot’ ‘Fadeaway Jump Shot’
‘Follow Up Dunk Shot’ ‘Hook Shot’ ‘Turnaround Hook Shot’ ‘Jump Hook Shot’
‘Running Finger Roll Shot’ ‘Jump Bank Shot’ ‘Turnaround Finger Roll Shot’
‘Hook Bank Shot’ ‘Driving Hook Shot’ ‘Running Tip Shot’
‘Running Reverse Layup Shot’ ‘Driving Finger Roll Layup Shot’
‘Fadeaway Bank shot’ ‘Pullup Jump shot’ ‘Finger Roll Layup Shot’
‘Turnaround Fadeaway shot’ ‘Driving Reverse Layup Shot’
‘Driving Slam Dunk Shot’ ‘Step Back Jump shot’ ‘Turnaround Bank shot’
‘Reverse Slam Dunk Shot’ ‘Floating Jump shot’ ‘Putback Slam Dunk Shot’
‘Running Bank shot’ ‘Driving Bank shot’ ‘Driving Jump shot’
‘Putback Layup Shot’ ‘Putback Dunk Shot’ ‘Running Finger Roll Layup Shot’
‘Pullup Bank shot’ ‘Running Slam Dunk Shot’ ‘Cutting Layup Shot’
‘Driving Floating Jump Shot’ ‘Running Pull-Up Jump Shot’ ‘Tip Layup Shot’
‘Driving Floating Bank Jump Shot’]
[‘Jump Shot’ ‘Dunk’ ‘Layup’ ‘Tip Shot’ ‘Hook Shot’ ‘Bank Shot’]
[‘2PT Field Goal’ ‘3PT Field Goal’]
2PT Field Goal 20285
3PT Field Goal 5412
Name: shot_type, dtype: int64
# List the distinct season labels before they are converted to integers.
raw['season'].unique()
array([‘2000-01’, ‘2001-02’, ‘2002-03’, ‘2003-04’, ‘2004-05’, ‘2005-06’,
‘2006-07’, ‘2007-08’, ‘2008-09’, ‘2009-10’, ‘2010-11’, ‘2011-12’,
‘2012-13’, ‘2013-14’, ‘2014-15’, ‘2015-16’, ‘1996-97’, ‘1997-98’,
‘1998-99’, ‘1999-00’], dtype=object)
# Replace each 'YYYY-YY' season label with the integer after the dash
# (e.g. '2000-01' -> 1, '1999-00' -> 0).
# NOTE(review): the resulting codes are not chronological (97, 98, 99, 0,
# 1, ...); fine as a category label, misleading if used as a number.
raw22 = raw['season'].apply(lambda x: int(x.split('-')[1]))
raw['season']=raw22
raw['season'].unique()
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 97,
98, 99, 0])
# Show the distinct team identifiers and names present in the labelled rows.
print(kebo.team_id.unique())
print(kebo['team_name'].unique())
[1610612747]
[‘Los Angeles Lakers’]
# Put matchup and opponent side by side (first five rows) to compare them.
pd.DataFrame({'matchup':kebo.matchup,'opponent':kebo.opponent})[0:5]
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
matchup |
opponent |
1 |
LAL @ POR |
POR |
2 |
LAL @ POR |
POR |
3 |
LAL @ POR |
POR |
4 |
LAL @ POR |
POR |
5 |
LAL @ POR |
POR |
# Scatter the computed `dist` column against the dataset's own
# shot_distance column to compare the two.
plt.figure(figsize=(5,5))
plt.scatter(raw.dist,raw.shot_distance,color='blue')
plt.title('dist and shot_distance')
Text(0.5,1,’dist and shot_distance’)
![png](output_13_1.png)
# Small demo frame used to illustrate get_dummies and groupby below:
# two string key columns and two columns of random normal values.
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'c'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})
df
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
key1 |
key2 |
data1 |
data2 |
0 |
a |
one |
-1.208815 |
0.858278 |
1 |
a |
two |
0.189239 |
0.536879 |
2 |
b |
one |
-1.188808 |
0.405909 |
3 |
b |
two |
-0.231954 |
-0.137537 |
4 |
c |
one |
0.358366 |
-1.643352 |
# Append one indicator column per distinct key1 value (key1_a, key1_b, ...).
# `axis` is passed by keyword: pandas 2.x no longer accepts it positionally
# for concat/drop.
df = pd.concat([df, pd.get_dummies(df['key1'], prefix='key1')], axis=1)
# Display the frame without the original key1 column. The result is not
# assigned, so `df` itself still contains key1 afterwards.
df.drop('key1', axis=1)
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
key2 |
data1 |
data2 |
key1_a |
key1_b |
key1_c |
0 |
one |
-1.208815 |
0.858278 |
1 |
0 |
0 |
1 |
two |
0.189239 |
0.536879 |
1 |
0 |
0 |
2 |
one |
-1.188808 |
0.405909 |
0 |
1 |
0 |
3 |
two |
-0.231954 |
-0.137537 |
0 |
1 |
0 |
4 |
one |
0.358366 |
-1.643352 |
0 |
0 |
1 |
# Group the demo frame by key1 and iterate over the groups twice:
# first printing only each group's sub-frame, then the full
# (key, sub-frame) tuple that iteration yields.
ss = df.groupby('key1')
for _, frame in ss:
    print(frame)
for pair in ss:
    print(pair)
key1 key2 data1 data2
0 a one 1.084443 -0.806561
1 a two -0.191386 2.743989
key1 key2 data1 data2
2 b one 0.302269 -0.560596
3 b two -0.213348 0.962007
key1 key2 data1 data2
4 c one -0.893631 -0.193283
(‘a’, key1 key2 data1 data2
0 a one 1.084443 -0.806561
1 a two -0.191386 2.743989)
(‘b’, key1 key2 data1 data2
2 b one 0.302269 -0.560596
3 b two -0.213348 0.962007)
(‘c’, key1 key2 data1 data2
4 c one -0.893631 -0.193283)
# Sum of data2 within each key1 group.
df['data2'].groupby(df['key1']).sum()
key1
a -2.167849
b -1.954537
Name: data2, dtype: float64
import matplotlib.cm as cm

plt.figure(figsize=(20, 10))


def scatter_plot_by_category(feat, alpha=0.1):
    """Scatter the labelled shots on the current axes, one colour per category.

    Groups ``kebo`` by the column ``feat`` and draws loc_x against loc_y
    for every group, taking the colours at evenly spaced positions of the
    rainbow colormap.

    Parameters
    ----------
    feat : str
        Name of the ``kebo`` column to group by.
    alpha : float, optional
        Marker opacity passed to ``plt.scatter``.
    """
    groups = kebo.groupby(feat)
    colors = cm.rainbow(np.linspace(0, 1, len(groups)))
    # Iterating a groupby yields (key, sub-frame) pairs; only the frame is
    # needed for plotting.
    for (_, shots), color in zip(groups, colors):
        plt.scatter(shots.loc_x, shots.loc_y, color=color, alpha=alpha)


# One panel per zone column.
plt.subplot(131)
scatter_plot_by_category('shot_zone_area')
plt.title('shot_zone_area')

plt.subplot(132)
scatter_plot_by_category('shot_zone_basic')
plt.title('shot_zone_basic')

plt.subplot(133)
scatter_plot_by_category('shot_zone_range')
plt.title('shot_zone_range')
Text(0.5,1,’shot_zone_range’)
![png](output_18_1.png)
# Row count per combined_shot_type value, followed by a two-row preview of
# its indicator-column (one column per value) encoding.
print(raw['combined_shot_type'].value_counts())
pd.get_dummies(raw['combined_shot_type'],prefix='combined_shot_type')[0:2]
Jump Shot 23485
Layup 5448
Dunk 1286
Tip Shot 184
Hook Shot 153
Bank Shot 141
Name: combined_shot_type, dtype: int64
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
combined_shot_type_Bank Shot |
combined_shot_type_Dunk |
combined_shot_type_Hook Shot |
combined_shot_type_Jump Shot |
combined_shot_type_Layup |
combined_shot_type_Tip Shot |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
1 |
0 |
0 |
0 |
1 |
0 |
0 |
# OneHotEncoder demo: fit on four samples of three integer features, then
# encode four samples and show the result as a dense array.
from sklearn import preprocessing
enc = preprocessing.OneHotEncoder()
enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
enc.transform([[0, 0, 2], [1, 1, 0], [0, 2, 1], [1, 0, 2]]).toarray()
array([[1., 0., 1., 0., 0., 0., 0., 1., 0.],
[0., 1., 0., 1., 0., 1., 0., 0., 0.],
[1., 0., 0., 0., 1., 0., 1., 0., 0.],
[0., 1., 1., 0., 0., 0., 0., 1., 0.]])
# Build the model inputs from a fresh copy of the CSV.
# NOTE(review): re-reading the file discards the dist / angle /
# remaining_time columns and the integer season conversion made on the
# previous `raw`, while `drops` removes loc_x, loc_y and the clock columns
# they were derived from -- confirm that this is intended.
filename = 'data.csv'
raw = pd.read_csv(filename)

# Columns excluded from the model inputs.
drops = ['shot_id', 'team_id', 'team_name', 'shot_zone_area', 'shot_zone_range', 'shot_zone_basic',
         'matchup', 'lon', 'lat', 'seconds_remaining', 'minutes_remaining',
         'shot_distance', 'loc_x', 'loc_y', 'game_event_id', 'game_id', 'game_date']
# Drop them in one call; `axis` must be a keyword on pandas 2.x.
raw = raw.drop(drops, axis=1)

print(raw['combined_shot_type'].value_counts())
pd.get_dummies(raw['combined_shot_type'], prefix='combined_shot_type')[0:2]

# Replace every categorical column by its indicator columns.
categorical_vars = ['action_type', 'combined_shot_type', 'shot_type', 'opponent', 'period', 'season']
for var in categorical_vars:
    raw = pd.concat([raw, pd.get_dummies(raw[var], prefix=var)], axis=1)
    raw = raw.drop(var, axis=1)

# Rows with a known shot_made_flag form the training set (label split off);
# rows where it is missing form the test set.
train_kobe = raw[pd.notnull(raw['shot_made_flag'])]
train_label = train_kobe['shot_made_flag']
train_kobe = train_kobe.drop('shot_made_flag', axis=1)
test_kobe = raw[pd.isnull(raw['shot_made_flag'])]
test_kobe = test_kobe.drop('shot_made_flag', axis=1)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix,log_loss
import time
import numpy as np
# Five logarithmically spaced values from 10**0 to 10**2, printed as floats
# and then truncated to integers.
log_grid = np.logspace(0, 2, num=5)
print(log_grid)
range_m = log_grid.astype(int)
range_m
Jump Shot 23485
Layup 5448
Dunk 1286
Tip Shot 184
Hook Shot 153
Bank Shot 141
Name: combined_shot_type, dtype: int64
[ 1. 3.16227766 10. 31.6227766 100. ]
array([ 1, 3, 10, 31, 100])
# Grid-search n_estimators for a RandomForestClassifier using 10-fold
# cross-validated log loss (lower is better).
# KFold is taken from sklearn.model_selection (already used elsewhere in
# this notebook) because sklearn.cross_validation is no longer shipped;
# the module-qualified name leaves any existing `KFold` binding untouched.
from sklearn import model_selection

print('find best n_estimators for RandomForestClassfier...')
min_score = 100000
best_n = 0
scores_n = []
range_n = np.linspace(1, 100, num=10).astype(int)
n_splits = 10
for n in range_n:
    print('the number of trees: {0}'.format(n))
    t1 = time.time()
    rfc_score = 0.
    rfc = RandomForestClassifier(n_estimators=n)
    folds = model_selection.KFold(n_splits=n_splits, shuffle=True)
    for train_k, test_k in folds.split(train_kobe):
        rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])
        # log_loss expects class probabilities. Feeding it hard 0/1
        # predictions makes every miss cost the clipping maximum and
        # inflates the score.
        proba = rfc.predict_proba(train_kobe.iloc[test_k])
        # labels= keeps the probability columns aligned with the classes
        # even if a fold happens to contain a single class.
        rfc_score += log_loss(train_label.iloc[test_k], proba, labels=rfc.classes_) / n_splits
    scores_n.append(rfc_score)
    print(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_n = n
    t2 = time.time()
    print('Done processing {0} trees ({1:.3f}sec)'.format(n, t2 - t1))
print(best_n, min_score)
find best n_estimators for RandomForestClassfier...
the number of trees: 1
13.919386979412682
Done processing 1 trees (0.726sec)
the number of trees: 12
13.389844271562911
Done processing 12 trees (6.008sec)
the number of trees: 23
13.180159610972076
Done processing 23 trees (11.043sec)
the number of trees: 34
13.219150771746676
Done processing 34 trees (18.443sec)
the number of trees: 45
13.043065195199537
Done processing 45 trees (22.152sec)
the number of trees: 56
13.057856481381911
Done processing 56 trees (27.636sec)
the number of trees: 67
13.055167436843206
Done processing 67 trees (32.039sec)
the number of trees: 78
13.051153915112355
Done processing 78 trees (38.372sec)
the number of trees: 89
13.048438999530031
Done processing 89 trees (44.605sec)
the number of trees: 100
13.114318256666222
Done processing 100 trees (49.886sec)
45 13.043065195199537
# Grid-search max_depth for a RandomForestClassifier (with the previously
# selected best_n trees) using 10-fold cross-validated log loss.
# KFold is taken from sklearn.model_selection (already used elsewhere in
# this notebook) because sklearn.cross_validation is no longer shipped.
from sklearn import model_selection

print('Finding best max_depth for RandomForestClassifier...')
min_score = 100000
best_m = 0
scores_m = []
range_m = np.logspace(0, 2, num=3).astype(int)
n_splits = 10
for m in range_m:
    print("the max depth : {0}".format(m))
    t1 = time.time()
    rfc_score = 0.
    rfc = RandomForestClassifier(max_depth=m, n_estimators=best_n)
    folds = model_selection.KFold(n_splits=n_splits, shuffle=True)
    for train_k, test_k in folds.split(train_kobe):
        rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])
        # log_loss expects class probabilities, not hard 0/1 predictions.
        proba = rfc.predict_proba(train_kobe.iloc[test_k])
        # labels= keeps the probability columns aligned with the classes
        # even if a fold happens to contain a single class.
        rfc_score += log_loss(train_label.iloc[test_k], proba, labels=rfc.classes_) / n_splits
    scores_m.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_m = m
    t2 = time.time()
    # The loop variable is a depth, not a tree count.
    print('Done processing max depth {0} ({1:.3f}sec)'.format(m, t2 - t1))
print(best_m, min_score)
Finding best max_depth for RandomForestClassifier...
the max depth : 1
Done processing 1 trees (2.389sec)
the max depth : 10
Done processing 10 trees (6.044sec)
the max depth : 100
Done processing 100 trees (21.773sec)
10 11.06186210729279
# Plot the scores collected by the two searches: scores_n against the tree
# counts in range_n (left) and scores_m against the depths in range_m (right).
plt.figure(figsize=(10,5))
plt.subplot(121)
plt.plot(range_n, scores_n)
plt.ylabel('score')
plt.xlabel('number of trees')
plt.subplot(122)
plt.plot(range_m, scores_m)
plt.ylabel('score')
plt.xlabel('max depth')
Text(0.5,0,'max depth')
# Fit the forest with the selected hyper-parameters on the full training
# set and report the fraction of training rows it predicts correctly.
model = RandomForestClassifier(n_estimators=best_n, max_depth=best_m)
model.fit(train_kobe, train_label)
pred = model.predict(train_kobe)
# Count the positions where label and prediction agree.
count = sum(1 for actual, guess in zip(train_label, pred) if actual == guess)
print("acc is {0}".format(count/len(train_label)))
acc is 0.6850215978518893
# Hold-out evaluation: train on 70% of the labelled rows and report the
# accuracy on the remaining 30%.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_kobe, train_label, test_size=0.3, random_state=0)
model = RandomForestClassifier(n_estimators=best_n, max_depth=best_m)
model.fit(X_train, y_train)
pre = model.predict(X_test)
# Compare the hold-out labels with the hold-out predictions `pre`
# (not with `pred`, which belongs to a different set of rows).
count = sum(1 for actual, guess in zip(y_test, pre) if actual == guess)
print("acc is {0}".format(count / len(pre)))
acc is 0.524254215304799