import pandas as pd
store=pd.read_csv('w2_store_rev.csv',index_col=0)
store.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 985 entries, 845 to 26
Data columns (total 7 columns):
revenue 985 non-null float64
reach 985 non-null int64
local_tv 929 non-null float64
online 985 non-null int64
instore 985 non-null int64
person 985 non-null int64
event 985 non-null object
dtypes: float64(2), int64(4), object(1)
memory usage: 61.6+ KB
store.isnull().sum()
revenue 0
reach 0
local_tv 56
online 0
instore 0
person 0
event 0
dtype: int64
store.describe()
|
revenue |
reach |
local_tv |
online |
instore |
person |
count |
985.000000 |
985.000000 |
929.000000 |
985.000000 |
985.000000 |
985.000000 |
mean |
38357.355025 |
3.395939 |
31324.061109 |
1596.527919 |
3350.962437 |
11.053807 |
std |
11675.603883 |
1.011913 |
3970.934733 |
496.131586 |
976.546381 |
3.041740 |
min |
5000.000000 |
0.000000 |
20000.000000 |
0.000000 |
0.000000 |
0.000000 |
25% |
30223.600000 |
3.000000 |
28733.830000 |
1253.000000 |
2690.000000 |
9.000000 |
50% |
38159.110000 |
3.000000 |
31104.520000 |
1607.000000 |
3351.000000 |
11.000000 |
75% |
45826.520000 |
4.000000 |
33972.410000 |
1921.000000 |
4011.000000 |
13.000000 |
max |
79342.070000 |
7.000000 |
43676.900000 |
3280.000000 |
6489.000000 |
24.000000 |
store.event.unique()
array(['non_event', 'special', 'cobranding', 'holiday'], dtype=object)
store.groupby(['event'])['revenue'].describe()
|
count |
mean |
std |
min |
25% |
50% |
75% |
max |
event |
|
|
|
|
|
|
|
|
cobranding |
398.0 |
38277.664497 |
11879.097324 |
7146.99 |
30472.1525 |
37864.155 |
46333.5600 |
79342.07 |
holiday |
103.0 |
37791.890583 |
11942.369136 |
5000.00 |
29644.5250 |
38432.780 |
46036.1300 |
73377.15 |
non_event |
192.0 |
37903.081562 |
11186.436740 |
6874.43 |
29852.3775 |
37937.175 |
44611.6375 |
69429.39 |
special |
292.0 |
38964.136438 |
11648.616882 |
10207.96 |
30325.8125 |
39197.870 |
45897.0400 |
71757.50 |
store.groupby(['event'])['local_tv'].describe()
|
count |
mean |
std |
min |
25% |
50% |
75% |
max |
event |
|
|
|
|
|
|
|
|
cobranding |
376.0 |
31424.590186 |
3951.049566 |
21252.35 |
28746.9725 |
31336.570 |
33839.0200 |
42162.64 |
holiday |
96.0 |
30860.524896 |
4448.719364 |
21792.84 |
27769.6000 |
30564.705 |
33595.5975 |
41047.01 |
non_event |
182.0 |
31415.197527 |
3952.155383 |
20000.00 |
29222.5875 |
31238.235 |
34386.0825 |
42069.84 |
special |
275.0 |
31288.110982 |
3842.412128 |
21428.20 |
28668.7100 |
30921.790 |
34105.6250 |
43676.90 |
store=pd.get_dummies(store)
store.head(10)
|
revenue |
reach |
local_tv |
online |
instore |
person |
event_cobranding |
event_holiday |
event_non_event |
event_special |
845 |
45860.28 |
2 |
31694.91 |
2115 |
3296 |
8 |
0 |
0 |
1 |
0 |
483 |
63588.23 |
2 |
35040.17 |
1826 |
2501 |
14 |
0 |
0 |
0 |
1 |
513 |
23272.69 |
4 |
30992.82 |
1851 |
2524 |
6 |
0 |
0 |
0 |
1 |
599 |
45911.23 |
2 |
29417.78 |
2437 |
3049 |
12 |
0 |
0 |
0 |
1 |
120 |
36644.23 |
2 |
35611.11 |
1122 |
1142 |
13 |
1 |
0 |
0 |
0 |
867 |
36172.81 |
4 |
22372.59 |
2001 |
1881 |
17 |
1 |
0 |
0 |
0 |
847 |
43797.03 |
3 |
31443.74 |
1667 |
1846 |
15 |
1 |
0 |
0 |
0 |
950 |
41629.80 |
4 |
35775.75 |
1155 |
2715 |
12 |
0 |
0 |
0 |
1 |
942 |
21303.48 |
2 |
24888.31 |
1853 |
3677 |
4 |
0 |
0 |
1 |
0 |
550 |
20746.15 |
4 |
26623.48 |
1497 |
3075 |
9 |
0 |
1 |
0 |
0 |
store.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 985 entries, 845 to 26
Data columns (total 10 columns):
revenue 985 non-null float64
reach 985 non-null int64
local_tv 929 non-null float64
online 985 non-null int64
instore 985 non-null int64
person 985 non-null int64
event_cobranding 985 non-null uint8
event_holiday 985 non-null uint8
event_non_event 985 non-null uint8
event_special 985 non-null uint8
dtypes: float64(2), int64(4), uint8(4)
memory usage: 57.7 KB
store.corr()
|
revenue |
reach |
local_tv |
online |
instore |
person |
event_cobranding |
event_holiday |
event_non_event |
event_special |
revenue |
1.000000 |
-0.155314 |
0.602114 |
0.171227 |
0.311739 |
0.559208 |
-0.005623 |
-0.016559 |
-0.019155 |
0.033752 |
reach |
-0.155314 |
1.000000 |
-0.034039 |
-0.025141 |
0.035635 |
0.061417 |
0.043809 |
0.020398 |
-0.043128 |
-0.023330 |
local_tv |
0.602114 |
-0.034039 |
1.000000 |
0.006775 |
-0.046825 |
0.048664 |
0.020886 |
-0.039650 |
0.011335 |
-0.005874 |
online |
0.171227 |
-0.025141 |
0.006775 |
1.000000 |
-0.026399 |
0.036662 |
-0.024646 |
-0.018596 |
-0.020587 |
0.056799 |
instore |
0.311739 |
0.035635 |
-0.046825 |
-0.026399 |
1.000000 |
-0.007482 |
-0.057725 |
0.045963 |
0.015495 |
0.017788 |
person |
0.559208 |
0.061417 |
0.048664 |
0.036662 |
-0.007482 |
1.000000 |
0.002439 |
-0.025692 |
-0.025568 |
0.036771 |
event_cobranding |
-0.005623 |
0.043809 |
0.020886 |
-0.024646 |
-0.057725 |
0.002439 |
1.000000 |
-0.281389 |
-0.405169 |
-0.534499 |
event_holiday |
-0.016559 |
0.020398 |
-0.039650 |
-0.018596 |
0.045963 |
-0.025692 |
-0.281389 |
1.000000 |
-0.168151 |
-0.221824 |
event_non_event |
-0.019155 |
-0.043128 |
0.011335 |
-0.020587 |
0.015495 |
-0.025568 |
-0.405169 |
-0.168151 |
1.000000 |
-0.319403 |
event_special |
0.033752 |
-0.023330 |
-0.005874 |
0.056799 |
0.017788 |
0.036771 |
-0.534499 |
-0.221824 |
-0.319403 |
1.000000 |
store.corr()[['revenue']].sort_values('revenue',ascending=False)
|
revenue |
revenue |
1.000000 |
local_tv |
0.602114 |
person |
0.559208 |
instore |
0.311739 |
online |
0.171227 |
event_special |
0.033752 |
event_cobranding |
-0.005623 |
event_holiday |
-0.016559 |
event_non_event |
-0.019155 |
reach |
-0.155314 |
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.regplot('local_tv','revenue',store)
<matplotlib.axes._subplots.AxesSubplot at 0x1f2206a22b0>
sns.regplot('person','revenue',store)
<matplotlib.axes._subplots.AxesSubplot at 0x1f2207de8d0>
sns.regplot('instore','revenue',store)
<matplotlib.axes._subplots.AxesSubplot at 0x1f220855978>
from sklearn.linear_model import LinearRegression
model=LinearRegression()
y=store['revenue']
model.fit(x,y)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
store=store.fillna(0)
store=store.fillna(store.local_tv.mean())
store.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 985 entries, 845 to 26
Data columns (total 10 columns):
revenue 985 non-null float64
reach 985 non-null int64
local_tv 985 non-null float64
online 985 non-null int64
instore 985 non-null int64
person 985 non-null int64
event_cobranding 985 non-null uint8
event_holiday 985 non-null uint8
event_non_event 985 non-null uint8
event_special 985 non-null uint8
dtypes: float64(2), int64(4), uint8(4)
memory usage: 57.7 KB
model.coef_
array([4.01736340e-01, 2.10959413e+03, 3.61050584e+00, 3.79371728e+00])
model.intercept_
-8967.736870300454
score=model.score(x,y)
predictions=model.predict(x)
error=predictions-y
rmse=(error**2).mean()**.5
mae=abs(error).mean()
print(rmse)
print(mae)
8321.491623472051
6556.036999600779
score=model.score(x,y)
predictions=model.predict(x)
error=predictions-y
rmse=(error**2).mean()**.5
mae=abs(error).mean()
print(rmse)
print(mae)
8106.512169325369
6402.202883441895
score=model.score(x,y)
predictions=model.predict(x)
error=predictions-y
rmse=(error**2).mean()**.5
mae=abs(error).mean()
print(rmse)
print(mae)
5591.764749668997
4485.506383110859
from statsmodels.formula.api import ols
x=store[['local_tv','person','instore']]
y=store['revenue']
model=ols('y~x',store).fit()
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.746
Model: OLS Adj. R-squared: 0.745
Method: Least Squares F-statistic: 959.2
Date: Wed, 19 Jun 2019 Prob (F-statistic): 4.09e-291
Time: 15:24:38 Log-Likelihood: -9947.5
No. Observations: 985 AIC: 1.990e+04
Df Residuals: 981 BIC: 1.992e+04
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -5.288e+04 1804.489 -29.305 0.000 -5.64e+04 -4.93e+04
x[0] 1.7515 0.049 35.857 0.000 1.656 1.847
x[1] 2050.5749 61.866 33.146 0.000 1929.171 2171.979
x[2] 4.0903 0.193 21.229 0.000 3.712 4.468
==============================================================================
Omnibus: 0.352 Durbin-Watson: 2.056
Prob(Omnibus): 0.839 Jarque-Bera (JB): 0.402
Skew: 0.043 Prob(JB): 0.818
Kurtosis: 2.951 Cond. No. 3.05e+05
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.05e+05. This might indicate that there are
strong multicollinearity or other numerical problems.