保潔業務數據概況分析

#調包
import pandas as pd
#數據讀取#
#index_col=0 ,去除Unnamed=0數據
store=pd.read_csv('w2_store_rev.csv',index_col=0)
#數據的基本信息
#發現local_tv有50多個空值
#發現event是object,即類別型變量
store.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 985 entries, 845 to 26
Data columns (total 7 columns):
revenue     985 non-null float64
reach       985 non-null int64
local_tv    929 non-null float64
online      985 non-null int64
instore     985 non-null int64
person      985 non-null int64
event       985 non-null object
dtypes: float64(2), int64(4), object(1)
memory usage: 61.6+ KB
#統計各個列哪些是空值
#發現local_tv有56個空值
store.isnull().sum()
revenue      0
reach        0
local_tv    56
online       0
instore      0
person       0
event        0
dtype: int64
#瞭解數據的分佈
#判斷數據是否符合業務場景
store.describe()
revenue reach local_tv online instore person
count 985.000000 985.000000 929.000000 985.000000 985.000000 985.000000
mean 38357.355025 3.395939 31324.061109 1596.527919 3350.962437 11.053807
std 11675.603883 1.011913 3970.934733 496.131586 976.546381 3.041740
min 5000.000000 0.000000 20000.000000 0.000000 0.000000 0.000000
25% 30223.600000 3.000000 28733.830000 1253.000000 2690.000000 9.000000
50% 38159.110000 3.000000 31104.520000 1607.000000 3351.000000 11.000000
75% 45826.520000 4.000000 33972.410000 1921.000000 4011.000000 13.000000
max 79342.070000 7.000000 43676.900000 3280.000000 6489.000000 24.000000
#瞭解event的具體值
store.event.unique()
array(['non_event', 'special', 'cobranding', 'holiday'], dtype=object)
#這些類別對應的revenue(銷售額)是怎樣的
store.groupby(['event'])['revenue'].describe()
count mean std min 25% 50% 75% max
event
cobranding 398.0 38277.664497 11879.097324 7146.99 30472.1525 37864.155 46333.5600 79342.07
holiday 103.0 37791.890583 11942.369136 5000.00 29644.5250 38432.780 46036.1300 73377.15
non_event 192.0 37903.081562 11186.436740 6874.43 29852.3775 37937.175 44611.6375 69429.39
special 292.0 38964.136438 11648.616882 10207.96 30325.8125 39197.870 45897.0400 71757.50
#這幾個類別對應的local_tv(本地電視廣告投入)是怎樣的
store.groupby(['event'])['local_tv'].describe()
count mean std min 25% 50% 75% max
event
cobranding 376.0 31424.590186 3951.049566 21252.35 28746.9725 31336.570 33839.0200 42162.64
holiday 96.0 30860.524896 4448.719364 21792.84 27769.6000 30564.705 33595.5975 41047.01
non_event 182.0 31415.197527 3952.155383 20000.00 29222.5875 31238.235 34386.0825 42069.84
special 275.0 31288.110982 3842.412128 21428.20 28668.7100 30921.790 34105.6250 43676.90
#將類別變量轉化爲啞變量
store=pd.get_dummies(store)
#生成event的4個標籤,每個標籤取值0/1
store.head(10)
revenue reach local_tv online instore person event_cobranding event_holiday event_non_event event_special
845 45860.28 2 31694.91 2115 3296 8 0 0 1 0
483 63588.23 2 35040.17 1826 2501 14 0 0 0 1
513 23272.69 4 30992.82 1851 2524 6 0 0 0 1
599 45911.23 2 29417.78 2437 3049 12 0 0 0 1
120 36644.23 2 35611.11 1122 1142 13 1 0 0 0
867 36172.81 4 22372.59 2001 1881 17 1 0 0 0
847 43797.03 3 31443.74 1667 1846 15 1 0 0 0
950 41629.80 4 35775.75 1155 2715 12 0 0 0 1
942 21303.48 2 24888.31 1853 3677 4 0 0 1 0
550 20746.15 4 26623.48 1497 3075 9 0 1 0 0
#確認類別變量已經轉換成數字變量
store.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 985 entries, 845 to 26
Data columns (total 10 columns):
revenue             985 non-null float64
reach               985 non-null int64
local_tv            929 non-null float64
online              985 non-null int64
instore             985 non-null int64
person              985 non-null int64
event_cobranding    985 non-null uint8
event_holiday       985 non-null uint8
event_non_event     985 non-null uint8
event_special       985 non-null uint8
dtypes: float64(2), int64(4), uint8(4)
memory usage: 57.7 KB
#數據準備完成#
#相關分析#
#所有變量,任意兩個變量相關分析
#local_tv,person,instore是比較好的指標,與revenue相關度高
store.corr()
revenue reach local_tv online instore person event_cobranding event_holiday event_non_event event_special
revenue 1.000000 -0.155314 0.602114 0.171227 0.311739 0.559208 -0.005623 -0.016559 -0.019155 0.033752
reach -0.155314 1.000000 -0.034039 -0.025141 0.035635 0.061417 0.043809 0.020398 -0.043128 -0.023330
local_tv 0.602114 -0.034039 1.000000 0.006775 -0.046825 0.048664 0.020886 -0.039650 0.011335 -0.005874
online 0.171227 -0.025141 0.006775 1.000000 -0.026399 0.036662 -0.024646 -0.018596 -0.020587 0.056799
instore 0.311739 0.035635 -0.046825 -0.026399 1.000000 -0.007482 -0.057725 0.045963 0.015495 0.017788
person 0.559208 0.061417 0.048664 0.036662 -0.007482 1.000000 0.002439 -0.025692 -0.025568 0.036771
event_cobranding -0.005623 0.043809 0.020886 -0.024646 -0.057725 0.002439 1.000000 -0.281389 -0.405169 -0.534499
event_holiday -0.016559 0.020398 -0.039650 -0.018596 0.045963 -0.025692 -0.281389 1.000000 -0.168151 -0.221824
event_non_event -0.019155 -0.043128 0.011335 -0.020587 0.015495 -0.025568 -0.405169 -0.168151 1.000000 -0.319403
event_special 0.033752 -0.023330 -0.005874 0.056799 0.017788 0.036771 -0.534499 -0.221824 -0.319403 1.000000
#其他變量與revenue的相關分析
#sort_values 將revenue排序,ascending默認升序,False爲降序排列
#看到前3個相關變量爲local_tv,person,instore
store.corr()[['revenue']].sort_values('revenue',ascending=False)
revenue
revenue 1.000000
local_tv 0.602114
person 0.559208
instore 0.311739
online 0.171227
event_special 0.033752
event_cobranding -0.005623
event_holiday -0.016559
event_non_event -0.019155
reach -0.155314
#可視化分析
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
#線性關係可視化
#斜率與相關係數有關
sns.regplot('local_tv','revenue',store)
<matplotlib.axes._subplots.AxesSubplot at 0x1f2206a22b0>

[外鏈圖片轉存失敗(img-xyg3ngFU-1567847898797)(output_16_1.png)]

#線性關係可視化
sns.regplot('person','revenue',store)
<matplotlib.axes._subplots.AxesSubplot at 0x1f2207de8d0>

[外鏈圖片轉存失敗(img-CfxQK9iP-1567847898797)(output_17_1.png)]

#線性關係可視化
sns.regplot('instore','revenue',store)
<matplotlib.axes._subplots.AxesSubplot at 0x1f220855978>

[外鏈圖片轉存失敗(img-62uC8x3m-1567847898798)(output_18_1.png)]

#線性迴歸分析 
#調包
from sklearn.linear_model import LinearRegression
#建模
model=LinearRegression()
#設定自變量和因變量
y=store['revenue']
#第一次三個 x=store[['local_tv','person','instore']
#第二次四個 x=store[['local_tv','person','instore','online']]
model.fit(x,y)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
#缺失值處理,填充0
store=store.fillna(0)
#缺失值處理,均值填充
store=store.fillna(store.local_tv.mean())
store.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 985 entries, 845 to 26
Data columns (total 10 columns):
revenue             985 non-null float64
reach               985 non-null int64
local_tv            985 non-null float64
online              985 non-null int64
instore             985 non-null int64
person              985 non-null int64
event_cobranding    985 non-null uint8
event_holiday       985 non-null uint8
event_non_event     985 non-null uint8
event_special       985 non-null uint8
dtypes: float64(2), int64(4), uint8(4)
memory usage: 57.7 KB
#自變量係數
model.coef_
array([4.01736340e-01, 2.10959413e+03, 3.61050584e+00, 3.79371728e+00])
#模型的截距
model.intercept_
-8967.736870300454
#模型的評估,x爲'local_tv','person','instore'
score=model.score(x,y)#x和y打分
predictions=model.predict(x)#計算y預測值
error=predictions-y#計算誤差

rmse=(error**2).mean()**.5#計算rmse
mae=abs(error).mean()#計算mae

print(rmse)
print(mae)
8321.491623472051
6556.036999600779
#模型的評估,x爲'local_tv','person','instore','online'
#發現模型誤差略有調整
score=model.score(x,y)#x和y打分
predictions=model.predict(x)#計算y預測值
error=predictions-y#計算誤差

rmse=(error**2).mean()**.5#計算rmse
mae=abs(error).mean()#計算mae

print(rmse)
print(mae)
8106.512169325369
6402.202883441895
#模型的評估,x爲'local_tv','person','instore','online',local_tv用均值填充
#發現誤差大幅下降,預測效果提升
score=model.score(x,y)#x和y打分
predictions=model.predict(x)#計算y預測值
error=predictions-y#計算誤差

rmse=(error**2).mean()**.5#計算rmse
mae=abs(error).mean()#計算mae

print(rmse)
print(mae)
5591.764749668997
4485.506383110859
#如果各位希望能看到標準的模型輸出表
from statsmodels.formula.api import ols
x=store[['local_tv','person','instore']]   #生成自變量
y=store['revenue']  
model=ols('y~x',store).fit()
#觀察coef-係數,P值顯著性
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.746
Model:                            OLS   Adj. R-squared:                  0.745
Method:                 Least Squares   F-statistic:                     959.2
Date:                Wed, 19 Jun 2019   Prob (F-statistic):          4.09e-291
Time:                        15:24:38   Log-Likelihood:                -9947.5
No. Observations:                 985   AIC:                         1.990e+04
Df Residuals:                     981   BIC:                         1.992e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept  -5.288e+04   1804.489    -29.305      0.000   -5.64e+04   -4.93e+04
x[0]           1.7515      0.049     35.857      0.000       1.656       1.847
x[1]        2050.5749     61.866     33.146      0.000    1929.171    2171.979
x[2]           4.0903      0.193     21.229      0.000       3.712       4.468
==============================================================================
Omnibus:                        0.352   Durbin-Watson:                   2.056
Prob(Omnibus):                  0.839   Jarque-Bera (JB):                0.402
Skew:                           0.043   Prob(JB):                        0.818
Kurtosis:                       2.951   Cond. No.                     3.05e+05
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.05e+05. This might indicate that there are
strong multicollinearity or other numerical problems.

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章