保潔業務數據概況分析

#調包
import pandas as pd

#數據讀取#
#index_col=0 ，去除Unnamed=0數據
store=pd.read_csv('w2_store_rev.csv',index_col=0)

#數據的基本信息
#發現local_tv有50多個空值
#發現event是object,即類別型變量
store.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 985 entries, 845 to 26
Data columns (total 7 columns):
revenue     985 non-null float64
reach       985 non-null int64
local_tv    929 non-null float64
online      985 non-null int64
instore     985 non-null int64
person      985 non-null int64
event       985 non-null object
dtypes: float64(2), int64(4), object(1)
memory usage: 61.6+ KB

#統計各個列哪些是空值
#發現local_tv有56個空值
store.isnull().sum()

revenue      0
reach        0
local_tv    56
online       0
instore      0
person       0
event        0
dtype: int64

#瞭解數據的分佈
#判斷數據是否符合業務場景
store.describe()

	revenue	reach	local_tv	online	instore	person
count	985.000000	985.000000	929.000000	985.000000	985.000000	985.000000
mean	38357.355025	3.395939	31324.061109	1596.527919	3350.962437	11.053807
std	11675.603883	1.011913	3970.934733	496.131586	976.546381	3.041740
min	5000.000000	0.000000	20000.000000	0.000000	0.000000	0.000000
25%	30223.600000	3.000000	28733.830000	1253.000000	2690.000000	9.000000
50%	38159.110000	3.000000	31104.520000	1607.000000	3351.000000	11.000000
75%	45826.520000	4.000000	33972.410000	1921.000000	4011.000000	13.000000
max	79342.070000	7.000000	43676.900000	3280.000000	6489.000000	24.000000

#瞭解event的具體值
store.event.unique()

array(['non_event', 'special', 'cobranding', 'holiday'], dtype=object)

#這些類別對應的revenue(銷售額)是怎樣的
store.groupby(['event'])['revenue'].describe()

	count	mean	std	min	25%	50%	75%	max
event
cobranding	398.0	38277.664497	11879.097324	7146.99	30472.1525	37864.155	46333.5600	79342.07
holiday	103.0	37791.890583	11942.369136	5000.00	29644.5250	38432.780	46036.1300	73377.15
non_event	192.0	37903.081562	11186.436740	6874.43	29852.3775	37937.175	44611.6375	69429.39
special	292.0	38964.136438	11648.616882	10207.96	30325.8125	39197.870	45897.0400	71757.50

#這幾個類別對應的local_tv（本地電視廣告投入）是怎樣的
store.groupby(['event'])['local_tv'].describe()

	count	mean	std	min	25%	50%	75%	max
event
cobranding	376.0	31424.590186	3951.049566	21252.35	28746.9725	31336.570	33839.0200	42162.64
holiday	96.0	30860.524896	4448.719364	21792.84	27769.6000	30564.705	33595.5975	41047.01
non_event	182.0	31415.197527	3952.155383	20000.00	29222.5875	31238.235	34386.0825	42069.84
special	275.0	31288.110982	3842.412128	21428.20	28668.7100	30921.790	34105.6250	43676.90

#將類別變量轉化爲啞變量
store=pd.get_dummies(store)

#生成event的4個標籤，每個標籤取值0/1
store.head(10)

	revenue	reach	local_tv	online	instore	person	event_cobranding	event_holiday	event_non_event	event_special
845	45860.28	2	31694.91	2115	3296	8	0	0	1	0
483	63588.23	2	35040.17	1826	2501	14	0	0	0	1
513	23272.69	4	30992.82	1851	2524	6	0	0	0	1
599	45911.23	2	29417.78	2437	3049	12	0	0	0	1
120	36644.23	2	35611.11	1122	1142	13	1	0	0	0
867	36172.81	4	22372.59	2001	1881	17	1	0	0	0
847	43797.03	3	31443.74	1667	1846	15	1	0	0	0
950	41629.80	4	35775.75	1155	2715	12	0	0	0	1
942	21303.48	2	24888.31	1853	3677	4	0	0	1	0
550	20746.15	4	26623.48	1497	3075	9	0	1	0	0

#確認類別變量已經轉換成數字變量
store.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 985 entries, 845 to 26
Data columns (total 10 columns):
revenue             985 non-null float64
reach               985 non-null int64
local_tv            929 non-null float64
online              985 non-null int64
instore             985 non-null int64
person              985 non-null int64
event_cobranding    985 non-null uint8
event_holiday       985 non-null uint8
event_non_event     985 non-null uint8
event_special       985 non-null uint8
dtypes: float64(2), int64(4), uint8(4)
memory usage: 57.7 KB

#數據準備完成#

#相關分析#

#所有變量，任意兩個變量相關分析
#local_tv,person,instore是比較好的指標，與revenue相關度高
store.corr()

	revenue	reach	local_tv	online	instore	person	event_cobranding	event_holiday	event_non_event	event_special
revenue	1.000000	-0.155314	0.602114	0.171227	0.311739	0.559208	-0.005623	-0.016559	-0.019155	0.033752
reach	-0.155314	1.000000	-0.034039	-0.025141	0.035635	0.061417	0.043809	0.020398	-0.043128	-0.023330
local_tv	0.602114	-0.034039	1.000000	0.006775	-0.046825	0.048664	0.020886	-0.039650	0.011335	-0.005874
online	0.171227	-0.025141	0.006775	1.000000	-0.026399	0.036662	-0.024646	-0.018596	-0.020587	0.056799
instore	0.311739	0.035635	-0.046825	-0.026399	1.000000	-0.007482	-0.057725	0.045963	0.015495	0.017788
person	0.559208	0.061417	0.048664	0.036662	-0.007482	1.000000	0.002439	-0.025692	-0.025568	0.036771
event_cobranding	-0.005623	0.043809	0.020886	-0.024646	-0.057725	0.002439	1.000000	-0.281389	-0.405169	-0.534499
event_holiday	-0.016559	0.020398	-0.039650	-0.018596	0.045963	-0.025692	-0.281389	1.000000	-0.168151	-0.221824
event_non_event	-0.019155	-0.043128	0.011335	-0.020587	0.015495	-0.025568	-0.405169	-0.168151	1.000000	-0.319403
event_special	0.033752	-0.023330	-0.005874	0.056799	0.017788	0.036771	-0.534499	-0.221824	-0.319403	1.000000

#其他變量與revenue的相關分析
#sort_values 將revenue排序，ascending默認升序，False爲降序排列
#看到前3個相關變量爲local_tv,person,instore
store.corr()[['revenue']].sort_values('revenue',ascending=False)

	revenue
revenue	1.000000
local_tv	0.602114
person	0.559208
instore	0.311739
online	0.171227
event_special	0.033752
event_cobranding	-0.005623
event_holiday	-0.016559
event_non_event	-0.019155
reach	-0.155314

#可視化分析
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#線性關係可視化
#斜率與相關係數有關
sns.regplot('local_tv','revenue',store)

<matplotlib.axes._subplots.AxesSubplot at 0x1f2206a22b0>

#線性關係可視化
sns.regplot('person','revenue',store)

<matplotlib.axes._subplots.AxesSubplot at 0x1f2207de8d0>

#線性關係可視化
sns.regplot('instore','revenue',store)

<matplotlib.axes._subplots.AxesSubplot at 0x1f220855978>

#線性迴歸分析 
#調包
from sklearn.linear_model import LinearRegression

#建模
model=LinearRegression()

#設定自變量和因變量
y=store['revenue']
#第一次三個 x=store[['local_tv','person','instore']
#第二次四個 x=store[['local_tv','person','instore','online']]

model.fit(x,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

#缺失值處理,填充0
store=store.fillna(0)

#缺失值處理,均值填充
store=store.fillna(store.local_tv.mean())

store.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 985 entries, 845 to 26
Data columns (total 10 columns):
revenue             985 non-null float64
reach               985 non-null int64
local_tv            985 non-null float64
online              985 non-null int64
instore             985 non-null int64
person              985 non-null int64
event_cobranding    985 non-null uint8
event_holiday       985 non-null uint8
event_non_event     985 non-null uint8
event_special       985 non-null uint8
dtypes: float64(2), int64(4), uint8(4)
memory usage: 57.7 KB

#自變量係數
model.coef_

array([4.01736340e-01, 2.10959413e+03, 3.61050584e+00, 3.79371728e+00])

#模型的截距
model.intercept_

-8967.736870300454

#模型的評估,x爲'local_tv','person','instore'
score=model.score(x,y)#x和y打分
predictions=model.predict(x)#計算y預測值
error=predictions-y#計算誤差

rmse=(error**2).mean()**.5#計算rmse
mae=abs(error).mean()#計算mae

print(rmse)
print(mae)

8321.491623472051
6556.036999600779

#模型的評估,x爲'local_tv','person','instore'，'online'
#發現模型誤差略有調整
score=model.score(x,y)#x和y打分
predictions=model.predict(x)#計算y預測值
error=predictions-y#計算誤差

rmse=(error**2).mean()**.5#計算rmse
mae=abs(error).mean()#計算mae

print(rmse)
print(mae)

8106.512169325369
6402.202883441895

#模型的評估,x爲'local_tv','person','instore'，'online'，local_tv用均值填充
#發現誤差大幅下降，預測效果提升
score=model.score(x,y)#x和y打分
predictions=model.predict(x)#計算y預測值
error=predictions-y#計算誤差

rmse=(error**2).mean()**.5#計算rmse
mae=abs(error).mean()#計算mae

print(rmse)
print(mae)

5591.764749668997
4485.506383110859

#如果各位希望能看到標準的模型輸出表
from statsmodels.formula.api import ols

x=store[['local_tv','person','instore']]   #生成自變量
y=store['revenue']

model=ols('y~x',store).fit()

#觀察coef-係數，P值顯著性
print(model.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.746
Model:                            OLS   Adj. R-squared:                  0.745
Method:                 Least Squares   F-statistic:                     959.2
Date:                Wed, 19 Jun 2019   Prob (F-statistic):          4.09e-291
Time:                        15:24:38   Log-Likelihood:                -9947.5
No. Observations:                 985   AIC:                         1.990e+04
Df Residuals:                     981   BIC:                         1.992e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept  -5.288e+04   1804.489    -29.305      0.000   -5.64e+04   -4.93e+04
x[0]           1.7515      0.049     35.857      0.000       1.656       1.847
x[1]        2050.5749     61.866     33.146      0.000    1929.171    2171.979
x[2]           4.0903      0.193     21.229      0.000       3.712       4.468
==============================================================================
Omnibus:                        0.352   Durbin-Watson:                   2.056
Prob(Omnibus):                  0.839   Jarque-Bera (JB):                0.402
Skew:                           0.043   Prob(JB):                        0.818
Kurtosis:                       2.951   Cond. No.                     3.05e+05
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.05e+05. This might indicate that there are
strong multicollinearity or other numerical problems.

保潔業務數據概況分析

項目學習01--用戶畫像

基於 Python 的 11 種經典數據降維算法|LLE(locally linear embedding)降維算法

基於 Python 的 11 種經典數據降維算法|主成分分析（PCA）降維

保潔業務數據概況分析

基於 Python 的 11 種經典數據降維算法|LPP(Locality Preserving Projections)

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結