# Import the packages we may need: because this is a first-pass exploratory analysis, the plotting libraries plus pandas, numpy and scipy are required.
#https://www.kaggle.com/wkevin/house-prices/notebook
import plotly.offline as py
from plotly.graph_objs import Scatter, Layout
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from pyecharts import Bar,Radar,enable_nteract,Scatter
# Variable type check: split the column names of `data` by dtype.
_is_object_column = (data.dtypes == "object").values
# Numeric features: every column whose dtype is not "object".
feats_numeric = data.columns[~_is_object_column].values
# String features: every column whose dtype is "object".
feats_object = data.columns[_is_object_column].values
print(feats_numeric.shape, feats_object.shape)
def plotfeats(df, feats, kind, cols=4, y='AveragePrice'):
    '''
    Draw one subplot (two for kind='hs') per feature on a grid and show it.

    Parameters
    ----------
    df : DataFrame holding the columns to plot.
    feats : sequence of column names of ``df``; one grid cell per entry.
    kind : which plot to draw in each cell:
        'hist'    - 100-bin histogram of the feature,
        'scatter' - feature on x against ``y``,
        'hs'      - histogram with the scatter plot directly below it,
        'box'     - box plot of the feature,
        'boxp'    - seaborn box plot of ``y`` grouped by the feature.
        Any other value leaves the axes empty.
    cols : maximum number of grid columns.
    y : column used on the y axis for 'scatter', 'hs' and 'boxp'.

    Returns None; the figure is displayed with ``plt.show()``.
    '''
    n_feats = len(feats)
    if n_feats == 0:
        # Nothing to draw; a grid with zero rows cannot be created.
        return

    # Never use more columns than there are features.
    cols = min(cols, n_feats)
    rows = int(np.ceil(n_feats / cols))

    # 'hs' stacks two plots per feature, so the grid is twice as tall.
    plots_per_feat = 2 if kind == 'hs' else 1

    # squeeze=False always yields a 2-D axes array, so axes[row, col] is
    # valid for every grid shape (including a single row or single column).
    fig, axes = plt.subplots(nrows=rows * plots_per_feat, ncols=cols,
                             figsize=(cols * 5, rows * 5 * plots_per_feat),
                             squeeze=False)

    for i, f in enumerate(feats):
        r, c = divmod(i, cols)  # grid position of the i-th feature
        if kind == 'hist':
            df.plot.hist(y=f, bins=100, ax=axes[r, c])
        elif kind == 'scatter':
            df.plot.scatter(x=f, y=y, ax=axes[r, c])
        elif kind == 'hs':
            df.plot.hist(y=f, bins=100, ax=axes[r * 2, c])  # even row
            df.plot.scatter(x=f, y=y, ax=axes[r * 2 + 1, c])  # odd row
        elif kind == 'box':
            df.plot.box(y=f, ax=axes[r, c])
        elif kind == 'boxp':
            sns.boxplot(x=f, y=y, data=df, ax=axes[r, c])
    plt.show()
# plotly: interactive histogram of the AveragePrice column.
# (Add histnorm='probability' to the trace to plot proportions instead of counts.)
trace1 = go.Histogram(x=data['AveragePrice'], opacity=0.75)
dat = [trace1]
layout = go.Layout(barmode='overlay')
fig = go.Figure(data=dat, layout=layout)
py.iplot(fig)
# `stats` was used below without ever being imported, which raises NameError.
from scipy import stats

# Skewness of each numeric column (NaNs dropped per column), largest first.
skewed = data[feats_numeric].apply(lambda x: stats.skew(x.dropna())).sort_values(ascending=False)
# Kurtosis of each numeric column, largest first.
kurted = data[feats_numeric].kurt().sort_values(ascending=False)
# Show the ten columns with the highest kurtosis.
kurted[:10]
def spearman(frame, features, y='AveragePrice'):
    '''
    Plot the Spearman rank correlation of each feature with the target column.

    Parameters
    ----------
    frame : DataFrame containing the feature columns and the target column.
    features : sequence of column names of ``frame`` to rank.
    y : name of the target column; defaults to 'AveragePrice'.

    Returns None; a horizontal bar chart sorted by correlation is displayed.
    '''
    if len(features) == 0:
        # Nothing to rank, so there is nothing to plot.
        return

    target = frame[y]
    spr = pd.DataFrame({
        'feature': features,
        # Series.corr() is used here, one feature at a time.
        'corr': [frame[f].corr(target, 'spearman') for f in features],
    }).sort_values('corr')

    # Half an inch of height per feature keeps the labels readable.
    plt.figure(figsize=(6, 0.5 * len(features)))
    sns.barplot(data=spr, y='feature', x='corr', orient='h')
    plt.show()
# Correlation matrices over the numeric (and boolean) columns only.
# Recent pandas versions no longer drop non-numeric columns silently in
# DataFrame.corr(), so they are filtered out explicitly here.
_corr_input = data.select_dtypes(include=['number', 'bool'])
corr_pearson = _corr_input.corr(method='pearson')
corr_spearman = _corr_input.corr(method='spearman')
# Missing-value analysis
def missing_value(alldata, train_data=None):
    '''
    Summarise the missing values of every column that has at least one.

    Parameters
    ----------
    alldata : DataFrame to analyse.
    train_data : DataFrame used for the 'train_notna' column. When omitted,
        the module-level ``train`` frame is used.

    Returns
    -------
    DataFrame indexed by column name (index named 'index') with the columns
    'missingNum', 'missingRatio' (percent), 'existNum', 'train_notna' and
    'test_notna', restricted to columns with missing values and sorted by
    'missingNum' descending, then column name ascending.
    '''
    if train_data is None:
        train_data = train  # fall back to the module-level training frame

    n_rows = len(alldata)
    # to_frame() names the column directly; the previous set literal
    # ({'missingNum'}) is not a valid, ordered collection of column labels.
    alldata_na = alldata.isnull().sum().to_frame('missingNum')
    alldata_na['missingRatio'] = alldata_na['missingNum'] / n_rows * 100  # percentage missing
    alldata_na['existNum'] = n_rows - alldata_na['missingNum']  # non-missing count per column
    # Non-missing count per column in the training frame (aligned on column name).
    alldata_na['train_notna'] = len(train_data) - train_data.isnull().sum()
    # Whatever is present overall but not in the training frame.
    alldata_na['test_notna'] = alldata_na['existNum'] - alldata_na['train_notna']

    # Keep only columns with missing values; expose the column name as a
    # regular 'index' column so it can serve as the secondary sort key.
    with_missing = alldata_na[alldata_na['missingNum'] > 0].reset_index()
    with_missing = with_missing.sort_values(by=['missingNum', 'index'],
                                            ascending=[False, True])
    # Restore the column name as the index.
    return with_missing.set_index('index')