python EDA

# Import the packages we may need: because we start with a preliminary analysis, the plotting libraries as well as pandas, numpy and scipy are required.
#https://www.kaggle.com/wkevin/house-prices/notebook
import plotly.offline as py
from plotly.graph_objs import Scatter, Layout
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from pyecharts import Bar,Radar,enable_nteract,Scatter


# Variable type check: split the columns of `data` by dtype.
_column_dtypes = data.dtypes
_is_object = _column_dtypes == "object"
# Numeric features: every column whose dtype is not "object".
feats_numeric = _column_dtypes[~_is_object].index.values
# String features: every column whose dtype is "object".
feats_object = _column_dtypes[_is_object].index.values
print(feats_numeric.shape, feats_object.shape)

def plotfeats(df, feats, kind, cols=4, y='AveragePrice'):
    """Draw one subplot per feature of ``df`` on a grid and show the figure.

    Args:
        df: DataFrame holding the feature columns and the ``y`` column.
        feats: Column names to plot, one grid cell per feature.
        kind: Plot type, one of:
            'hist'    - 100-bin histogram of the feature;
            'scatter' - feature (x axis) against ``y``;
            'hs'      - histogram with the scatter plot directly below it,
                        so every row of features takes two grid rows;
            'box'     - box plot of the feature;
            'boxp'    - seaborn box plot of ``y`` grouped by the feature.
        cols: Maximum number of grid columns; fewer are used when there are
            fewer features than ``cols``.
        y: Column used as the y axis for 'scatter', 'hs' and 'boxp'.

    Returns:
        None. The figure is displayed with ``plt.show()``.

    Raises:
        ValueError: If ``kind`` is not one of the supported plot types.
    """
    supported = ('hist', 'scatter', 'hs', 'box', 'boxp')
    if kind not in supported:
        raise ValueError(
            "unknown kind {!r}; expected one of {}".format(kind, supported)
        )

    # `feats` is often a numpy array (e.g. `index.values`); a list gives a
    # well-defined emptiness test and length.
    feats = list(feats)
    if not feats:
        # Nothing to draw; plt.subplots() would reject a grid with 0 rows.
        return None

    # Never lay out more columns than there are features.
    cols = min(cols, len(feats))
    rows = int(np.ceil(len(feats) / cols))

    # 'hs' stacks two plots per feature, doubling the number of grid rows.
    rows_per_feat = 2 if kind == 'hs' else 1
    grid_rows = rows * rows_per_feat

    # squeeze=False always yields a 2-D axes array, so axes[r, c] is valid
    # even for a single row or a single column (including the 'hs' layout).
    _, axes = plt.subplots(
        nrows=grid_rows,
        ncols=cols,
        figsize=(cols * 5, grid_rows * 5),
        squeeze=False,
    )

    for i, f in enumerate(feats):
        r, c = divmod(i, cols)  # grid position of the i-th feature
        if kind == 'hist':
            df.plot.hist(y=f, bins=100, ax=axes[r, c])
        elif kind == 'scatter':
            df.plot.scatter(x=f, y=y, ax=axes[r, c])
        elif kind == 'hs':
            df.plot.hist(y=f, bins=100, ax=axes[2 * r, c])      # even row
            df.plot.scatter(x=f, y=y, ax=axes[2 * r + 1, c])    # odd row
        elif kind == 'box':
            df.plot.box(y=f, ax=axes[r, c])
        else:  # 'boxp'
            sns.boxplot(x=f, y=y, data=df, ax=axes[r, c])
    plt.show()
    
        
        
### plotly
# Interactive histogram of the 'AveragePrice' column.
layout = go.Layout(barmode='overlay')
# Passing histnorm='probability' to the trace would normalise the bars;
# it is left disabled here, as in the original.
trace1 = go.Histogram(x=data['AveragePrice'], opacity=0.75)
dat = [trace1]
fig = go.Figure(data=dat, layout=layout)
py.iplot(fig)

# `stats` is used below but is not imported at the top of the file.
from scipy import stats

# Skewness of each numeric column (NaNs dropped per column), largest first.
# scipy.stats.skew defaults to the biased estimator, whereas
# DataFrame.skew() would return the bias-corrected one.
skewed = data[feats_numeric].apply(lambda col: stats.skew(col.dropna())).sort_values(ascending=False)

# Kurtosis of each numeric column, largest first.
kurted = data[feats_numeric].kurt().sort_values(ascending=False)
# Notebook cell output: the ten columns with the largest kurtosis.
kurted[:10]

def spearman(frame, features):
    """Plot the Spearman rank correlation of each feature with 'AveragePrice'.

    Args:
        frame: DataFrame containing the feature columns and 'AveragePrice'.
        features: Column names whose correlation with the price is computed.

    The correlations are sorted in ascending order and drawn as a
    horizontal bar chart; nothing is returned.
    """
    target = frame['AveragePrice']
    # Series.corr() is used here, one feature at a time.
    ranking = pd.DataFrame({
        'feature': features,
        'corr': [frame[name].corr(target, 'spearman') for name in features],
    }).sort_values('corr')

    # Half an inch of figure height per feature keeps the bars readable.
    plt.figure(figsize=(6, 0.5 * len(features)))
    sns.barplot(data=ranking, y='feature', x='corr', orient='h')
    plt.show()

# Pairwise correlation matrices of the numeric columns.
# `data` is split into numeric and object columns earlier in the file, and
# since pandas 2.0 DataFrame.corr() no longer drops non-numeric columns
# silently but raises, so the numeric columns are selected explicitly first.
_numeric_data = data.select_dtypes(include="number")
corr_pearson = _numeric_data.corr(method='pearson')
corr_spearman = _numeric_data.corr(method='spearman')

#缺失值分析

def missing_value(alldata, train=None):
    """Summarise the missing values of every column that has any.

    Args:
        alldata: DataFrame to analyse (presumably the train and test rows
            stacked together - confirm against the caller).
        train: Training subset used for the ``train_notna`` column. When
            omitted, the module-level ``train`` variable is used, which is
            what this function originally relied on.

    Returns:
        DataFrame indexed by column name (index named 'index') restricted to
        columns with at least one missing value, sorted by ``missingNum``
        descending and then by column name ascending. Columns:
            missingNum   - number of missing values in ``alldata``;
            missingRatio - missing values as a percentage of all rows;
            existNum     - number of non-missing values in ``alldata``;
            train_notna  - number of non-missing values in ``train``;
            test_notna   - ``existNum`` minus ``train_notna``.

    Raises:
        NameError: If ``train`` is not passed and no module-level ``train``
            exists.
    """
    if train is None:
        # Backward compatibility: earlier callers relied on a global `train`.
        train = globals().get('train')
        if train is None:
            raise NameError(
                "name 'train' is not defined; pass it as missing_value(alldata, train=...)"
            )

    total = len(alldata)
    # to_frame() replaces DataFrame(..., columns={'missingNum'}): a set is not
    # accepted for `columns` by recent pandas versions.
    alldata_na = alldata.isnull().sum().to_frame('missingNum')
    alldata_na['missingRatio'] = alldata_na['missingNum'] / total * 100
    alldata_na['existNum'] = total - alldata_na['missingNum']
    alldata_na['train_notna'] = len(train) - train.isnull().sum()
    alldata_na['test_notna'] = alldata_na['existNum'] - alldata_na['train_notna']

    # Keep only columns with missing values; the column names are moved into
    # an 'index' column so they can serve as the secondary sort key, then
    # restored as the index.
    alldata_na = (
        alldata_na[alldata_na['missingNum'] > 0]
        .rename_axis('index')
        .reset_index()
        .sort_values(by=['missingNum', 'index'], ascending=[False, True])
        .set_index('index')
    )
    return alldata_na

 

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章