EDA - Exploratory Data Analysis
1. Load the data
2. Categorical feature statistics
3. Numerical feature visualization
4. Feature expansion
5. Outlier detection for numerical features
6. Normality test
7. Log-transform plot comparison
8. Log transformation to normality
9. Outlier detection for numerical features (after the normal transformation)
10. Feature selection (based on data distribution)
11. Feature selection (ridge regression)
12. Feature selection (stepwise regression)
13. Feature selection (xgboost)
EDA - Exploratory Data Analysis
Implementation approach, following the steps in the table of contents:
Step 1: load the data for the Tianchi used-car price prediction competition.
Step 2: inspect the data types; the features split roughly into numerical and categorical. To get a first feel for the distributions, compute grouped statistics for every feature: for the continuous numerical features this gives summary statistics such as min/max, variance and kurtosis, and for the categorical features it gives the set of categories and their frequencies.
Step 3: visualize the numerical features and inspect their distributions.
Step 4: feature expansion. This mainly concerns the registration date: extract the year and the month as separate features, and fill invalid month values with the mode.
Step 5: detect outliers in the numerical features with the box-plot (IQR) method; here it is mainly applied to the price feature.
Step 6: run normality tests on the continuous numerical features.
Step 7: apply a log transformation and compare the plots before and after; some features become roughly normal after the transformation, while for others the effect is poor.
Step 8: apply the log transformation to the features where it helps, then run outlier detection again to check the effect.
Step 9: feature selection. The simplest approach is direct inspection, dropping features whose samples are obviously unbalanced; in addition, ridge regression, stepwise regression, and xgboost are used to select features.
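The snippets below are methods of a single EDA class whose definition is not shown in the original post. A plausible set of module-level imports they rely on, inferred from the method bodies rather than taken from the original:

# Assumed imports (inferred from the method bodies below, not shown in the original post)
from collections import Counter
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import xgboost as xgb
from xgboost import plot_importance
from sklearn.metrics import mean_absolute_error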
1. Load the data
def load_data(self, train_data_path):
    # Initial statistics on the training data
    train_data_df = pd.read_csv(train_data_path, sep=' ')
    pd.set_option('display.max_columns', None)
    # test_data_path = r"C:\Users\ccs\Documents\dataWhale\used_car_testA_20200313\used_car_testA_20200313.csv"
    # test_data_df = pd.read_csv(test_data_path, sep=' ')
    # train_data_df = train_data_df.append(test_data_df)
    # print("Row count of the raw data:\n", train_data_df.count())
    # print("Row count after deduplication:\n", train_data_df.drop_duplicates().count())
    print(train_data_df.describe(include='all'))
    # Count missing values per column
    print(train_data_df.isnull().sum())
    print(train_data_df.columns)
    return train_data_df
2. Categorical feature statistics
def categorial_statistus(self, train_data_df, category_columns):
    """
    Frequency statistics for the categorical (enum-like) features
    """
    print(train_data_df.columns)
    for i in category_columns:
        # Count the frequency of each category and sort in descending order
        total = pd.DataFrame({'count': train_data_df.groupby(i).size()})
        total = total.sort_values(['count'], ascending=False)
        print(total, '\n', total.count())
    return train_data_df
3. Numerical feature visualization
# Visualize the numerical features
def plot_nemurical(self, train_data_df, numerical_columns):
    # 3) Distribution plot for every continuous numerical feature
    #    (character/categorical columns are excluded from numerical_columns)
    # numerical_columns = ['regDate', 'power', 'kilometer', 'creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3',
    #                      'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14']
    f = pd.melt(train_data_df, value_vars=numerical_columns)
    g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False)
    g = g.map(sns.distplot, "value")
    # 4) Pairwise relationships between selected numerical features
    sns.set()
    columns = ['price', 'v_12', 'v_8', 'v_0', 'power', 'v_5', 'v_2', 'v_6', 'v_1', 'v_14']
    # note: the 'size' argument was renamed to 'height' in newer seaborn versions
    sns.pairplot(train_data_df[columns], height=2, kind='scatter', diag_kind='kde')
    plt.show()
4. Feature expansion
def categorial_extend(self, train_data_df):
    """
    Expand the registration date into year and month features
    """
    def fun(x):
        # An invalid month of '00' is replaced with '03', the mode of the month values
        if str(x)[4:6] == '00':
            rst = str(x)[0:4] + '03' + str(x)[6:]
            return rst
        else:
            return str(x)
    train_data_df['regDate'] = train_data_df['regDate'].apply(lambda x: fun(x))
    train_data_df["year_regDate"] = train_data_df['regDate'].astype("str").str[0:4]
    train_data_df["month_regDate"] = train_data_df['regDate'].astype("str").str[4:6]
    return train_data_df
5. Outlier detection for numerical features
# Outlier detection
def detect_outliers(self, df, n, features):
    """
    Return the indices of rows that contain more than n outliers,
    using the IQR (box-plot) rule on the given feature columns.
    """
    outlier_indices = []
    # iterate over features (columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[col], 25)
        # 3rd quartile (75%)
        Q3 = np.percentile(df[col], 75)
        # interquartile range (IQR)
        IQR = Q3 - Q1
        # outlier step
        outlier_step = 1.5 * IQR
        # indices of the outliers for feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)].index
        # collect the outlier indices found for this column
        outlier_indices.extend(outlier_list_col)
    # select observations containing more than n outliers
    outlier_indices = Counter(outlier_indices)
    print("outlier_indices is ", outlier_indices)
    print("outlier_indices length is ", len(outlier_indices))
    multiple_outliers = list(k for k, v in outlier_indices.items() if v > n)
    return multiple_outliers
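Per the overview, this step is mainly applied to the price feature. A hedged usage sketch (the eda instance is an assumption; the methods above are assumed to live on it):

# Hypothetical usage: flag rows whose 'price' falls outside the IQR fences
outlier_rows = eda.detect_outliers(train_data_df, 0, ['price'])
print("number of outlier rows:", len(outlier_rows))
# optionally drop them before modelling
train_data_df = train_data_df.drop(outlier_rows).reset_index(drop=True)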
6. Normality test
def normal_test(self, train_data_df):
    # Normality test for the continuous numerical features
    # A small p-value means the data is unlikely to come from a normal distribution
    # In practice none of these features pass, so the important ones (power, kilometer) need transforming
    numerical_columns = ['regDate', 'power', 'kilometer', 'creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3',
                         'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12',
                         'v_13', 'v_14']
    train_data_df['regDate'] = train_data_df['regDate'].astype('int')
    # Note: do not combine assignment with inplace=True (that would assign None);
    # assign the returned Series instead
    train_data_df['notRepairedDamage'] = train_data_df['notRepairedDamage'].replace('-', np.nan)
    train_data_df = train_data_df.fillna(0)
    train_data_df.info()
    print('p-values of the normality test (a small p-value means the feature is unlikely to be normal):',
          list(map(lambda x: scipy.stats.normaltest(train_data_df[x])[1], numerical_columns)))
7. Log-transform plot comparison
# Plot each feature before and after the log transform
def log_plot(self, train_data_df):
    # 3) Histograms of the target and of kilometer
    plt.subplot(2, 2, 1)
    plt.hist(train_data_df['price'], orientation='vertical', histtype='bar', color='red', label='price')
    plt.subplot(2, 2, 2)
    plt.hist(train_data_df['kilometer'], orientation='vertical', histtype='bar', color='green', label='kilometer')
    # plt.subplot(2, 3, 3)
    # plt.hist(train_data_df['power'], orientation='vertical', histtype='bar', color='yellow', label='power')
    # After the log transform the price distribution is much more even, so predicting log(price)
    # is a common trick for this kind of regression problem
    plt.subplot(2, 2, 3)
    plt.hist(np.log(train_data_df['price']), orientation='vertical', histtype='bar', color='red', label='price')
    plt.subplot(2, 2, 4)
    # For a discrete feature like kilometer the log transform does little to normalize the distribution
    # plt.hist(np.log(train_data_df['kilometer']), orientation='vertical', histtype='bar', color='red', label='kilometer')
    # power cannot be log-transformed directly: plt.hist fails with
    # ValueError: supplied range of [-inf, 9.868481943337313] is not finite
    # plt.subplot(2, 3, 6)
    # plt.hist(np.log(train_data_df['power']), orientation='vertical', histtype='bar', color='red', label='power')
    plt.show()
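The ValueError on power comes from zero values, for which np.log returns -inf. A hedged workaround not used in the original code is np.log1p, which maps 0 to 0 and keeps the histogram range finite:

# Sketch (assumption): use log1p instead of log so zero-valued 'power' entries stay finite
plt.hist(np.log1p(train_data_df['power']), orientation='vertical', histtype='bar', color='red', label='power')
plt.show()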
8. Log transformation to normality
def change_to_nomal(self, train_data_df):
    """
    Transform the target towards a normal distribution
    """
    train_data_df['price'] = train_data_df['price'].apply(lambda x: np.log(x))
    # train_data_df['log_kilometer'] = train_data_df['kilometer'].apply(lambda x: np.log(x))
    # train_data_df['log_power'] = train_data_df['power'].apply(lambda x: np.log(x))
    # train_data_df['log_model'] = train_data_df['model'].apply(lambda x: np.log(x))
    return train_data_df
10. Feature selection (based on data distribution)
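The original post gives no code for this step; per the overview it is done by direct inspection, dropping features whose samples are obviously unbalanced. A minimal sketch of what that inspection might look like (the 0.99 threshold is an illustrative assumption):

# Sketch (assumption): flag categorical features whose dominant value covers almost all rows,
# since such heavily unbalanced features carry little information for the model
for col in category_columns:
    top_ratio = train_data_df[col].value_counts(normalize=True).iloc[0]
    if top_ratio > 0.99:
        print(f"{col}: most frequent value covers {top_ratio:.2%} of rows -> candidate for removal")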
11. Feature selection (ridge regression)
def ridge_cv(self, train_data_df, feature_columns):
    """
    Note: price is assumed to be log-transformed (approximately normal) at this point
    """
    # Ridge regression handles collinearity between features
    from sklearn import linear_model
    # Initialize a ridge regression with built-in cross-validation of alpha
    # train_data_df = train_data_df.fillna(0)
    data = train_data_df[feature_columns]
    clf = linear_model.RidgeCV(fit_intercept=False)
    # Fit the ridge regression model
    clf.fit(data, train_data_df['price'])
    print('selected alpha: ', clf.alpha_)
    # Rank features by the absolute value of their coefficients
    # (sorting the coefficients alone would break the feature-to-coefficient mapping)
    coef_by_feature = dict(zip(feature_columns, map(abs, clf.coef_)))
    rst = sorted(coef_by_feature.items(), key=lambda kv: kv[1], reverse=True)
    print(len(rst), len(feature_columns))
    print('coefficients by feature:', [(k, '{:.5f}'.format(v)) for k, v in rst])
12. Feature selection (stepwise regression)
def stepwise_selection(self, X, y,
                       initial_list=[],
                       threshold_in=0.01,
                       threshold_out=0.05,
                       verbose=True):
    """
    Stepwise regression for feature selection: repeatedly add the excluded feature with
    the smallest p-value below threshold_in, then drop any included feature whose
    p-value rises above threshold_out, until nothing changes.
    """
    included = list(initial_list)
    while True:
        changed = False
        # forward step
        excluded = list(set(X.columns) - set(included))
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            # idxmin returns the feature name (argmin would return a position in newer pandas)
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add {:30} with p-value {:.6}'.format(best_feature, best_pval))
        # backward step
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max()  # NaN if pvalues is empty
        if worst_pval > threshold_out:
            changed = True
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included
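A hedged usage sketch (the eda instance and feature_columns list are assumptions, not values from the original post):

# Hypothetical call: select features against the log-transformed price
selected_features = eda.stepwise_selection(train_data_df[feature_columns], train_data_df['price'])
print('selected features:', selected_features)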
13. Feature selection (xgboost)
def xgb_model_fit(self, train_data_df, predictors, alg, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        # Use xgb.cv with early stopping to pick the number of boosting rounds
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(train_data_df[predictors], label=train_data_df['price'])
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='mae', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    # Fit the algorithm on the data
    # (in xgboost >= 2.0 eval_metric is passed to the constructor instead of fit)
    alg.fit(train_data_df[predictors], train_data_df['price'], eval_metric='mae')
    # Predict on the training set
    train_data_df_predictions = alg.predict(train_data_df[predictors])
    # Regression evaluation metric
    print("mean_absolute_error is : ")
    print(mean_absolute_error(train_data_df['price'], train_data_df_predictions))
    # feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    # feat_imp.plot(kind='bar', title='Feature Importances')
    plot_importance(alg)
    plt.ylabel('Feature Importance')
    plt.show()
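A hedged usage sketch (the hyperparameters, the predictors list, and the eda instance are illustrative assumptions, not values from the original post):

# Hypothetical call: fit an XGBRegressor on the candidate features and plot importances
predictors = [c for c in train_data_df.columns if c not in ['price', 'SaleID']]
alg = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.1, max_depth=5)
eda.xgb_model_fit(train_data_df, predictors, alg)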
The xgboost feature-selection results are as follows:
the MAE is 0.14;
the feature importances are shown below: