Task 2: Data Analysis

1. Understanding the Competition Task

The goal of this data mining task is to predict the transaction price of used cars, which is essentially a regression problem.
(1) Dataset overview
The full dataset contains more than 370,000 records; 150,000 of them are sampled as the training set, 50,000 as test set A, and 50,000 as test set B. Each record has 31 columns of variables (including the target price).
(2) Evaluation metrics
Classification and regression problems call for different evaluation metrics.
Classification:

  • Binary classification: accuracy, precision, recall, F-score, PR curve, ROC curve / AUC
  • Multi-class classification: accuracy, macro-average, micro-average, F-score

Regression:
MAE, MSE, MAPE, RMSE, R²
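
For reference, with y_i the true value, ŷ_i the predicted value, ȳ the mean of the true values, and n the number of samples, these regression metrics are defined as:

  • MAE = (1/n) Σ |y_i - ŷ_i|
  • MSE = (1/n) Σ (y_i - ŷ_i)²
  • RMSE = √MSE
  • MAPE = (1/n) Σ |(y_i - ŷ_i) / y_i|
  • R² = 1 - Σ (y_i - ŷ_i)² / Σ (y_i - ȳ)²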

(3) Package imports and basic usage

# Step 1: Read the data
import pandas as pd  # used for reading the data
train_data = pd.read_csv('used_car_train_20200313.csv', delimiter=' ', header=0)
test_data = pd.read_csv('used_car_testA_20200313.csv', delimiter=' ', header=0)
# Take a quick look at the data
print('train data shape:', train_data.shape)  # print the shape of the training data: (150000, 31)
print(train_data.head())    # print the first 5 rows of the training data
print(train_data.info())    # print per-column information, including dtype and whether nulls exist
print(train_data.columns)  # print the column names of the training data
print(train_data.describe())  # print summary statistics for each column: count, mean, std, min, 25%, 50%, 75%, max
print(pd.concat([train_data.head(), train_data.tail()]))  # view the first and last rows together

# Step 2: Import the evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score    # classification
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score  # regression
# MAPE and RMSE are not provided directly and have to be implemented manually
import numpy as np
# MAPE (mean absolute percentage error)
def mape(y_true, y_pred):
    return np.mean(np.abs((y_pred - y_true) / y_true))
# RMSE (root mean squared error): the square root of sklearn's MSE
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))
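
A quick sanity check of these two helpers on a pair of made-up arrays (the numbers below are purely illustrative and not taken from the competition data):

y_true = np.array([3000, 5000, 8000])
y_pred = np.array([2800, 5500, 7600])
print('MAPE:', mape(y_true, y_pred))  # about 0.072
print('RMSE:', rmse(y_true, y_pred))  # about 387.3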

2. EDA: Exploratory Data Analysis

Exploratory data analysis is meant to build an initial understanding of the dataset: how the variables relate to one another and to the target value. It is an important step before feature engineering. The main tasks are:

  • Check whether missing values and outliers exist; if they do, decide how to fill the missing values and how to remove the outliers.
  • Understand the distribution of the target value: its overall shape, skewness, kurtosis, and concrete frequency counts.
  • Analyse the numeric features: correlations between them, skewness and kurtosis of each feature, visualisation of the feature distributions, and visualisation of pairwise regression relationships with the target (mainly to check whether the features follow a normal distribution and, if not, how to normalise them).
  • Analyse the categorical features: unique-value counts, box plots, violin plots, bar plots, category frequency plots, etc.
  • Generate a data report with pandas_profiling.

Code example:

# Import the plotting packages
import matplotlib.pyplot as plt
import seaborn as sns
# 1. Check for missing values and outliers
print(train_data.isnull().sum())  # count the NaNs in every column
# Visualise the NaNs
missing = train_data.isnull().sum()
missing = missing[missing > 0]
missing.sort_values(inplace=True)
missing.plot.bar()  # bar chart of the columns that contain missing values

If only a small number of NaNs exist, they can be filled when using traditional machine learning models; when using tree models such as LightGBM/XGBoost, the gaps can simply be left for the trees to handle; and if the NaNs do not affect the distribution, dropping them directly is also an option.
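
A minimal sketch of the filling options just mentioned, done on a copy so the original frame used below stays untouched ('power' and 'fuelType' are only examples here; mean filling suits numeric columns, mode filling suits categorical ones):

filled = train_data.copy()
# fill a numeric column with its mean (e.g. 'power')
filled['power'] = filled['power'].fillna(filled['power'].mean())
# fill a categorical column with its mode (e.g. 'fuelType')
filled['fuelType'] = filled['fuelType'].fillna(filled['fuelType'].mode()[0])
# or, when NaNs are rare and do not affect the distribution, drop them directly
# filled = filled.dropna()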

# 2. Visualise the missing values with missingno
import missingno as msno
msno.matrix(train_data.sample(250))  # matrix view of missingness on a 250-row sample
msno.bar(train_data.sample(1000))    # bar chart of non-null counts on a 1000-row sample
# 3. Outlier detection
train_data['notRepairedDamage'].value_counts()  # frequency counts; note the '-' placeholder
train_data['notRepairedDamage'].replace('-', np.nan, inplace=True)  # replace the '-' placeholder with NaN
train_data['notRepairedDamage'].value_counts()  # check the counts again
# 4. Inspect severely skewed features and drop them
train_data["seller"].value_counts()
train_data["offerType"].value_counts()
del train_data["seller"]
del train_data["offerType"]
# 5. Understand the distribution of the target value
train_data['price'].value_counts()
# 6. Overall distribution (fitting an unbounded Johnson SU distribution)
import scipy.stats as st
y = train_data['price']
plt.figure(1); plt.title('Johnson SU')
sns.distplot(y, kde=False, fit=st.johnsonsu)
plt.figure(2); plt.title('Normal')
sns.distplot(y, kde=False, fit=st.norm)
plt.figure(3); plt.title('Log Normal')
sns.distplot(y, kde=False, fit=st.lognorm)
# If the target does not follow a normal distribution, it should be transformed before regression, since most regression models behave better when the target is roughly normal; the unbounded Johnson SU distribution is generally used for this transformation.
# 7. Check skewness and kurtosis
sns.distplot(train_data['price']);
print("Skewness: %f" % train_data['price'].skew())
print("Kurtosis: %f" % train_data['price'].kurt())
train_data.skew(numeric_only=True)  # skewness of every numeric column
train_data.kurt(numeric_only=True)  # kurtosis of every numeric column
sns.distplot(train_data.skew(numeric_only=True), color='blue', axlabel='Skewness')
sns.distplot(train_data.kurt(numeric_only=True), color='orange', axlabel='Kurtosis')
# 8. Check the concrete frequency counts of the target value
plt.hist(train_data['price'], orientation = 'vertical',histtype = 'bar', color ='red')
plt.show()
# 9. The distribution after a log transform is much more even, so the log of the price can be predicted instead; this is a common trick for prediction problems
plt.hist(np.log(train_data['price']), orientation = 'vertical',histtype = 'bar', color ='red')
plt.show()
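# A small sketch of how this trick is applied end to end. It uses log1p (log(1+x)), a common
# variant of the plain log above, chosen so that a price of 0 would not cause problems:
y_log = np.log1p(train_data['price'])  # train the regression model on the log-transformed target
# ... fit a model on y_log instead of price ...
price_back = np.expm1(y_log)           # transform predictions back to the original price scale
print(np.allclose(price_back, train_data['price']))  # True: the transform is exactly invertible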
# 10. Separate the label (the target value)
Y_train = train_data['price']
numeric_features = train_data.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = train_data.select_dtypes(include=['object']).columns.tolist()
# the feature lists can also be specified manually
# 11. nunique distribution of the categorical features
for cat_fea in categorical_features:
    print(cat_fea + " distribution:")
    print("{} has {} distinct values".format(cat_fea, train_data[cat_fea].nunique()))
    print(train_data[cat_fea].value_counts())
# Numeric feature analysis
# 12. Correlation analysis
price_numeric = train_data[numeric_features]
correlation = price_numeric.corr()
print(correlation['price'].sort_values(ascending = False),'\n')
f , ax = plt.subplots(figsize = (7, 7))
plt.title('Correlation of Numeric Features with Price',y=1,size=16)
sns.heatmap(correlation,square = True, vmax=0.8)
# 13. Check the skewness and kurtosis of each numeric feature
for col in numeric_features:
    print('{:15}'.format(col), 'Skewness: {:05.2f}'.format(train_data[col].skew()), ' ', 'Kurtosis: {:06.2f}'.format(train_data[col].kurt()))
# 14. Visualise the distribution of every numeric feature
f = pd.melt(train_data, value_vars=numeric_features)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False)
g = g.map(sns.distplot, "value")
# 15. Visualise the pairwise relationships between the numeric features
sns.set()
columns = ['price', 'v_12', 'v_8' , 'v_0', 'power', 'v_5', 'v_2', 'v_6', 'v_1', 'v_14']
sns.pairplot(train_data[columns], height=2, kind='scatter', diag_kind='kde')
plt.show()
# 16. Visualise the regression relationship between each of these features and the price
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8), (ax9, ax10)) = plt.subplots(nrows=5, ncols=2, figsize=(24, 20))
# ['v_12', 'v_8', 'v_0', 'power', 'v_5', 'v_2', 'v_6', 'v_1', 'v_14', 'v_13']
v_12_scatter_plot = pd.concat([Y_train, train_data['v_12']], axis=1)
sns.regplot(x='v_12', y='price', data=v_12_scatter_plot, scatter=True, fit_reg=True, ax=ax1)
v_8_scatter_plot = pd.concat([Y_train, train_data['v_8']], axis=1)
sns.regplot(x='v_8', y='price', data=v_8_scatter_plot, scatter=True, fit_reg=True, ax=ax2)
v_0_scatter_plot = pd.concat([Y_train, train_data['v_0']], axis=1)
sns.regplot(x='v_0', y='price', data=v_0_scatter_plot, scatter=True, fit_reg=True, ax=ax3)
power_scatter_plot = pd.concat([Y_train, train_data['power']], axis=1)
sns.regplot(x='power', y='price', data=power_scatter_plot, scatter=True, fit_reg=True, ax=ax4)
v_5_scatter_plot = pd.concat([Y_train, train_data['v_5']], axis=1)
sns.regplot(x='v_5', y='price', data=v_5_scatter_plot, scatter=True, fit_reg=True, ax=ax5)
v_2_scatter_plot = pd.concat([Y_train, train_data['v_2']], axis=1)
sns.regplot(x='v_2', y='price', data=v_2_scatter_plot, scatter=True, fit_reg=True, ax=ax6)
v_6_scatter_plot = pd.concat([Y_train, train_data['v_6']], axis=1)
sns.regplot(x='v_6', y='price', data=v_6_scatter_plot, scatter=True, fit_reg=True, ax=ax7)
v_1_scatter_plot = pd.concat([Y_train, train_data['v_1']], axis=1)
sns.regplot(x='v_1', y='price', data=v_1_scatter_plot, scatter=True, fit_reg=True, ax=ax8)
v_14_scatter_plot = pd.concat([Y_train, train_data['v_14']], axis=1)
sns.regplot(x='v_14', y='price', data=v_14_scatter_plot, scatter=True, fit_reg=True, ax=ax9)
v_13_scatter_plot = pd.concat([Y_train, train_data['v_13']], axis=1)
sns.regplot(x='v_13', y='price', data=v_13_scatter_plot, scatter=True, fit_reg=True, ax=ax10)
# Categorical feature analysis
# 17. unique distribution
for fea in categorical_features:
	print(train_data[fea].nunique())
# 18. Box plots of the categorical features
categorical_features = ['model','brand','bodyType','fuelType','gearbox','notRepairedDamage']
for c in categorical_features:
	train_data[c] = train_data[c].astype('category')
	if train_data[c].isnull().any():
		train_data[c] =train_data[c].cat.add_categories(['MISSING'])
		train_data[c] = train_data[c].fillna('MISSING')
def boxplot(x, y, **kwargs):
	sns.boxplot(x=x, y=y)
	x=plt.xticks(rotation=90)
	
f = pd.melt(train_data, id_vars=['price'], value_vars=categorical_features)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, height=5)
g = g.map(boxplot, "value", "price")
# 19. Violin plots of the categorical features
catg_list = categorical_features
target = 'price'
for catg in catg_list:
    sns.violinplot(x=catg, y=target, data=train_data)
    plt.show()
# 20. Bar plots of the categorical features
def bar_plot(x, y, **kwargs):
	sns.barplot(x=x, y=y)
	x=plt.xticks(rotation=90)
	
f = pd.melt(train_data, id_vars=['price'], value_vars=categorical_features)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, height=5)
g = g.map(bar_plot, "value", "price")
# 21. Frequency counts of each category (count_plot)
def count_plot(x, **kwargs):
	sns.countplot(x=x)
	x=plt.xticks(rotation=90)
	
f = pd.melt(train_data, value_vars=categorical_features)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, height=5)
g = g.map(count_plot, "value")
# 22. Generate a data report with pandas_profiling
import pandas_profiling
pfr = pandas_profiling.ProfileReport(train_data)
pfr.to_file("./example.html")

Summary

  • How missing values are handled should depend on the proportion that is missing. Roughly speaking, columns with more than 30% missing need to be filled, and the filling strategy (mean, zero, or mode filling) should be chosen based on the characteristics of the feature.
  • Outliers need to be analysed separately: is the label of an abnormal record also abnormal? Should the outlier be removed or replaced with a normal value? Is it a recording error or a genuine anomaly of the machine itself? (A small clipping sketch follows after this list.)
  • The target value needs to be analysed on its own.
  • Understand how the different features are distributed, both numeric and categorical.
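
As a rough sketch of one common way to deal with the outliers mentioned above (generic IQR-based clipping; the 'power' column and the factor 1.5 are only conventional examples, not something prescribed by the task):

def clip_outliers_iqr(series, k=1.5):
    # clip values outside [Q1 - k*IQR, Q3 + k*IQR]
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = q3 - q1
    return series.clip(lower=q1 - k * iqr, upper=q3 + k * iqr)

train_data['power'] = clip_outliers_iqr(train_data['power'])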