1. 導入工具包
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from operator import itemgetter
%matplotlib inline
2. 數據導入及查看
# Both competition files are space-delimited, hence sep=' '.
TRAIN_CSV = 'used_car_train_20200313.csv'
TEST_CSV = 'used_car_testA_20200313.csv'
Train_data = pd.read_csv(TRAIN_CSV, sep=' ')
Test_data = pd.read_csv(TEST_CSV, sep=' ')
# Report (rows, columns) of each frame, one per line.
print(Train_data.shape, Test_data.shape, sep='\n')
輸出:
(150000, 31)
(50000, 30)
- 查看數據
3. 異常值處理
3.1 刪除極限值
def outliers_proc(data, col_name, scale=3, plot=True):
    """
    Drop the rows of ``data`` whose ``col_name`` value is a box-plot outlier.

    A value counts as an outlier when it lies more than ``scale`` times the
    interquartile range (IQR) below the first quartile or above the third
    quartile. The number of removed rows, the remaining row count and a
    ``describe()`` summary of the low and of the high outliers are printed.

    :param data: pandas DataFrame to clean; it is copied, never modified.
    :param col_name: name of the column screened for outliers.
    :param scale: multiple of the IQR used as the fence distance (default 3).
    :param plot: when True (default), draw side-by-side box plots of the
        column before and after cleaning, using the module-level ``plt``
        and ``sns``; pass False to skip plotting.
    :return: a copy of ``data`` without the outlier rows, re-indexed from 0.
    """
    def box_plot_outliers(data_ser, box_scale):
        """
        Flag the values of a Series that fall outside the box-plot fences.

        :param data_ser: pandas Series to examine.
        :param box_scale: multiple of the IQR added beyond each quartile.
        :return: ``((rule_low, rule_up), (val_low, val_up))`` -- boolean
            masks for values below / above the fences, then the fences.
        """
        q1 = data_ser.quantile(0.25)
        q3 = data_ser.quantile(0.75)
        # Fence distance: the interquartile range scaled by box_scale.
        iqr = box_scale * (q3 - q1)
        # Lower fence.
        val_low = q1 - iqr
        # Upper fence.
        val_up = q3 + iqr
        # Boolean mask of values below the lower fence.
        rule_low = (data_ser < val_low)
        # Boolean mask of values above the upper fence.
        rule_up = (data_ser > val_up)
        return (rule_low, rule_up), (val_low, val_up)

    data_n = data.copy()
    data_series = data_n[col_name]
    (rule_low, rule_up), _ = box_plot_outliers(data_series, box_scale=scale)

    # Work with positional boolean arrays rather than DataFrame.drop():
    # drop() takes index labels, which only coincide with row positions
    # when the frame carries a default 0..n-1 index.
    low_mask = rule_low.to_numpy()
    up_mask = rule_up.to_numpy()
    outlier_mask = low_mask | up_mask

    print("Delete number is: {}".format(int(outlier_mask.sum())))
    data_n = data_n.loc[~outlier_mask].reset_index(drop=True)
    # shape[0] is the number of rows that remain.
    print("Now row number is: {}".format(data_n.shape[0]))

    # Summary of the values below the lower fence.
    print("Description of data less than the lower bound is:")
    print(data_series[low_mask].describe())
    # Summary of the values above the upper fence.
    print("Description of data larger than the upper bound is:")
    print(data_series[up_mask].describe())

    if plot:
        # Left: the column before cleaning; right: after cleaning.
        fig, ax = plt.subplots(1, 2, figsize=(10, 7))
        sns.boxplot(y=data[col_name], data=data, palette="Set1", ax=ax[0])
        sns.boxplot(y=data_n[col_name], data=data_n, palette="Set1", ax=ax[1])
    return data_n
這裏以 "power" 特徵爲例;但要注意 test 的數據不能刪,異常值只在 train 上清除。
# Clean the 'power' column of the training frame only.
Train_data = outliers_proc(data=Train_data, col_name='power', scale=3)
4. 特徵構造
4.1 使用時長特徵的構造
一輛二手車的售價高低,和它的使用時間應該是有關係的:同一型號的車,理論上使用時間越短,售價越高。使用時間 = data['creatDate'] - data['regDate'],反映汽車的使用時長;一般來說,價格與使用時間呈負相關。
# Stack the training and test rows into one frame so features are built once;
# the 'train' flag (1 = training row, 0 = test row) keeps them separable.
Train_data['train'] = 1
Test_data['train'] = 0
data = pd.concat([Train_data, Test_data], ignore_index=True)
# Some dates are malformed, so errors='coerce' turns them into NaT
# instead of raising.
created_on = pd.to_datetime(data['creatDate'], format='%Y%m%d', errors='coerce')
registered_on = pd.to_datetime(data['regDate'], format='%Y%m%d', errors='coerce')
# New column: days between registration and listing creation.
data['used_time'] = (created_on - registered_on).dt.days
# Count the rows whose used_time is missing.
data['used_time'].isna().sum()
輸出:
15072
查看:
4.2 城市特徵的構造
# Extract the city information from the postal code (regionCode) by dropping
# its last three characters -- this injects prior knowledge into the features.
# Codes of three characters or fewer yield an empty string.
# NOTE(review): assumes regionCode holds integers; for a float column str()
# would append '.0' and the slice would cut the wrong characters -- confirm.
data['city'] = data['regionCode'].apply(lambda x: str(x)[:-3])
5. 特徵變換
- 以"power"特徵爲例:
現在還有這麼奇怪的分佈,是因爲 test 中仍有 power 極端值;對 train 進行極端值處理後就正常了。
對 power 特徵取 log,再做歸一化
from sklearn import preprocessing
# NOTE(review): this scaler instance is not used in the lines visible here.
min_max_scaler = preprocessing.MinMaxScaler()
# Take log(power + 1), then min-max rescale the result.
log_power = np.log(data['power'] + 1)
log_power_min = np.min(log_power)
log_power_max = np.max(log_power)
data['power'] = (log_power - log_power_min) / (log_power_max - log_power_min)
# Histogram of the transformed column.
data['power'].plot.hist()
- 以 'kilometer' 爲例,直接歸一化
- 對其它特徵做歸一化
# Besides these, there are the statistic features constructed earlier:
# 'brand_amount', 'brand_price_average', 'brand_price_max',
# 'brand_price_median', 'brand_price_min', 'brand_price_std',
# 'brand_price_sum'
# They are not analysed one by one here; they are simply transformed directly.
def max_min(x):
    """
    Min-max scale ``x``: subtract its minimum and divide by its range.

    :param x: array-like supporting arithmetic (e.g. a pandas Series or a
        numpy array).
    :return: ``(x - min) / (max - min)``; when every value is identical the
        range is zero, so the range is treated as 1 and the result is all
        zeros instead of the NaN produced by 0/0.
    """
    lowest = np.min(x)
    span = np.max(x) - lowest
    # Guard the constant-input case: dividing by a zero range gives NaN.
    if span == 0:
        span = 1
    return (x - lowest) / span
# Min-max scale each brand statistic column with the max_min helper rather
# than repeating the same formula once per column.
brand_stat_cols = [
    'brand_amount', 'brand_price_average', 'brand_price_max',
    'brand_price_median', 'brand_price_min', 'brand_price_std',
    'brand_price_sum',
]
for stat_col in brand_stat_cols:
    data[stat_col] = max_min(data[stat_col])
- 對類別特徵進行 One-Hot 編碼(one-hot encoding)
# One-hot encode the categorical columns listed below.
# NOTE(review): 'power_bin' is not created in the lines visible here --
# confirm it is built elsewhere in the file before this runs.
categorical_cols = [
    'model', 'brand', 'bodyType', 'fuelType',
    'gearbox', 'notRepairedDamage', 'power_bin',
]
data = pd.get_dummies(data, columns=categorical_cols)