數據挖掘實戰--二手車交易價格預測(三)模型訓練和預測

異常值分析與處理

# Load the test split (space-separated file).
test_df = pd.read_csv('D:/DataMining/Test Data/used_car_testA_20200313.csv', sep=' ')
# Log-transform price with log(1 + x) so its distribution is closer to normal.
train_df['price'] = np.log1p(train_df['price'])
# Drop outlier rows whose transformed price is below 2.
train_df.drop(train_df[train_df['price'] < 2].index, inplace=True)
# Cap power at 600 in both splits. clip() is the vectorized equivalent of
# mapping `600 if x > 600 else x` over every value.
train_df['power'] = train_df['power'].clip(upper=600)
test_df['power'] = test_df['power'].clip(upper=600)
# Stack train and test so later feature engineering is applied to both at once.
all_features = pd.concat([train_df, test_df], sort=False).reset_index(drop=True)

填充缺失值

將 fuelType、gearbox、bodyType、model 中的空值填充爲訓練集對應特徵的均值

def fill_missing(df, reference=None,
                 columns=('fuelType', 'gearbox', 'bodyType', 'model')):
    """Fill missing values in selected columns with a reference mean.

    Args:
        df: DataFrame to impute. It is modified in place.
        reference: DataFrame whose column means supply the fill values.
            Defaults to the module-level ``train_df``, so statistics come
            from the training split only.
        columns: Names of the columns to impute. Each must exist in both
            ``df`` and ``reference``.

    Returns:
        The same ``df`` object, with NaNs in ``columns`` replaced.

    NOTE(review): these columns look like integer category codes, so the
    mean is generally not an existing code -- confirm this is intended.
    """
    if reference is None:
        # Resolved at call time to keep the original one-argument behaviour.
        reference = train_df
    for column in columns:
        df[column] = df[column].fillna(reference[column].mean())
    return df
# Impute the combined train/test frame.
all_features = fill_missing(all_features)

數據類型轉換

部分分類特徵是以數值形式存儲的,需要將其轉換爲字符串類型

def data_astype(df):
    """Convert the numerically stored categorical columns of ``df`` to strings.

    Columns flagged ``True`` below are cast to ``int`` first, so a value such
    as ``1.0`` is rendered as ``'1'``. The others are stringified as they are.
    ``df`` is modified in place and also returned.
    """
    # (column, cast to int before str) in the order the casts are applied.
    conversions = (
        ('SaleID', True),
        ('name', True),
        ('model', False),
        ('brand', False),
        ('bodyType', False),
        ('fuelType', False),
        ('gearbox', False),
        ('notRepairedDamage', False),
        ('regionCode', True),
        ('seller', True),
        ('offerType', True),
    )
    for column, via_int in conversions:
        values = df[column]
        if via_int:
            values = values.astype(int)
        df[column] = values.astype(str)

    return df
    
# Apply the string conversion to the combined train/test frame.
all_features = data_astype(all_features)

提取年份和月份

def _slice_parser(start, stop):
    """Build a mapper that parses characters [start:stop] of str(value) as int."""
    def parse(value):
        return int(str(value)[start:stop])
    return parse


_year_of = _slice_parser(None, 4)   # leading four characters
_month_of = _slice_parser(4, 6)     # fifth and sixth characters

# Year of the registration date and of the listing-creation date.
all_features['regYear'] = all_features['regDate'].map(_year_of)
all_features['createYear'] = all_features['creatDate'].map(_year_of)
# Month of the same two dates.
all_features['regMonth'] = all_features['regDate'].map(_month_of)
all_features['createMonth'] = all_features['creatDate'].map(_month_of)
# Number of months between registration and listing, and the same gap in years.
year_gap = all_features['createYear'] - all_features['regYear']
month_gap = all_features['createMonth'] - all_features['regMonth']
all_features['months'] = year_gap * 12 + month_gap
all_features['years'] = all_features['months'] / 12

# Summary statistics of the month gap.
all_features['months'].describe()
# Histogram of the month gap.
all_features['months'].hist()

在這裏插入圖片描述
在這裏插入圖片描述

編碼分類變量

刪除一些不要的特徵。

# Identifier, raw-date and intermediate columns that are not kept as features.
unused_columns = [
    'SaleID', 'name', 'regDate', 'model', 'seller',
    'offerType', 'creatDate', 'regYear', 'regionCode',
    'createYear', 'regMonth', 'createMonth', 'months',
]
all_features = all_features.drop(columns=unused_columns)

查看剩餘的特徵類型

# Inspect the dtype of each remaining column.
all_features.dtypes

在這裏插入圖片描述
創建相關性組合,爲之後的刪除相關性高的變量做準備

# Pairwise correlation over numeric (and boolean) columns only. The frame
# still holds string-typed categorical columns, which corr() rejects on
# pandas >= 2.0 unless they are excluded first.
corr = all_features.select_dtypes(include=['number', 'bool']).corr()
# Every unordered pair of columns present in the correlation matrix.
feature_group = list(itertools.combinations(corr.columns, 2))
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章