Tianchi Used-Car Price Prediction: Feature Engineering

Feature Engineering

(1) Drop features whose distributions are severely imbalanced

(2) Apply a log-normal transform to the target (price) (a minimal sketch follows this list)

(3) Bucket the date features (by year and by month)

(4) Compute the interval in days between the two date fields

(5) Build brand-price aggregate features (max, min, mean, variance)

(6) Detect outliers in the power field

(7) Build power-price aggregate features

(8) Process the region code

(9) Code

(10) Results
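
A minimal sketch of item (2), the log transform of the target: train on log(price) and map predictions back with exp afterwards. This toy example uses np.log1p/np.expm1, a zero-safe variant of the plain np.log used in the code below; the column name 'price' matches the competition data, everything else is illustrative.

import numpy as np
import pandas as pd

df = pd.DataFrame({'price': [500, 1200, 7500, 99999]})  # toy prices
df['log_price'] = np.log1p(df['price'])  # the model would be trained on this
preds_log = df['log_price']              # stand-in for model predictions
preds = np.expm1(preds_log)              # back to the original price scale
print(preds)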

Code

#!/usr/bin/env python
# coding: utf-8
import os
from collections import Counter

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import scipy
import statsmodels.api as sm
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import plot_importance
import xgboost as xgb
import lightgbm as lgb

class DataSearch(object):
    def load_data(self, train_data_path, test_data_path):
        # Initial statistics on the training data
        pd.set_option('display.max_columns', None)
        train_data_df = pd.read_csv(train_data_path, sep=' ')
        test_data_df = pd.read_csv(test_data_path, sep=' ')

        # Tag each row with its origin, then stack train and test together
        train_data_df['train'] = 1
        test_data_df['train'] = 0
        data = pd.concat([train_data_df, test_data_df], ignore_index=True)

        print("Number of training rows:\n", train_data_df.count())
        print("Number of test rows:\n", test_data_df.count())
        print("Rows after concatenating train and test:\n", data.count())
        print("Rows after dropping duplicates:\n", data.drop_duplicates().count())
        print(data.describe(include='all'))
        # Null-value counts per column
        print(data.isnull().sum())
        print(data.columns)

        return data


    def categorial_statistus(self, train_data_df, category_columns):
        """
        Frequency statistics for the categorical (string-like) features.
        """
        print(train_data_df.columns)
        for i in category_columns:
            total = pd.DataFrame({'count': train_data_df.groupby(i).size()})
            total = total.sort_values(['count'], ascending=False)
            print(total, '\n', total.count())
        return train_data_df

    def categorial_extend(self, train_data_df):
        """
        Expand the categorical/date features.
        """
        def fun(x):
            # Some regDate values carry an invalid month of '00'; impute '03'
            if str(x)[4:6] == '00':
                return str(x)[0:4] + '03' + str(x)[6:]
            else:
                return str(x)

        train_data_df['regDate'] = train_data_df['regDate'].apply(fun)

        # Bucket the registration date by year and by month
        train_data_df["year_regDate"] = train_data_df['regDate'].astype("str").str[0:4]
        train_data_df["month_regDate"] = train_data_df['regDate'].astype("str").str[4:6]

        # Build the usage-time feature: days between creatDate and regDate
        train_data_df['used_time'] = (pd.to_datetime(train_data_df['creatDate'], format='%Y%m%d', errors='coerce') -
                             pd.to_datetime(train_data_df['regDate'], format='%Y%m%d', errors='coerce')).dt.days

        # Build brand-price (and power-price) aggregate features, computed
        # only on the training portion to avoid leaking test rows
        train_data_df_actual = train_data_df[train_data_df['train'] == 1]

        def combine_data(train_data_df, column):
            Train_gb = train_data_df_actual.groupby(column)
            all_info = {}
            for kind, kind_data in Train_gb:
                info = {}
                kind_data = kind_data[kind_data['price'] > 0]
                info[column + 'amount'] = len(kind_data)
                info[column + 'price_max'] = kind_data.price.max()
                info[column + 'price_median'] = kind_data.price.median()
                info[column + 'price_min'] = kind_data.price.min()
                info[column + 'price_sum'] = kind_data.price.sum()
                info[column + 'price_std'] = kind_data.price.std()
                # +1 in the denominator guards against empty groups
                info[column + 'price_average'] = round(kind_data.price.sum() / (len(kind_data) + 1), 2)
                all_info[kind] = info
            brand_fe = pd.DataFrame(all_info).T.reset_index().rename(columns={"index": column})
            # Bug fix: merge on the grouping column, not always on 'brand'
            train_data_df = train_data_df.merge(brand_fe, how='left', on=column)
            return train_data_df

        train_data_df = combine_data(train_data_df, "brand")
        train_data_df = combine_data(train_data_df, "power")
        print(train_data_df.describe(include='all'))

        return train_data_df
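    # Aside: pandas can build the same per-group price aggregates in a single
    # groupby/agg call. A sketch (not used above; names are illustrative):
    # brand_fe = (train_data_df[train_data_df['price'] > 0]
    #             .groupby('brand')['price']
    #             .agg(['count', 'max', 'median', 'min', 'sum', 'std', 'mean'])
    #             .add_prefix('brandprice_')
    #             .reset_index())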

    # Visualize the numeric features
    def plot_nemurical(self, train_data_df, numerical_columns):
        # 3) Distribution of each continuous numeric feature
        f = pd.melt(train_data_df, value_vars=numerical_columns)
        g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False)
        # distplot is deprecated in newer seaborn; histplot is the modern equivalent
        g = g.map(sns.distplot, "value")

        # 4) Pairwise relationships between selected numeric features
        sns.set()
        columns = ['price', 'v_12', 'v_8', 'v_0', 'power', 'v_5', 'v_2', 'v_6', 'v_1', 'v_14']
        sns.pairplot(train_data_df[columns], height=2, kind='scatter', diag_kind='kde')
        plt.show()

    def normal_test(self, train_data_df):
        # Normality test for the continuous features. A small p-value means
        # the sample is unlikely to come from a normal distribution. In
        # practice none of these pass, so the important fields need a transform.
        numerical_columns = ['regDate', 'power', 'kilometer', 'creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3',
                             'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12',
                             'v_13', 'v_14']
        train_data_df['regDate'] = train_data_df['regDate'].astype('int')
        # Bug fix: replace(..., inplace=True) returns None, so don't assign it back
        train_data_df['notRepairedDamage'] = train_data_df['notRepairedDamage'].replace('-', np.nan)
        train_data_df = train_data_df.fillna(0)
        train_data_df.info()
        print('p-values of the normality tests (small values argue against normality):',
              list(map(lambda x: scipy.stats.normaltest(train_data_df[x])[1], numerical_columns)))

    # Plot each field before and after the log transform
    def log_plot(self, train_data_df):
        # 3) Frequency histograms of the raw values
        plt.subplot(2, 2, 1)
        plt.hist(train_data_df['price'], orientation='vertical', histtype='bar', color='red', label='price')
        plt.subplot(2, 2, 2)
        plt.hist(train_data_df['kilometer'], orientation='vertical', histtype='bar', color='green', label='kilometer')

        # After a log transform the price distribution is much more even, so
        # predicting log(price) is a common trick for this kind of problem
        plt.subplot(2, 2, 3)
        plt.hist(np.log(train_data_df['price']), orientation='vertical', histtype='bar', color='red', label='price')
        plt.subplot(2, 2, 4)

        # For a discrete field like kilometer the log transform barely helps,
        # and power contains zeros, so np.log raises
        # "ValueError: supplied range of [-inf, 9.868481943337313] is not finite"
        # plt.hist(np.log(train_data_df['kilometer']), orientation='vertical', histtype='bar', color='red', label='kilometer')
        # plt.hist(np.log(train_data_df['power']), orientation='vertical', histtype='bar', color='red', label='power')
        plt.show()

    def change_to_nomal(self, train_data_df):
        """
        Log-transform the target so it is closer to normal.
        """
        # Bug fix: chained indexing like df[mask]['price'] = ... assigns to a
        # copy; use .loc and transform only the labelled training rows
        mask = train_data_df['train'] == 1
        train_data_df.loc[mask, 'price'] = np.log(train_data_df.loc[mask, 'price'])
        return train_data_df



    # Outlier detection (IQR rule)
    def detect_outliers(self, df, n, features):
        """
        Drop rows that are outliers (outside 1.5 * IQR) in more than n of
        the given feature columns.
        """
        outlier_indices = []
        df_raw = df
        # Only detect outliers on the training portion of the data
        df = df[df['train'] == 1]
        # iterate over features (columns)
        for col in features:
            # 1st quartile (25%)
            Q1 = np.percentile(df[col], 25)
            # 3rd quartile (75%)
            Q3 = np.percentile(df[col], 75)
            # interquartile range (IQR)
            IQR = Q3 - Q1
            # outlier step
            outlier_step = 1.5 * IQR

            # indices of the outlier rows for feature col
            outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)].index

            # collect the outlier indices found for col
            outlier_indices.extend(outlier_list_col)

        # select observations flagged as outliers in more than n features
        outlier_indices = Counter(outlier_indices)
        print("outlier_indices is ", outlier_indices)
        print("outlier_indices length is ", len(outlier_indices))

        multiple_outliers = list(k for k, v in outlier_indices.items() if v > n)
        print("multiple_outliers is ", multiple_outliers)
        # Drop those training rows from the combined frame
        df_raw = df_raw.drop(multiple_outliers)
        print("set(multiple_outliers) & set(df_raw.index) should be empty: ", set(multiple_outliers) & set(df_raw.index))
        return df_raw
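    # Usage sketch (hypothetical feature choice): drop training rows flagged
    # as IQR outliers in more than one of the listed columns
    # data_df = data_search.detect_outliers(data_df, 1, ['power', 'kilometer'])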

    def person_corr(self, train_data_df):
        # Pearson correlation analysis
        numerical_columns = ['regDate', 'power', 'kilometer', 'creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3',
                             'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12',
                             'v_13', 'v_14']
        # 1) Correlation of the numeric features with price
        price_numeric = train_data_df[numerical_columns]
        correlation = price_numeric.corr()
        print(correlation['price'].sort_values(ascending=False), '\n')
        f, ax = plt.subplots(figsize=(7, 7))

        plt.title('Correlation of Numeric Features with Price', y=1, size=16)

        sns.heatmap(correlation, square=True, vmax=0.8)

        # Correlations among the independent variables
        columns = ['bodyType', 'brand', 'creatDate', 'fuelType', 'gearbox',
                   'kilometer', 'model', 'name', 'notRepairedDamage', 'offerType', 'power',
                   'regDate', 'regionCode', 'seller', 'v_0', 'v_1', 'v_10',
                   'v_11', 'v_12', 'v_13', 'v_14', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6',
                   'v_7', 'v_8', 'v_9', "year_regDate", "month_regDate", 'price']
        data = train_data_df[columns]
        # Compute the correlation matrix
        correlations = data.corr()
        # Absolute values: we only care about the strength of the correlation,
        # not whether it is positive or negative
        correction = abs(correlations)
        # plot the correlation matrix
        fig = plt.figure(figsize=(60, 60))
        # Bug fix: add_subplot takes no figsize argument
        ax = fig.add_subplot(111)
        ax = sns.heatmap(correction, cmap=plt.cm.Oranges, linewidths=0.05, vmax=1, vmin=0, annot=True,
                         annot_kws={'size': 12, 'weight': 'bold'})
        # Bug fix: use len(columns) rather than a hard-coded 30 for the ticks
        plt.xticks(np.arange(len(columns)) + 0.5, columns)  # x-axis tick labels
        plt.yticks(np.arange(len(columns)) + 0.5, columns)  # y-axis tick labels
        ax.set_title('Characteristic correlation')
        plt.savefig('cluster.tif', dpi=300)
        plt.show()

    def ridge_cv(self, train_data_df, feature_columns):
        """
        Note: price is assumed to be log-transformed at this point.
        """
        # Ridge regression to handle collinearity; see also the stepwise
        # regression below
        from sklearn import linear_model
        # Ridge regression with built-in cross-validation over alpha
        data = train_data_df[feature_columns]
        clf = linear_model.RidgeCV(fit_intercept=False)

        # Fit the ridge regression model
        clf.fit(data, train_data_df['price'])

        print('Chosen alpha: ', clf.alpha_)
        # Bug fix: pair each feature with its own coefficient before sorting,
        # otherwise the printed mapping is misaligned
        rst = list(map(lambda x: '{:.5f}'.format(abs(x)), clf.coef_))
        print(sorted(rst))
        print(len(rst), len(feature_columns))
        print('Coefficients:', dict(zip(feature_columns, rst)))
        """
        Example output:
        Coefficients: {'name': '0.00000', 'regDate': '0.07543', 'model': '0.20068', 'brand': '1.91918', 'bodyType': '12.22141', 'fuelType': '15027.09136', 'gearbox': '1506.17263', 'power': '154331.09559', 'kilometer': '17103.78850', 'notRepairedDamage': '18457.71267', 'regionCode': '194831.93107', 'v_0': '20013.22587', 'v_1': '20882.49239', 'v_2': '2249.08699', 'v_3': '22750.43400', 'v_4': '236965.73075', 'v_5': '24.47606', 'v_6': '241.11729', 'v_7': '2456.38493', 'v_8': '2465.45254', 'v_9': '319.47281', 'v_10': '32640.53892', 'v_11': '333.91531', 'v_12': '38188.50573', 'v_13': '43.12500', 'v_14': '43445.24262', 'year_regDate': '451.61198', 'month_regDate': '9.24321', 'price': '987.79713'}
        ['0.00000', '0.07543', '0.20068', '1.91918', '12.22141', '15027.09136', '1506.17263', '154331.09559', '17103.78850', '18457.71267', '194831.93107', '20013.22587', '20882.49239', '2249.08699', '22750.43400', '236965.73075', '24.47606', '241.11729', '2456.38493', '2465.45254', '319.47281', '32640.53892', '333.91531', '38188.50573', '43.12500', '43445.24262', '451.61198', '9.24321', '987.79713']
        """


    def stepwise_selection(self, X, y,
                           initial_list=[],
                           threshold_in=0.01,
                           threshold_out=0.05,
                           verbose=True):
        """
        Stepwise regression for feature selection.
        """
        included = list(initial_list)

        while True:
            changed = False
            # forward step
            excluded = list(set(X.columns) - set(included))
            new_pval = pd.Series(index=excluded, dtype=float)
            for new_column in excluded:
                model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
                new_pval[new_column] = model.pvalues[new_column]
            best_pval = new_pval.min()
            if best_pval < threshold_in:
                # Bug fix: idxmin() returns the feature label; argmin() would
                # return a positional index
                best_feature = new_pval.idxmin()
                included.append(best_feature)
                changed = True
                if verbose:
                    print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

            # backward step
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
            # use all coefs except the intercept
            pvalues = model.pvalues.iloc[1:]
            worst_pval = pvalues.max()  # null if pvalues is empty
            if worst_pval > threshold_out:
                changed = True
                worst_feature = pvalues.idxmax()
                included.remove(worst_feature)
                if verbose:
                    print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
            if not changed:
                break
        return included
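    # Usage sketch (assumed names, mirroring the commented call in __main__):
    # selected = data_search.stepwise_selection(train_data_df[feature_cols], train_data_df['price'])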

    def xgb_model_fit(self,
                      X_train, X_test, y_train, y_test, alg, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
        if useTrainCV:
            # Use xgboost's built-in CV to pick the number of boosting rounds
            xgb_param = alg.get_xgb_params()
            xgtrain = xgb.DMatrix(X_train, label=y_train)
            cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                              metrics='mae', early_stopping_rounds=early_stopping_rounds)
            alg.set_params(n_estimators=cvresult.shape[0])

        # Fit the model
        alg.fit(X_train, y_train, eval_metric='mae')

        # Predict on the training and test splits
        train_data_df_predictions = alg.predict(X_train)
        test_data_df_predictions = alg.predict(X_test)

        # Regression metric on the training split
        print("training mean_absolute_error is : ")
        print(mean_absolute_error(y_train, train_data_df_predictions))

        # and on the test split
        print("test mean_absolute_error is : ")
        print(mean_absolute_error(y_test, test_data_df_predictions))

        # Feature importance
        plt.ylabel('Feature Importance')
        plot_importance(alg)
        plt.show()

    def light_gbm_model_fit(self, X_train, X_test, y_train, y_test):
        gbm = lgb.LGBMRegressor(objective='regression', num_leaves=31, learning_rate=0.05, n_estimators=20)
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric='l1', early_stopping_rounds=5)

        print('Start lightgbm  predicting...')
        # Predict on the training and test splits
        y_train_pred = gbm.predict(X_train, num_iteration=gbm.best_iteration_)
        y_test_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
        # Evaluate the model
        print('The y_train mae of test prediction is:', mean_absolute_error(y_train, y_train_pred))
        print('The y_test mae of test prediction is:', mean_absolute_error(y_test, y_test_pred))

        # feature importances
        print('Feature importances:', list(gbm.feature_importances_))

        # Grid search for hyper-parameter tuning
        estimator = lgb.LGBMRegressor(num_leaves=64, metrics='mae', max_depth=7, min_child_samples=1000)

        param_grid = {
            'learning_rate': [0.01, 0.1, 1],
            'n_estimators': [20, 40]
        }

        gbm_grid = GridSearchCV(estimator, param_grid)

        gbm_grid = gbm_grid.fit(X_train, y_train)
        print("Predicting with the grid-searched model")
        print('Best parameters found by grid search are:', gbm_grid.best_params_)

        # Predict on the training and test splits
        y_train_pred = gbm_grid.predict(X_train)
        y_test_pred = gbm_grid.predict(X_test)
        # Evaluate the model
        print('grid search cv  The y_train mae of test prediction is:', mean_absolute_error(y_train, y_train_pred))
        print('grid search cv  The y_test mae of test prediction is:', mean_absolute_error(y_test, y_test_pred))

        # Bug fix: GridSearchCV itself has no feature_importances_; use the
        # refit best estimator
        print('Feature importances:', list(gbm_grid.best_estimator_.feature_importances_))






def data_process(data_path, test_data_path):
    data_search = DataSearch()
    # Load the data
    data_df = data_search.load_data(data_path, test_data_path)
    # Frequency statistics for the categorical features
    category_columns = ['name', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage', 'regionCode',
                        'seller', 'offerType']
    # data_df = data_search.categorial_statistus(data_df, category_columns)
    # Visualize the numeric features
    numerical_columns = ['power', 'kilometer', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9',
                         'v_10', 'v_11', 'v_12', 'v_13', 'v_14']
    # data_search.plot_nemurical(data_df, numerical_columns)
    # Expand and denoise the categorical features
    data_df = data_search.categorial_extend(data_df)

    # Outlier detection on the numeric features: on raw price the IQR rule
    # flags 10353 rows; power can be screened the same way
    data_df = data_search.detect_outliers(data_df, 0, ['price'])

    # Normality test
    # data_search.normal_test(data_df)
    # Compare the raw and log-transformed distributions
    # data_search.log_plot(data_df)

    # Normalize the target: log-transform price
    data_df = data_search.change_to_nomal(data_df)

    # Outlier detection on the log-transformed price
    # data_search.detect_outliers(data_df, 1, ['power'])

    # Feature selection: based on the distributions, drop three features by
    # hand: 'seller', 'offerType', 'creatDate'
    all_col = ['name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType',
               'gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode',
               'v_0', 'v_1', 'v_2', 'v_3',
               'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12',
               'v_13', 'v_14', 'year_regDate', 'month_regDate', 'price']

    print(data_df.columns)
    data_df[all_col] = data_df[all_col].apply(pd.to_numeric, errors='coerce').fillna(0.0)
    print(data_df.describe(include='all'))
    return data_df, all_col


if __name__ == '__main__':
    data_search = DataSearch()
    # Load the training and test data
    train_data_path = r"C:\Users\ccs\Documents\dataWhale\used_car_train_20200313\used_car_train_20200313.csv"
    test_data_path = r"C:\Users\ccs\Documents\dataWhale\used_car_testA_20200313\used_car_testA_20200313.csv"
    train_data_df, all_col = data_process(train_data_path, test_data_path)

    # # Ridge regression
    # feature_cols = list(all_col)
    # feature_cols.remove("price")
    # data_search.ridge_cv(train_data_df, feature_cols)
    #
    # # Stepwise regression
    # result = data_search.stepwise_selection(train_data_df[feature_cols], train_data_df['price'])

    # Train tree models on the engineered features
    predictors = [x for x in all_col if x not in ['price', 'SaleID']]
    # Keep only the labelled training rows, then split into train/test sets
    train_data_df = train_data_df[train_data_df['train'] == 1]
    X_train, X_test, y_train, y_test = train_test_split(train_data_df[predictors], train_data_df['price'], test_size=0.3)

    # Note: the silent parameter is deprecated in newer xgboost releases
    xgb_model = xgb.XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=160, silent=False, objective='reg:gamma')

    data_search.xgb_model_fit(X_train, X_test, y_train, y_test, xgb_model)



    data_search.light_gbm_model_fit(X_train, X_test, y_train, y_test)
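
    # Caveat: the models above were fit on log(price) (see change_to_nomal),
    # so the MAE values reported below are in log space. A hedged sketch of
    # mapping predictions back to the original scale before a submission
    # (the file name and frame layout are illustrative):
    # log_preds = xgb_model.predict(X_test)
    # price_preds = np.exp(log_preds)
    # pd.DataFrame({'price': price_preds}).to_csv('submission_sketch.csv', index=False)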



"""
#xgboost的方法進行預測
training mean_absolute_error is : 
0.14024175116320706
test mean_absolute_error is : 
0.14639476706968618

#lightGBM的方法預測
Start lightgbm  predicting...
The y_train mae of test prediction is: 0.39791827020196974
The y_test mae of test prediction is: 0.39712612190553775

用網格搜索的方式開始進行預測
Best parameters found by grid search are: {'learning_rate': 1, 'n_estimators': 40}
grid search cv  The y_train mae of test prediction is: 0.1658337832367841
grid search cv  The y_test mae of test prediction is: 0.17248839882027442

"""

Results
"""
# Prediction with xgboost
training mean_absolute_error is :
0.14024175116320706
test mean_absolute_error is :
0.14639476706968618

# Prediction with lightGBM
Start lightgbm  predicting...
The y_train mae of test prediction is: 0.39791827020196974
The y_test mae of test prediction is: 0.39712612190553775

# lightGBM with grid search
Best parameters found by grid search are: {'learning_rate': 1, 'n_estimators': 40}
grid search cv  The y_train mae of test prediction is: 0.1658337832367841
grid search cv  The y_test mae of test prediction is: 0.17248839882027442
"""