train和test的列分佈差異

代碼:

import pandas as pd
import numpy as np
import multiprocessing
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import gc
from time import time
import datetime
import matplotlib.pyplot as plt 
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, TimeSeriesSplit, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import graphviz
import pandas as pd
import datatable as dt
warnings.simplefilter('ignore')
sns.set()



def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to reduce memory usage.

    Integer columns are cast to the narrowest int type whose range covers
    their min/max. float64 columns whose values fit the float32 range are
    cast to float32 (a small precision loss is accepted for the memory win);
    float16/float32 columns are left alone.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink (modified in place).
    verbose : bool
        When True, print the resulting size and percentage reduction.

    Returns
    -------
    pandas.DataFrame
        The same frame, with downcast dtypes.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                # Walk the int widths smallest-first; stop at the first fit.
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # BUG FIX: the old per-element np.finfo(x).precision test compared
                # the dtype's precision (a constant — 15 for float64) against
                # float32's (6), so the float32 branch was unreachable for
                # float64 columns and float16 columns were *upcast* to float64.
                # Downcast float64 -> float32 on a simple range check instead,
                # and leave already-small float dtypes untouched.
                # (All-NaN columns: c_min is NaN, comparisons are False, no cast.)
                if col_type == np.float64 and c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
    # deep=True on both measurements so the reported reduction is consistent
    # (the original measured start deep but end shallow).
    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    # Guard the division so an empty frame does not raise ZeroDivisionError.
    if verbose and start_mem > 0:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
def plot_numerical(feature):
    """Plot a 3x2 diagnostic grid for one numeric column.

    Relies on the module-level ``train`` and ``test`` DataFrames; ``train``
    must contain an ``isFraud`` column and ``feature`` must exist in both
    frames. Rows of the grid: (1) train/test KDE and target-split KDE,
    (2) value-versus-row-index scatter, (3) NaN counts.
    """
    fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(16, 18))
    sns.kdeplot(train[feature],                      ax=axes[0][0], label='Train');#row 1, subplot 1
    sns.kdeplot(test[feature],                       ax=axes[0][0], label='Test');#row 1, subplot 1

    sns.kdeplot(train[train['isFraud']==0][feature], ax=axes[0][1], label='isFraud 0')
    sns.kdeplot(train[train['isFraud']==1][feature], ax=axes[0][1], label='isFraud 1')

    # Shift the test index past the train rows so both scatters share one x-axis.
    # NOTE(review): this mutates the index of the Series returned by
    # test[feature]; whether the shift is reflected on the parent DataFrame
    # depends on pandas' copy/view behavior — confirm the plot actually shows
    # test rows after train rows.
    test[feature].index += len(train)
    axes[1][0].plot(train[feature], '.', label='Train');#row 2, subplot 1
    axes[1][0].plot(test[feature], '.', label='Test');
    axes[1][0].set_xlabel('Row index');
    axes[1][0].legend()
    test[feature].index -= len(train)#undo the index offset

    axes[1][1].plot(train[train['isFraud']==0][feature], '.', label='isFraud 0');
    axes[1][1].plot(train[train['isFraud']==1][feature], '.', label='isFraud 1');
    axes[1][1].set_xlabel('row index');
    axes[1][1].legend()

    # Row 3: NaN counts overall (left) and split by target (right).
    pd.DataFrame({'train': [train[feature].isnull().sum()], 'test': [test[feature].isnull().sum()]}).plot(kind='bar', rot=0, ax=axes[2][0]);
    pd.DataFrame({'isFraud 0': [train[(train['isFraud']==0) & (train[feature].isnull())][feature].shape[0]],
                  'isFraud 1': [train[(train['isFraud']==1) & (train[feature].isnull())][feature].shape[0]]}).plot(kind='bar', rot=0, ax=axes[2][1]);


    fig.suptitle(feature, fontsize=18);
    # the two subplots of row 1
    axes[0][0].set_title('Train/Test KDE distribution');
    axes[0][1].set_title('Target value KDE distribution');
    
    # the two subplots of row 2
    axes[1][0].set_title('Index versus value: Train/Test distribution');
    axes[1][1].set_title('Index versus value: Target distribution');
    
    # the two subplots of row 3
    axes[2][0].set_title('Number of NaNs');
    axes[2][1].set_title('Target value distribution among NaN values');
    plt.show()


#相當於對單列數據使用對抗
def covariate_shift(feature):
    """Single-feature adversarial validation.

    Labels train rows 0 and test rows 1, fits a LightGBM classifier on that
    one feature alone, and returns the hold-out ROC AUC. A score near 0.5
    means train and test are indistinguishable on this feature; a high score
    signals covariate shift.

    Relies on the module-level ``train``, ``test`` DataFrames and the
    module-level LightGBM ``params`` dict.
    """
    # Stack the train and test values vertically with an is-test indicator.
    combined = pd.concat(
        [pd.DataFrame(data={feature: train[feature], 'isTest': 0}),
         pd.DataFrame(data={feature: test[feature], 'isTest': 1})],
        ignore_index=True,
    )

    # Label-encode categoricals purely so LightGBM can consume the column;
    # the encoding itself carries no meaning.
    if str(combined[feature].dtype) in ['object', 'category']:
        combined[feature] = LabelEncoder().fit_transform(combined[feature].astype(str))

    # Stratified hold-out split on the is-test label.
    tr_x, te_x, tr_y, te_y = train_test_split(
        combined[feature], combined['isTest'],
        test_size=0.33, random_state=47, stratify=combined['isTest'])

    model = lgb.LGBMClassifier(**params, num_boost_round=500)
    model.fit(tr_x.values.reshape(-1, 1), tr_y)
    auc = roc_auc_score(te_y, model.predict_proba(te_x.values.reshape(-1, 1))[:, 1])

    # Free the intermediates before returning.
    del combined, tr_x, tr_y, te_x, te_y
    gc.collect();

    return auc

處理card1

#--------------- process card1 --------------------------------------
train=pd.read_csv('./ieee-fraud-detection/train1.csv')
train=reduce_mem_usage(train)
test =pd.read_csv('./ieee-fraud-detection/test1.csv')
test=reduce_mem_usage(test)
gc.collect();

y = train['isFraud']
X = pd.DataFrame()#initialise the single-feature design matrix
X['card1'] = train['card1']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=47, stratify=y)

# #Fit on the training split only — the real test set is not involved here.
clf = DecisionTreeClassifier(max_leaf_nodes=4)#deliberately weak classifier
print(pd.DataFrame(y_test).columns.values.tolist())
print(pd.DataFrame(y_train).columns.values.tolist())#column name of the Series
clf.fit(X_train, y_train)

print(X_test.columns.values.tolist())#column names
print('ROC AUC score:',roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
print('ROC AUC score:',roc_auc_score(y_test, clf.predict_proba(X_test.values.reshape(-1, 1))[:, 1]))
plot_numerical("card1")

# #-------------------- process card1 -------------------------------------------------
# #The plotting below overlaps plot_numerical, but adds two cursor lines, so it stays.
plt.figure(figsize=(14, 6))
#KDE curves split by target
sns.kdeplot(X[y==1]['card1'], label='isFraud 1');#pass the data in
sns.kdeplot(X[y==0]['card1'], label='isFraud 0');#pass the data in
#The two statements below just draw vertical cursor markers on the curve plot.
plt.plot([10881.5, 10881.5], [0.0000, 0.0001], sns.xkcd_rgb["black"], lw=2);#lw:linewidth
plt.plot([8750.0, 8750.0], [0.0000, 0.0001], sns.xkcd_rgb["red"], lw=2);
plt.show()
# #The second argument gives two y endpoints, so the cursor lines (red and black) could be drawn slanted.
pd.DataFrame(X[y==1]['card1']).columns.values.tolist()#only one column is used for this plot

處理TransactionAmt

train=pd.read_csv('./ieee-fraud-detection/train1.csv')
train=reduce_mem_usage(train)
test =pd.read_csv('./ieee-fraud-detection/test1.csv')
test=reduce_mem_usage(test)
gc.collect();

#-------------------------- process TransactionAmt below -------------------------------------------
plot_numerical('TransactionAmt')
fig, axes = plt.subplots(1,1,figsize=(16, 6))
axes.set_title('Moving average of TransactionAmt', fontsize=16);
train[['TransactionDT', 'TransactionAmt']].set_index('TransactionDT').rolling(10000).mean().plot(ax=axes);#rolling is a moving-window function
test[['TransactionDT', 'TransactionAmt']].set_index('TransactionDT').rolling(10000).mean().plot(ax=axes);
axes.legend(['Train', 'Test']);
plt.show()
# Module-level LightGBM params consumed by covariate_shift.
params = {'objective': 'binary', "boosting_type": "gbdt", "subsample": 1, "bagging_seed": 11, "metric": 'auc', 'random_state': 47}
print('Covariant shift ROC AUC:', covariate_shift('TransactionAmt'))#keep this check

處理TransactionAmt_decimal

train=pd.read_csv('./ieee-fraud-detection/train1.csv')
train=reduce_mem_usage(train)
test =pd.read_csv('./ieee-fraud-detection/test1.csv')
test=reduce_mem_usage(test)
gc.collect();

#-------------------------- build and inspect the new feature TransactionAmt_decimal below -------------------------------------------
# # #The digits after the decimal point of the transaction amount become a new feature
train['TransactionAmt_decimal'] = ((train['TransactionAmt'] - train['TransactionAmt'].astype(int)) * 1000).astype(int)
test['TransactionAmt_decimal'] = ((test['TransactionAmt'] - test['TransactionAmt'].astype(int)) * 1000).astype(int)
plot_numerical('TransactionAmt_decimal')
print('Covariant shift ROC AUC:', covariate_shift('TransactionAmt_decimal'))#keep this check

處理TransactionAmt_decimal_length


#-------------------------- build and inspect the new feature TransactionAmt_decimal_length below -------------------------------------------

train=pd.read_csv('./ieee-fraud-detection/train1.csv')
train=reduce_mem_usage(train)
test =pd.read_csv('./ieee-fraud-detection/test1.csv')
test=reduce_mem_usage(test)
gc.collect();

# Number of digits after the decimal point of the amount, as a new feature.
train['TransactionAmt_decimal_length'] = train['TransactionAmt'].astype(str).str.split('.', expand=True)[1].str.len()
test['TransactionAmt_decimal_length']  =  test['TransactionAmt'].astype(str).str.split('.', expand=True)[1].str.len()

# Inspect the distribution of the new feature TransactionAmt_decimal_length
df_train = pd.DataFrame(data={'TransactionAmt_decimal_length': train['TransactionAmt_decimal_length'], 'isTest': 0})
df_test =  pd.DataFrame(data={'TransactionAmt_decimal_length':  test['TransactionAmt_decimal_length'], 'isTest': 1})
df = pd.concat([df_train, df_test], ignore_index=True)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
sns.countplot(data=df.fillna('NaN'), x='TransactionAmt_decimal_length', hue='isTest', ax=axes[0]);
sns.countplot(data=train[['TransactionAmt_decimal_length', 'isFraud']].fillna('NaN'), x='TransactionAmt_decimal_length', hue='isFraud', ax=axes[1]);
axes[0].set_title('Train / Test distibution');
axes[1].set_title('Train distibution by isFraud');
axes[0].legend(['Train', 'Test']);
plt.show()
plot_numerical('TransactionAmt_decimal_length')
params = {'objective': 'binary', "boosting_type": "gbdt", "subsample": 1, "bagging_seed": 11, "metric": 'auc', 'random_state': 47}
print('Covariant shift ROC AUC:', covariate_shift('TransactionAmt_decimal_length'))#keep this check


 

其他不需要特徵工程的列的分析可以是:

#V1 — run the same two diagnostics for a column that needs no feature engineering
plot_numerical('V1')
print('Covariate shift:', covariate_shift('V1'))

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章