Data Preprocessing Demo

Features include:

  • Outlier handling
  • Missing-value handling
  • Train/test split
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import gc
from tqdm import tqdm
import time
from contextlib import contextmanager


@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')

def preprocess(data: pd.DataFrame):
    """Preprocess the data: clip outliers and fill missing values."""

    def fill_outliers(col: pd.Series):
        """Clip a column to mean +/- 3 std."""
        mean = col.mean()
        std = col.std()
        upper = mean + 3 * std
        lower = mean - 3 * std
        col[col > upper] = np.floor(upper)
        col[col < lower] = np.floor(lower)
        return col.values

    # Clip outliers & fill missing values with the column mean
    columns = data.columns
    for col_name in tqdm(columns):
        data[col_name] = fill_outliers(data[col_name])
        data[col_name] = data[col_name].fillna(data[col_name].mean())
    # standardization could be applied here (not done in this demo)
    return data

with timer("split train and test dataset!!!"):
    # Read the training and test sets
    X_train = pd.read_csv('./dataset/atec_anti_fraud_train.csv', encoding='utf-8',
                          low_memory=False, parse_dates=['date'],index_col='id')
    X_test = pd.read_csv('./dataset/atec_anti_fraud_test_b.csv', encoding='utf-8',
                         low_memory=False, parse_dates=['date'],index_col='id')
    col_train_num, col_test_num = X_train.columns, X_test.columns
    X_train, X_test = X_train[col_train_num], X_test[col_test_num]
    X_train_label,X_train_date=X_train.pop('label'),X_train.pop('date')
    X_test_date=X_test.pop('date')
    print(X_train.shape, X_test.shape)
    print("Start filter features!!!")
    # Keep features whose missing rate is below 1 (i.e. drop columns that are entirely missing)
    col_train, col_test = [], []
    for item in X_train.columns:
        tmp = np.sum(X_train[item].isnull()) / len(X_train)
        if tmp < 1:
            col_train.append(item)
    for item in X_test.columns:
        tmp = np.sum(X_test[item].isnull()) / len(X_test)
        if tmp < 1:
            col_test.append(item)
    # Keep only the columns shared by the training and test sets
    col = [item for item in col_train if item in col_test]
    print('len(col):', len(col))
    X_train, X_test = X_train[col], X_test[col]
    X_train, X_test = preprocess(X_train), preprocess(X_test)
    X_train, X_test = pd.DataFrame(X_train),pd.DataFrame(X_test)
    
    X_train=pd.concat([X_train_label,X_train_date,X_train],axis=1)
    X_test=pd.concat([X_test_date,X_test],axis=1)

    X_train_col,X_test_col=col.copy(),col.copy()   
    X_train_col.insert(0,'label')
    X_train_col.insert(1,'date')
    X_test_col.insert(0,'date')

    print(X_train.shape, X_test.shape)
    print("Start writing")
    X_train.to_csv("./dataset/x_train.csv", encoding='utf-8',header=X_train_col)
    X_test.to_csv("./dataset/x_test_b.csv", encoding='utf-8',header=X_test_col)
    del X_train,X_test
    gc.collect()

Utility Functions

import gc
import os
import sys
import time
import pickle
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
import multiprocessing
from functools import partial
from dateutil.parser import parse
from lightgbm import LGBMClassifier
from collections import defaultdict
from datetime import date, timedelta
from contextlib import contextmanager
from joblib import dump, load, Parallel, delayed
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import NMF, PCA, TruncatedSVD
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA



# Recover the mean and standard deviation of the targets from two constant-submission RMSE scores
def get_ave_std(c1, c2, f1, f2):
    '''
    :param c1: constant used in submission 1
    :param c2: constant used in submission 2
    :param f1: score (RMSE) of submission 1
    :param f2: score (RMSE) of submission 2
    :return: mean and standard deviation of the targets
    '''
    f1 = f1 ** 2
    f2 = f2 ** 2
    ave = (f1 - f2 + c2 ** 2 - c1 ** 2) / 2 / (c2 - c1)
    std = (f1 - (c1 - ave) ** 2) ** 0.5
    return ave, std
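# Usage sketch (hypothetical leaderboard scores): a constant submission c scores
# RMSE(c)^2 = std^2 + (mean - c)^2, so two constant probes pin down mean and std.
# The scores 1.30 / 1.25 below are made-up values, for illustration only.
ave, std = get_ave_std(c1=0, c2=1, f1=1.30, f2=1.25)
print('estimated mean: {:.4f}, estimated std: {:.4f}'.format(ave, std))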

# Recover the mean of the differing part of a submission from two RMSE scores
def get_sub_ave_std(c1, c2, f1, f2, n1, n2):
    '''
    :param c1: constant used in submission 1
    :param c2: constant used in the differing part of submission 2
    :param f1: score of submission 1
    :param f2: score of submission 2
    :param n1: total number of submitted rows
    :param n2: number of rows that differ in submission 2
    :return: mean of the differing part of submission 2
    '''
    result = ((c1+c2)-((f1**2-f2**2)*n1/n2/(c1-c2)))/2
    return result


# Sampling helper: randomly split indices into n_sub chunks
def make_sample(n,n_sub=2,seed=None):
    import random
    if seed is not None:
        random.seed(seed)
    if type(n) is int:
        l = list(range(n))
        s = int(n / n_sub)
    else:
        l = list(n)
        s = int(len(n) / n_sub)
    random.shuffle(l)
    result = []
    for i in range(n_sub):
        if i == n_sub - 1:
            result.append(l[i*s:])
        else:
            result.append(l[i*s: (i+1)*s])
    return result
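# Usage sketch for make_sample: split 10 indices into 3 random folds (seeded);
# the last fold absorbs the remainder.
folds = make_sample(10, n_sub=3, seed=42)
print(folds)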

# value_counts for a plain Python list
def value_counts(l):
    s = set(l)
    d = dict([(x,0) for x in s])
    for i in l:
        d[i] += 1
    result = pd.Series(d)
    result.sort_values(ascending=False,inplace=True)
    return result
    
# Smoothed conversion rate by key, merged onto data
def convert(data,stat,key):
    key = key if type(key) == list else [key]
    stat_temp = stat[key + ['label']].copy()
    rate = stat_temp.groupby(key,as_index=False)['label'].agg({'sum':'sum', 'count':'count'})
    rate['_'.join(key)+'_convert'] = ((rate['sum']+4)/(rate['count']+46))
    data = data.merge(rate, on=key, how='left')
    return data['_'.join(key)+'_convert']
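# Usage sketch for convert(): the (sum + 4) / (count + 46) smoothing pulls the
# positive rate of rare keys toward a low prior. Toy frames below are hypothetical;
# assumes the dict-style .agg() used throughout this file (older pandas).
toy_stat = pd.DataFrame({'card_type': ['A', 'A', 'B', 'B', 'B'],
                         'label':     [1,   0,   0,   0,   1]})
toy_data = pd.DataFrame({'card_type': ['A', 'B', 'C']})
toy_data['card_type_convert'] = convert(toy_data, toy_stat, 'card_type')
print(toy_data)  # the unseen key 'C' stays NaN after the left merge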
    
# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category=True, min_count=100,inplace=True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    result = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    new_columns = [c for c in result.columns if c not in original_columns]
    cat_columns = [c for c in original_columns if c not in result.columns]
    if not inplace:
        for c in cat_columns:
            result[c] = df[c]
    # Drop dummy columns that are nearly constant (fewer than min_count positives or negatives)
    for c in list(new_columns):
        if (result[c].sum() < min_count) or ((result.shape[0] - result[c].sum()) < min_count):
            del result[c]
            new_columns.remove(c)
    return result, new_columns
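# Usage sketch for one_hot_encoder: hypothetical frame; with min_count=1 the rare
# dummies are kept, while the default of 100 would drop them all here.
toy = pd.DataFrame({'city': ['a', 'a', 'b', None], 'amt': [1, 2, 3, 4]})
toy_ohe, new_cols = one_hot_encoder(toy, nan_as_category=True, min_count=1)
print(new_cols)  # ['city_a', 'city_b', 'city_nan']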
    
# Conversion (positive) rate of a categorical feature
def analyse(data,name,label='label'):
    result = data.groupby(name)[label].agg({'count':'count',
                                              'sum':'sum'})
    result['rate'] = result['sum']/result['count']
    return result

# Conversion rate of a continuous feature, equal-width bins
def analyse2(data,name='id',label='label', factor=10):
    grouping = pd.cut(data[name],factor)
    rate = data.groupby(grouping)[label].agg({'sum':'sum',
                                              'count':'count'})
    rate['rate'] = rate['sum']/rate['count']
    return rate

# Conversion rate of a continuous feature, equal-frequency bins
def analyse3(data,name='id',label='label', factor=10):
    grouping = pd.qcut(data[name],factor)
    rate = data.groupby(grouping)[label].agg({'sum':'sum',
                                              'count':'count'})
    rate['rate'] = rate['sum']/rate['count']
    return rate

# Group-wise standardization (z-score within each key group)
def grp_standard(data,key,names,replace=False):
    for name in names:
        new_name = name + '_' + key + '_' + 'standardize' if replace else name
        mean_std = data.groupby(key, as_index=False)[name].agg({'mean': 'mean',
                                                               'std': 'std'})
        data = data.merge(mean_std, on=key, how='left')
        data[new_name] = ((data[name]-data['mean'])/data['std']).fillna(0).astype(np.float32)
        data[new_name] = data[new_name].replace(-np.inf, 0).fillna(0)
        data.drop(['mean','std'],axis=1,inplace=True)
    return data

# Group-wise min-max normalization
def grp_normalize(data,key,names,start=0,replace=False):
    for name in names:
        new_name = name + '_' + key + '_' + 'normalize' if replace else name
        max_min = data.groupby(key,as_index=False)[name].agg({'max':'max',
                                                              'min':'min'})
        data = data.merge(max_min, on=key, how='left')
        data[new_name] = (data[name]-data['min'])/(data['max']-data['min'])
        data[new_name] = data[new_name].replace(-np.inf, start).fillna(start).astype(np.float32)
        data.drop(['max','min'],axis=1,inplace=True)
    return data
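# Usage sketch for grp_standard / grp_normalize: hypothetical frame; 'amount' is
# rescaled within each 'user' group (with replace=False the original column is
# overwritten). Assumes the dict-style .agg() used throughout this file.
toy = pd.DataFrame({'user':   [1, 1, 1, 2, 2],
                    'amount': [10., 20., 30., 5., 15.]})
toy = grp_standard(toy, 'user', ['amount'])   # group-wise z-scores
toy = grp_normalize(toy, 'user', ['amount'])  # then min-max scaled within each group
print(toy)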

# Group-wise rank
def grp_rank(data, key, names, ascending=True):
    for name in names:
        data.sort_values([key, name], inplace=True, ascending=ascending)
        data['rank'] = range(data.shape[0])
        min_rank = data.groupby(key, as_index=False)['rank'].agg({'min_rank': 'min'})
        data = pd.merge(data, min_rank, on=key, how='left')
        data['rank'] = data['rank'] - data['min_rank']
        data[name] = data['rank']
        data.drop(['min_rank'], axis=1, inplace=True)
    data.drop(['rank'], axis=1, inplace=True)
    return data

# Column-wise concat that saves memory (reuses the first frame)
def concat(L):
    result = None
    for l in L:
        if result is None:
            result = l
        else:
            result[l.columns.tolist()] = l
    return result

# Group-wise ranking helper
def group_rank(data, key, values, ascending=True):
    if type(key)==list:
        data_temp = data[key + [values]].copy()
        data_temp.sort_values(key + [values], inplace=True, ascending=ascending)
        data_temp['rank'] = range(data_temp.shape[0])
        min_rank = data_temp.groupby(key,as_index=False)['rank'].agg({'min_rank':'min'})
        index = data_temp.index
        data_temp = data_temp.merge(min_rank,on=key,how='left')
        data_temp.index = index
    else:
        data_temp = data[[key,values]].copy()
        data_temp.sort_values([key, values], inplace=True, ascending=ascending)
        data_temp['rank'] = range(data_temp.shape[0])
        data_temp['min_rank'] = data_temp[key].map(data_temp.groupby(key)['rank'].min())
    data_temp['rank'] = data_temp['rank'] - data_temp['min_rank']
    return data_temp['rank']
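# Usage sketch for group_rank: rank each user's records by time (hypothetical frame);
# 0 marks the earliest record of every user.
toy = pd.DataFrame({'user': [1, 1, 2, 2, 2],
                    'time': [5, 3, 9, 1, 4]})
toy['time_rank'] = group_rank(toy, 'user', 'time')
print(toy)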

def nunique(x):
    return len(set(x))


# Time difference from the record n steps earlier within each group:
def group_diff_time(data,key,value,n):
    data_temp = data[key+[value]].copy()
    shift_value = data_temp.groupby(key)[value].shift(n)
    data_temp['shift_value'] = data_temp[value] - shift_value
    return data_temp['shift_value']



# smape
def smape(y_true,y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    y_diff = np.abs(y_true-y_pred)
    y_sum = y_true+y_pred
    return np.mean(y_diff/y_sum)*2


# Group-wise aggregation merged straight back onto data
def groupby(data,stat,key,value,func):
    key = key if type(key)==list else [key]
    data_temp = data[key].copy()
    feat = stat.groupby(key,as_index=False)[value].agg({'feat':func})
    data_temp = data_temp.merge(feat,on=key,how='left')
    return data_temp['feat']



# Tie-strength index between two key groups (a tf-idf-like score)
def tfidf2(df, key1, key2):
    key = key1 + key2
    tfidf2 = '_'.join(key) + '_tfidf2'
    df1 = df.groupby(key, as_index=False)[key[0]].agg({'key_count': 'size'})
    df2 = df1.groupby(key1, as_index=False)['key_count'].agg({'key1_count': 'sum'})
    df3 = df1.groupby(key2, as_index=False)['key_count'].agg({'key2_count': 'sum'})
    df1 = df1.merge(df2, on=key1, how='left').merge(df3, on=key2, how='left')
    df1[tfidf2] = df1['key_count'] / df1['key2_count'] / df1['key1_count']
    return df1


# Number of days between two dates
def diff_of_days(day1, day2):
    days = (parse(day1[:10]) - parse(day2[:10])).days
    return days

# Number of minutes between two timestamps
def diff_of_minutes(time1,time2):
    minutes = (parse(time1) - parse(time2)).total_seconds()//60
    return abs(minutes)

# Number of hours between two timestamps
def diff_of_hours(time1,time2):
    hours = (parse(time1) - parse(time2)).total_seconds()//3600
    return abs(hours)

# Add/subtract days to a date string
def date_add_days(start_date, days):
    end_date = parse(start_date[:10]) + timedelta(days=days)
    end_date = end_date.strftime('%Y-%m-%d')
    return end_date

# Add/subtract hours to a datetime string
def date_add_hours(start_date, hours):
    end_date = parse(start_date) + timedelta(hours=hours)
    end_date = end_date.strftime('%Y-%m-%d %H:%M:%S')
    return end_date

# Get the value n steps before the last record of each group
def get_last_values(data, stat, key, sort_value, value, shift, sort=None):
    key = key if type(key)==list else [key]
    if sort == 'ascending':
        stat_temp = stat.sort_values(sort_value, ascending=True)
    elif sort == 'descending':
        stat_temp = stat.sort_values(sort_value, ascending=False)
    else:
        stat_temp = stat.copy()
    stat_temp['value'] = stat_temp.groupby(key)[value].shift(shift)
    stat_temp.drop_duplicates(key,keep='last',inplace=True)
    data_temp = data[key].copy()
    data_temp = data_temp.merge(stat_temp,on=key,how='left')
    return data_temp['value']
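# Usage sketch for get_last_values: for each user, fetch the transaction amount
# one step (shift=1) before the latest record, ordered by time. Hypothetical frames.
toy_stat = pd.DataFrame({'user': [1, 1, 1, 2, 2],
                         'time': [1, 2, 3, 1, 2],
                         'amt':  [10, 20, 30, 5, 15]})
toy_data = pd.DataFrame({'user': [1, 2]})
toy_data['prev_amt'] = get_last_values(toy_data, toy_stat, 'user', 'time', 'amt', 1, sort='ascending')
print(toy_data)  # user 1 -> 20, user 2 -> 5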

# Get the value n steps after the first record of each group
def get_first_values(data, stat, key, sort_value, value, shift, sort=None):
    key = key if type(key)==list else [key]
    if sort == 'ascending':
        stat_temp = stat.sort_values(sort_value, ascending=True)
    elif sort == 'descending':
        stat_temp = stat.sort_values(sort_value, ascending=False)
    else:
        stat_temp = stat.copy()
    stat_temp['value'] = stat_temp.groupby(key)[value].shift(-shift)
    stat_temp.drop_duplicates(key,keep='first',inplace=True)
    data_temp = data[key].copy()
    data_temp = data_temp.merge(stat_temp,on=key,how='left')
    return data_temp['value']



# Compress data by downcasting numeric dtypes
def compress(data):
    size = sys.getsizeof(data)/2**20
    def intcp(series):
        ma = max(series)
        mi = min(series)
        if (ma<128) & (mi>=-128):
            return 'int8'
        elif (ma<32768) & (mi>=-32768):
            return 'int16'
        elif (ma<2147483648) & (mi>=-2147483648):
            return 'int32'
        else:
            return None
    def floatcp(series):
        ma = max(series)
        mi = min(series)
        if (ma<32770) & (mi>-32770):
            return 'float16'
        elif (ma<2147483600) & (mi>-2147483600):
            return 'float32'
        else:
            return None

    for c in data.columns:
        ctype = None
        dtypes = data[c].dtypes
        if dtypes == np.int64:
            ctype = intcp(data[c])
        if dtypes == np.int32:
            ctype = intcp(data[c])
        if dtypes == np.int16:
            ctype = intcp(data[c])
        if dtypes == np.float64:
            ctype = floatcp(data[c])
        if dtypes == np.float32:
            ctype = floatcp(data[c])
        if ctype is None:
            continue
        try:
            data[c] = data[c].astype(ctype)
            print('{}   converted to {},     done!   {}'.format(dtypes, ctype, c))
        except:
            print('Feature {} has dtype {}; conversion failed!!!'.format(c, dtypes))
    print('Original data size: {} MB'.format(round(size, 2)))
    print('New data size:      {} MB'.format(round(sys.getsizeof(data) / 2 ** 20, 2)))
    return data
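# Usage sketch for compress(): hypothetical frame; int64/float64 columns are downcast
# to the smallest dtype whose range still covers the observed values.
toy = pd.DataFrame({'small_int': np.arange(100, dtype=np.int64),
                    'small_flt': np.random.rand(100).astype(np.float64)})
toy = compress(toy)
print(toy.dtypes)  # small_int -> int8, small_flt -> float16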




def trend(y):
    try:
        x = np.arange(0, len(y)).reshape(-1, 1)
        lr = LinearRegression()
        lr.fit(x, y)
        trend = lr.coef_[0]
    except:
        trend = np.nan
    return trend


@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))


# Dimensionality reduction (LDA / NMF / SVD) over bag-of-feature "sentences" per id
def jiangwei(stat, data, id, feature):
    print('lda ...')
    # Build one "sentence" per id from the feature values in stat (used for fitting)
    mapping = {}
    for sample in stat[[id, feature]].values:
        mapping.setdefault(sample[0], []).append(str(sample[1]))
    stat_ids = list(mapping.keys())
    stat_sentences = [' '.join(mapping[cate_]) for cate_ in stat_ids]
    vectorizer = CountVectorizer(token_pattern='(?u)\\b\\w+\\b', min_df=2)
    stat_sentences_matrix = vectorizer.fit_transform(stat_sentences)

    # Build the sentences for data with the same vocabulary (used for transforming)
    mapping = {}
    for sample in data[[id, feature]].values:
        mapping.setdefault(sample[0], []).append(str(sample[1]))
    ids = list(mapping.keys())
    sentences = [' '.join(mapping[cate_]) for cate_ in ids]
    data_sentences_matrix = vectorizer.transform(sentences)

    lda = LDA(n_components=5,
              learning_method='online',
              batch_size=1000,
              n_jobs=40,
              random_state=520)
    lda.fit(stat_sentences_matrix)
    lda_matrix = lda.transform(data_sentences_matrix)
    lda_matrix = pd.DataFrame(lda_matrix, columns=['lda_{}_{}'.format(feature, i) for i in range(5)]).astype('float16')

    nmf = NMF(n_components=5,
              random_state=520,
              beta_loss='kullback-leibler',
              solver='mu',
              max_iter=1000,
              alpha=.1,
              l1_ratio=.5)
    nmf.fit(stat_sentences_matrix)
    nmf_matrix = nmf.transform(data_sentences_matrix)
    nmf_matrix = pd.DataFrame(nmf_matrix, columns=['nmf_{}_{}'.format(feature, i) for i in range(5)]).astype('float16')

    pca = TruncatedSVD(5)
    pca.fit(stat_sentences_matrix)
    pca_matrix = pca.transform(data_sentences_matrix)
    pca_matrix = pd.DataFrame(pca_matrix,
                              columns=['svd_{}_{}'.format(feature, i) for i in range(5)]).astype('float32')

    # Rows of all three matrices are aligned with the ids built from data
    matrix = concat([lda_matrix, nmf_matrix, pca_matrix])
    matrix[id] = ids
    return matrix
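# Usage sketch for jiangwei(): hypothetical click log; each user's visited item ids
# form one "sentence", reduced to 5 LDA + 5 NMF + 5 SVD features per user. Assumes
# the older scikit-learn NMF API (alpha=) used above.
toy_log = pd.DataFrame({'user_id': np.random.randint(0, 50, 2000),
                        'item_id': np.random.randint(0, 30, 2000)})
user_topics = jiangwei(toy_log, toy_log, 'user_id', 'item_id')
print(user_topics.shape)  # (n_users, 15 reduced features + the user_id column)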