import pandas as pd
import numpy as np
# LightGBM: the gradient-boosting (GBM) implementation used for the model
import lightgbm as lgb
#import xgboost as xgb
# SciPy sparse helpers: vstack stacks matrices row-wise, csr_matrix is the compressed-sparse-row format, save_npz / load_npz write / read a sparse matrix on disk
from scipy.sparse import vstack, csr_matrix, save_npz, load_npz
# scikit-learn preprocessing: label (ordinal) encoding and one-hot encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# scikit-learn model selection: stratified k-fold cross-validation splitter
from sklearn.model_selection import StratifiedKFold
#from sklearn.metrics import roc_auc_score
# Garbage-collector interface, used to release memory between the heavy steps
import gc
# Make sure automatic garbage collection is switched on
gc.enable()
# Column name -> dtype mapping handed to pd.read_csv so every column is parsed
# directly into a compact type.
# NOTE(review): float16 represents integers exactly only up to 2048, so any
# identifier-like column stored as float16 may merge distinct IDs above that
# value - confirm the value ranges against the data.
dtypes = {
    'MachineIdentifier': 'category',
    'ProductName': 'category',
    'EngineVersion': 'category',
    'AppVersion': 'category',
    'AvSigVersion': 'category',
    'IsBeta': 'int8',
    'RtpStateBitfield': 'float16',
    'IsSxsPassiveMode': 'int8',
    'DefaultBrowsersIdentifier': 'float16',
    'AVProductStatesIdentifier': 'float32',
    'AVProductsInstalled': 'float16',
    'AVProductsEnabled': 'float16',
    'HasTpm': 'int8',
    'CountryIdentifier': 'int16',
    'CityIdentifier': 'float32',
    'OrganizationIdentifier': 'float16',
    'GeoNameIdentifier': 'float16',
    'LocaleEnglishNameIdentifier': 'int8',
    'Platform': 'category',
    'Processor': 'category',
    'OsVer': 'category',
    'OsBuild': 'int16',
    'OsSuite': 'int16',
    'OsPlatformSubRelease': 'category',
    'OsBuildLab': 'category',
    'SkuEdition': 'category',
    'IsProtected': 'float16',
    'AutoSampleOptIn': 'int8',
    'PuaMode': 'category',
    'SMode': 'float16',
    'IeVerIdentifier': 'float16',
    'SmartScreen': 'category',
    'Firewall': 'float16',
    'UacLuaenable': 'float32',
    'Census_MDC2FormFactor': 'category',
    'Census_DeviceFamily': 'category',
    'Census_OEMNameIdentifier': 'float16',
    'Census_OEMModelIdentifier': 'float32',
    'Census_ProcessorCoreCount': 'float16',
    'Census_ProcessorManufacturerIdentifier': 'float16',
    'Census_ProcessorModelIdentifier': 'float16',
    'Census_ProcessorClass': 'category',
    'Census_PrimaryDiskTotalCapacity': 'float32',
    'Census_PrimaryDiskTypeName': 'category',
    'Census_SystemVolumeTotalCapacity': 'float32',
    'Census_HasOpticalDiskDrive': 'int8',
    'Census_TotalPhysicalRAM': 'float32',
    'Census_ChassisTypeName': 'category',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches': 'float16',
    'Census_InternalPrimaryDisplayResolutionHorizontal': 'float16',
    'Census_InternalPrimaryDisplayResolutionVertical': 'float16',
    'Census_PowerPlatformRoleName': 'category',
    'Census_InternalBatteryType': 'category',
    'Census_InternalBatteryNumberOfCharges': 'float32',
    'Census_OSVersion': 'category',
    'Census_OSArchitecture': 'category',
    'Census_OSBranch': 'category',
    'Census_OSBuildNumber': 'int16',
    'Census_OSBuildRevision': 'int32',
    'Census_OSEdition': 'category',
    'Census_OSSkuName': 'category',
    'Census_OSInstallTypeName': 'category',
    'Census_OSInstallLanguageIdentifier': 'float16',
    'Census_OSUILocaleIdentifier': 'int16',
    'Census_OSWUAutoUpdateOptionsName': 'category',
    'Census_IsPortableOperatingSystem': 'int8',
    'Census_GenuineStateName': 'category',
    'Census_ActivationChannel': 'category',
    'Census_IsFlightingInternal': 'float16',
    'Census_IsFlightsDisabled': 'float16',
    'Census_FlightRing': 'category',
    'Census_ThresholdOptIn': 'float16',
    'Census_FirmwareManufacturerIdentifier': 'float16',
    'Census_FirmwareVersionIdentifier': 'float32',
    'Census_IsSecureBootEnabled': 'int8',
    'Census_IsWIMBootEnabled': 'float16',
    'Census_IsVirtualDevice': 'float16',
    'Census_IsTouchEnabled': 'int8',
    'Census_IsPenCapable': 'int8',
    'Census_IsAlwaysOnAlwaysConnectedCapable': 'float16',
    'Wdft_IsGamer': 'float16',
    'Wdft_RegionIdentifier': 'float16',
    'HasDetections': 'int8',
}
print('Download Train and Test Data.\n')


def _read_frame(csv_path):
    """Read one competition CSV with the compact dtypes and a numeric row id.

    The original MachineIdentifier values are overwritten with the row
    position of each record (cast to uint32).
    """
    frame = pd.read_csv(csv_path, dtype=dtypes, low_memory=True)
    frame['MachineIdentifier'] = frame.index.astype('uint32')
    return frame


# Training set first, then the test set, both parsed with the same dtype map
train = _read_frame('../input/train.csv')
test = _read_frame('../input/test.csv')
# Reclaim whatever the CSV parsing left behind before the next heavy step
gc.collect()
print('Transform all features to category.\n')
# Step 1: re-encode every feature column as a small set of integer categories.
# The first column (MachineIdentifier) and the last column of train
# (HasDetections, the target) are skipped.
for usecol in train.columns.tolist()[1:-1]:
    # Cast to str so that a single LabelEncoder can handle every column,
    # whatever its original dtype
    train[usecol] = train[usecol].astype('str')
    test[usecol] = test[usecol].astype('str')

    # Fit the encoder on the union of the values seen in train and test.
    # Codes are shifted by +1 so that 0 stays free for "value not kept" below.
    le = LabelEncoder().fit(
        np.unique(train[usecol].unique().tolist() + test[usecol].unique().tolist()))
    train[usecol] = le.transform(train[usecol]) + 1
    test[usecol] = le.transform(test[usecol]) + 1

    # Number of rows per encoded value, separately for train and test
    agg_tr = (train
              .groupby([usecol])
              .aggregate({'MachineIdentifier': 'count'})
              .reset_index()
              .rename({'MachineIdentifier': 'Train'}, axis=1))
    agg_te = (test
              .groupby([usecol])
              .aggregate({'MachineIdentifier': 'count'})
              .reset_index()
              .rename({'MachineIdentifier': 'Test'}, axis=1))

    # Outer merge keeps values present in only one of the sets; their missing
    # count becomes 0
    agg = pd.merge(agg_tr, agg_te, on=usecol, how='outer').replace(np.nan, 0)

    # Keep only values that occur more than 1000 times in train ...
    agg = agg[(agg['Train'] > 1000)].reset_index(drop=True)
    agg['Total'] = agg['Train'] + agg['Test']
    # ... and whose train share of the combined count lies strictly between
    # 20% and 80%.  The explicit copy makes the column assignment below operate
    # on an independent frame instead of a filtered view.
    train_share = agg['Train'] / agg['Total']
    agg = agg[(train_share > 0.2) & (train_share < 0.8)].copy()
    agg[usecol + 'Copy'] = agg[usecol]

    # Map every value that was not kept to 0 (the left merge leaves NaN for
    # them), then store the column as a pandas category
    train[usecol] = (pd.merge(train[[usecol]],
                              agg[[usecol, usecol + 'Copy']],
                              on=usecol, how='left')[usecol + 'Copy']
                     .replace(np.nan, 0).astype('int').astype('category'))
    test[usecol] = (pd.merge(test[[usecol]],
                             agg[[usecol, usecol + 'Copy']],
                             on=usecol, how='left')[usecol + 'Copy']
                    .replace(np.nan, 0).astype('int').astype('category'))

    # Drop the per-column temporaries before moving on to the next column
    del le, agg_tr, agg_te, agg, train_share, usecol
    gc.collect()
# Pull the target out of the training frame as a NumPy array; pop() also
# removes the column from the frame
y_train = np.array(train.pop('HasDetections'))
# Row indexes, used later to drive the cross-validation split and to size the
# prediction buffer
train_ids, test_ids = train.index, test.index
# The identifier column is not a feature: drop it from both frames
for _frame in (train, test):
    del _frame['MachineIdentifier']
del _frame
gc.collect()
print("If you don't want use Sparse Matrix choose Kernel Version 2 to get simple solution.\n")
print('--------------------------------------------------------------------------------------------------------')
print('Transform Data to Sparse Matrix.')
print('Sparse Matrix can be used to fit a lot of models, eg. XGBoost, LightGBM, Random Forest, K-Means and etc.')
print('To concatenate Sparse Matrices by column use hstack()')
print('Read more about Sparse Matrix https://docs.scipy.org/doc/scipy/reference/sparse.html')
print('Good Luck!')
print('--------------------------------------------------------------------------------------------------------')
# Fit the one-hot encoder on the training frame only.  Sparse output is the
# encoder's default, so the `sparse=True` keyword (renamed to `sparse_output`
# in newer scikit-learn releases) is not passed; this keeps the call valid on
# both old and new versions.
# NOTE(review): a category that appears in test but never in train would make
# transform() raise with the default handle_unknown setting - confirm this
# cannot happen after the filtering step.
ohe = OneHotEncoder(categories='auto', dtype='uint8').fit(train)
# Rows per chunk; the frames are encoded piecewise to bound peak memory.
# This value is reused later when slicing the folds.
m = 100000
# Step through the rows in chunks of m.  Stepping with range(0, n, m) never
# yields an empty trailing chunk, unlike range(n // m + 1) when n is an exact
# multiple of m.
train = vstack([ohe.transform(train[start:start + m])
                for start in range(0, train.shape[0], m)])
test = vstack([ohe.transform(test[start:start + m])
               for start in range(0, test.shape[0], m)])
# Persist both matrices so they can be reloaded inside every CV fold
save_npz('train.npz', train, compressed=True)
save_npz('test.npz', test, compressed=True)
# Free the in-memory copies; the folds reload them from disk
del ohe, train, test
gc.collect()
# Step 2: 5-fold stratified cross-validation with LightGBM.
# n_splits: number of folds; shuffle: shuffle rows before splitting;
# random_state: seed that makes the shuffle reproducible
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Running sum of the test-set probabilities predicted by each fold's model
lgb_test_result = np.zeros(test_ids.shape[0])
#lgb_train_result = np.zeros(train_ids.shape[0])
#xgb_test_result = np.zeros(test_ids.shape[0])
#xgb_train_result = np.zeros(train_ids.shape[0])
counter = 0
# Newer LightGBM releases configure early stopping and logging through
# callbacks and no longer accept the `verbose` / `early_stopping_rounds`
# keywords in fit(); older releases lack `log_evaluation`.  Detect which
# API is available once, up front.
_lgb_has_callbacks = hasattr(lgb, 'log_evaluation')
print('\nLightGBM\n')
for train_index, test_index in skf.split(train_ids, y_train):
    print('Fold {}\n'.format(counter + 1))
    train = load_npz('train.npz')
    # Gather the fold's rows in chunks of m to bound peak memory.  Stepping
    # with range(0, n, m) avoids an empty trailing chunk when n is an exact
    # multiple of m.
    X_fit = vstack([train[train_index[start:start + m]]
                    for start in range(0, train_index.shape[0], m)])
    X_val = vstack([train[test_index[start:start + m]]
                    for start in range(0, test_index.shape[0], m)])
    X_fit, X_val = csr_matrix(X_fit, dtype='float32'), csr_matrix(X_val, dtype='float32')
    y_fit, y_val = y_train[train_index], y_train[test_index]
    # The full training matrix is no longer needed for this fold
    del train
    gc.collect()
    lgb_model = lgb.LGBMClassifier(max_depth=-1,
                                   n_estimators=30000,
                                   learning_rate=0.05,
                                   num_leaves=2**12-1,
                                   colsample_bytree=0.28,
                                   objective='binary',
                                   n_jobs=-1)
    #xgb_model = xgb.XGBClassifier(max_depth=6,
    #                              n_estimators=30000,
    #                              colsample_bytree=0.2,
    #                              learning_rate=0.1,
    #                              objective='binary:logistic',
    #                              n_jobs=-1)
    # Stop after 100 rounds without improvement of the validation AUC and log
    # every 100 rounds; fresh callback objects are created for every fold
    if _lgb_has_callbacks:
        fit_kwargs = {'callbacks': [lgb.early_stopping(stopping_rounds=100),
                                    lgb.log_evaluation(period=100)]}
    else:
        fit_kwargs = {'verbose': 100, 'early_stopping_rounds': 100}
    lgb_model.fit(X_fit, y_fit,
                  eval_metric='auc', eval_set=[(X_val, y_val)],
                  **fit_kwargs)
    #xgb_model.fit(X_fit, y_fit, eval_metric='auc',
    #              eval_set=[(X_val, y_val)],
    #              verbose=1000, early_stopping_rounds=300)
    #lgb_train_result[test_index] += lgb_model.predict_proba(X_val)[:,1]
    #xgb_train_result[test_index] += xgb_model.predict_proba(X_val)[:,1]
    # Release the fold's matrices before the test matrix is loaded
    del X_fit, X_val, y_fit, y_val, train_index, test_index, fit_kwargs
    gc.collect()
    test = load_npz('test.npz')
    test = csr_matrix(test, dtype='float32')
    # Accumulate the positive-class probability; averaged over folds later
    lgb_test_result += lgb_model.predict_proba(test)[:,1]
    #xgb_test_result += xgb_model.predict_proba(test)[:,1]
    counter += 1
    # Release the test matrix until the next fold needs it
    del test
    gc.collect()
    #Stop fitting to prevent time limit error
    #if counter == 3 : break
#print('\nLigthGBM VAL AUC Score: {}'.format(roc_auc_score(y_train, lgb_train_result)))
#print('\nXGBoost VAL AUC Score: {}'.format(roc_auc_score(y_train, xgb_train_result)))
# Step 3: average the per-fold test predictions and write the submission file
mean_prediction = lgb_test_result / counter
# The sample submission supplies the row layout; only the prediction column
# is overwritten.
# NOTE(review): assumes its rows are in the same order as test.csv - confirm
submission = pd.read_csv('../input/sample_submission.csv')
submission['HasDetections'] = mean_prediction
submission.to_csv('lgb_submission.csv', index=False)
#submission['HasDetections'] = xgb_test_result / counter
#submission.to_csv('xgb_submission.csv', index=False)
#submission['HasDetections'] = 0.5 * lgb_test_result / counter + 0.5 * xgb_test_result / counter
#submission.to_csv('lgb_xgb_submission.csv', index=False)
# Finished
print('\nDone.')
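# Malware detection based on the GBM algorithm
# (The lines below were comment-section boilerplate from the web page, not code.)
# Post a comment
# All comments
# No comments yet - want to be the first? Type in the comment box above and click Publish.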