KDD Cup99 Intrusion Detection in Python: Binary and Three-Class Classification

This project implements only binary and three-class classification. The algorithms covered include KNN, SVM, AdaBoost, DNN, random forest, XGBoost, decision tree, and naive Bayes.

Dataset

The training set is kddcup.data_10_percent_corrected; the test set is 3-corrected, a file of the same order of magnitude. The training set contains 494,021 records and the test set contains 311,029 records.
For the three-class task, this experiment only distinguishes "normal", "smurf", and everything else ("others"); in the binary task, "smurf" and the other attack types are merged into a single "attack" class. The corresponding class distribution in the training set is shown in the table below:

Class    normal   smurf    others   total
Count    97278    280790   115953   494021
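
For the binary task, the three-class label produced by utils.py below (0 = normal, 1 = smurf, 2 = others) simply collapses to normal vs. attack. A minimal sketch of that mapping (my own illustration of the grouping described above):

import numpy as np

# Three-class labels as produced by utils.py: 0=normal, 1=smurf, 2=others
labels_3class = np.array([0, 1, 2, 0, 1])
# Binary grouping: normal stays 0, smurf and others become 1 ("attack")
labels_binary = (labels_3class != 0).astype(int)
print(labels_binary)  # [0 1 1 0 1]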

An introduction to the KDD dataset can be found in the referenced article.

Data Preprocessing

The dataset contains string-valued fields, which need to be mapped to discrete numeric values. To make it easier to build the DNN with a framework, one-hot encoding is also applied, but only the DNN uses it, so it is handled separately in the DNN code. In addition, data normalization and dimensionality reduction are performed; both are done inside the respective implementation code.
The preprocessing code below is adapted from a reference implementation and modified to fit my needs.
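
For example, here is a hedged sketch of the one-hot step used only in the DNN experiments, done with pandas; the file path and the nominal column positions follow the preprocessing code below, and the column names are my own labels:

import pandas as pd

# Sketch only: one-hot encode the three nominal columns of the raw KDD file
df = pd.read_csv('./data/1-kddcup.data_10_percent_corrected', header=None)
df = df.rename(columns={1: 'protocol_type', 2: 'service', 3: 'flag'})
df_onehot = pd.get_dummies(df, columns=['protocol_type', 'service', 'flag'])
print(df_onehot.shape)  # the 42 raw columns expand to roughly 120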

utils.py

import numpy as np
import pandas as pd
import csv
import time

def preHandel_data():
    source_file = './data/1-kddcup.data_10_percent_corrected'
    # source_file = '3-corrected.txt'
    handled_file = './data/three_kddcup.data_10_percent_corrected.csv'
    data_file = open(handled_file, 'w', newline='') 
    with open(source_file, 'r') as data_source:
        csv_reader = csv.reader(data_source)
        csv_writer = csv.writer(data_file)
        count = 0  # number of rows processed so far
        for row in csv_reader:
            temp_line = np.array(row)
            temp_line[1] = handleProtocol(row)   # protocol_type -> index
            temp_line[2] = handleService(row)    # service -> index
            temp_line[3] = handleFlag(row)       # flag -> index
            temp_line[41] = handleLabel(row)     # label -> 0/1/2
            csv_writer.writerow(temp_line)
            count += 1

            print(count, 'status:', temp_line[1], temp_line[2], temp_line[3], temp_line[41])
        data_file.close()

# Map the 3 protocol types to integers
def handleProtocol(input):
    protocol_list = ['tcp', 'udp', 'icmp']
    if input[1] in protocol_list:
        return protocol_list.index(input[1])

# Map the 71 network service types to integer IDs
def handleService(input):
    service_list = ['aol', 'auth', 'bgp', 'courier', 'csnet_ns', 'ctf', 'daytime', 'discard', 'domain', 'domain_u',
                    'echo', 'eco_i', 'ecr_i', 'efs', 'exec', 'finger', 'ftp', 'ftp_data', 'gopher', 'harvest',
                    'hostnames',
                    'http', 'http_2784', 'http_443', 'http_8001', 'imap4', 'IRC', 'iso_tsap', 'klogin', 'kshell',
                    'ldap',
                    'link', 'login', 'mtp', 'name', 'netbios_dgm', 'netbios_ns', 'netbios_ssn', 'netstat', 'nnsp',
                    'nntp',
                    'ntp_u', 'other', 'pm_dump', 'pop_2', 'pop_3', 'printer', 'private', 'red_i', 'remote_job', 'rje',
                    'shell',
                    'smtp', 'sql_net', 'ssh', 'sunrpc', 'supdup', 'systat', 'telnet', 'tftp_u', 'tim_i', 'time',
                    'urh_i', 'urp_i',
                    'uucp', 'uucp_path', 'vmnet', 'whois', 'X11', 'Z39_50', 'icmp']
    if input[2] in service_list:
        return service_list.index(input[2])
    else:
        print("unknown service:", input[2])


# Map the 11 connection flag values to integers
def handleFlag(input):
    flag_list = ['OTH', 'REJ', 'RSTO', 'RSTOS0', 'RSTR', 'S0', 'S1', 'S2', 'S3', 'SF', 'SH']
    if input[3] in flag_list:
        return flag_list.index(input[3])
    else:
        print("unknown flag:", input[3])


# Map the attack label to a class index: 0 = normal, 1 = smurf, 2 = others
def handleLabel(input):
    label_list=['normal.', 'buffer_overflow.', 'loadmodule.', 'perl.', 'neptune.', 'smurf.',
    'guess_passwd.', 'pod.', 'teardrop.', 'portsweep.', 'ipsweep.', 'land.', 'ftp_write.',
    'back.', 'imap.', 'satan.', 'phf.', 'nmap.', 'multihop.', 'warezmaster.', 'warezclient.',
    'spy.', 'rootkit.']
    if input[41] == label_list[0]:
        return 0
    elif input[41] == label_list[5]:
        return 1
    else:
        return 2

if __name__ == '__main__':
    start_time = time.perf_counter()  # time.clock() was removed in Python 3.8
    preHandel_data()
    end_time = time.perf_counter()
    print("Running time:", (end_time - start_time))

Classification

I won't explain the algorithms themselves here (honestly, because I don't fully understand them all); this is just basic usage. The code requires the sklearn and numpy packages, so install them yourself.

Evaluation metrics: accuracy, recall, precision, and the ROC curve.
The classification code is adapted from a reference implementation and modified to my needs.
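
As a minimal sketch of these metrics with scikit-learn (the y_true / y_pred / y_score arrays are placeholders standing in for a fitted model's outputs):

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

# Placeholder data; in practice these come from a trained classifier
y_true = [0, 0, 1, 1]
y_pred = [0, 1, 1, 1]
y_score = [0.1, 0.6, 0.8, 0.9]   # e.g. model.predict_proba(X)[:, 1]

print(accuracy_score(y_true, y_pred))   # 0.75
print(precision_score(y_true, y_pred))  # 0.667
print(recall_score(y_true, y_pred))     # 1.0
print(roc_auc_score(y_true, y_score))   # area under the ROC curve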

11_intrusion_detection.py

# Binary classification code
# For dimensionality reduction you can either reduce each feature group
# (numeric/binary/nominal) separately or reduce all features together;
# both ICA and PCA versions are implemented below.
import pandas as pd
import numpy as np
from time import time
import os
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler  # install scipy package
from sklearn.metrics import accuracy_score, roc_curve, auc, roc_auc_score
from sklearn.model_selection import cross_validate
from sklearn import neighbors
from sklearn.metrics import confusion_matrix
from sklearn import svm
from sklearn import tree
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
import pickle
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.decomposition import FastICA
from xgboost import XGBClassifier, XGBRFClassifier

class IntrusionDetector:

    def __init__(self, train_data_path, test_kdd_path):
        self.train_kdd_path = train_data_path
        self.test_kdd_path = test_kdd_path

        self.train_kdd_data = []
        self.test_kdd_data = []

        self.train_kdd_numeric = []
        self.test_kdd_numeric = []

        self.train_kdd_binary = []
        self.test_kdd_binary = []

        self.train_kdd_nominal = []
        self.test_kdd_nominal = []

        self.train_kdd_label_2classes = []
        self.test_kdd_label_2classes = []
        #read data from file
        self.get_data()


    def get_data(self):
        col_names = ["duration","protocol_type","service","flag","src_bytes",
            "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
            "logged_in","num_compromised","root_shell","su_attempted","num_root",
            "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
            "is_host_login","is_guest_login","count","srv_count","serror_rate",
            "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
            "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
            "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
            "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
            "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]
        self.train_kdd_data = pd.read_csv(self.train_kdd_path, header=None, names = col_names)
        self.test_kdd_data = pd.read_csv(self.test_kdd_path, header=None, names = col_names)
        self.train_kdd_data.describe()

    # To reduce labels into "Normal" and "Abnormal"
    def get_2classes_labels(self):
        label_2class = self.train_kdd_data['label'].copy()
        self.train_kdd_label_2classes = label_2class.values.reshape((label_2class.shape[0], 1))

        label_2class = self.test_kdd_data['label'].copy()
        self.test_kdd_label_2classes = label_2class.values.reshape((label_2class.shape[0], 1))

    def preprocessor(self):
        # prepare 2 classes label for "abnormal" and "normal"
        self.get_2classes_labels()

        nominal_features = ["protocol_type", "service", "flag"]  # [1, 2, 3]
        binary_features = ["land", "logged_in", "root_shell", "su_attempted", "is_host_login", "is_guest_login",]  # [6, 11, 13, 14, 20, 21]
        numeric_features = [
            "duration", "src_bytes",
            "dst_bytes", "wrong_fragment", "urgent", "hot",
            "num_failed_logins", "num_compromised", "num_root",
            "num_file_creations", "num_shells", "num_access_files",
            "num_outbound_cmds", "count", "srv_count", "serror_rate",
            "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
            "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
            "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
            "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
            "dst_host_rerror_rate", "dst_host_srv_rerror_rate"
        ]

        #convert nominal features to numeric features
        #nominal features: ["protocol_type", "service", "flag"]
        self.train_kdd_nominal = self.train_kdd_data[nominal_features].astype(float)
        self.test_kdd_nominal = self.test_kdd_data[nominal_features].astype(float)
        # normalize
        # self.train_kdd_nominal = StandardScaler().fit_transform(self.train_kdd_nominal)
        # self.test_kdd_nominal = StandardScaler().fit_transform(self.test_kdd_nominal)

        self.train_kdd_binary = self.train_kdd_data[binary_features].astype(float)
        self.test_kdd_binary = self.test_kdd_data[binary_features].astype(float)
        # normalize
        # self.train_kdd_binary = StandardScaler().fit_transform(self.train_kdd_binary)
        # self.test_kdd_binary = StandardScaler().fit_transform(self.test_kdd_binary)

        # Standardize numeric features: fit the scaler on the training split
        # only, then apply the same transform to the test split
        self.train_kdd_numeric = self.train_kdd_data[numeric_features].astype(float)
        self.test_kdd_numeric = self.test_kdd_data[numeric_features].astype(float)
        scaler = StandardScaler().fit(self.train_kdd_numeric)
        self.train_kdd_numeric = scaler.transform(self.train_kdd_numeric)
        self.test_kdd_numeric = scaler.transform(self.test_kdd_numeric)

    def feature_reduction_ICA(self):

        numeric_ica = FastICA(n_components=11)
        numeric_ica = numeric_ica.fit(self.train_kdd_numeric)
        # numeric_pca = numeric_pca.fit(np.concatenate((self.train_kdd_numeric, self.test_kdd_numeric), axis=0))
        self.train_kdd_numeric = numeric_ica.transform(self.train_kdd_numeric)
        self.test_kdd_numeric = numeric_ica.transform(self.test_kdd_numeric)

        binary_features_ica = FastICA(n_components=5)
        # fit on the training split only, then reuse the fitted transform on test
        self.train_kdd_binary = binary_features_ica.fit_transform(self.train_kdd_binary)
        self.test_kdd_binary = binary_features_ica.transform(self.test_kdd_binary)

        nominal_features_ica = FastICA(n_components=2)
        self.train_kdd_nominal = nominal_features_ica.fit_transform(self.train_kdd_nominal)
        self.test_kdd_nominal = nominal_features_ica.transform(self.test_kdd_nominal)

    def feature_reduction_PCA(self):

        numeric_pca = sklearnPCA(n_components=14)
        numeric_pca = numeric_pca.fit(self.train_kdd_numeric)
        # numeric_pca = numeric_pca.fit(np.concatenate((self.train_kdd_numeric, self.test_kdd_numeric), axis=0))
        self.train_kdd_numeric = numeric_pca.transform(self.train_kdd_numeric)
        self.test_kdd_numeric = numeric_pca.transform(self.test_kdd_numeric)

        # binary_features_pca = sklearnPCA(n_components=5)
        # binary_features_pca = binary_features_pca.fit(np.concatenate((self.train_kdd_binary, self.test_kdd_binary), axis=0))
        # self.train_kdd_binary = binary_features_pca.transform(self.train_kdd_binary)
        # self.test_kdd_binary = binary_features_pca.transform(self.test_kdd_binary)
        # self.train_kdd_binary = binary_features_pca.fit_transform(self.train_kdd_binary)
        # self.test_kdd_binary = binary_features_pca.fit_transform(self.test_kdd_binary)

        # nominal_features_pca = sklearnPCA(n_components=2)
        # self.train_kdd_nominal = nominal_features_pca.fit_transform(self.train_kdd_nominal)
        # self.test_kdd_nominal = nominal_features_pca.fit_transform(self.test_kdd_nominal)

    def format_data(self):

        kdd_train_data = np.concatenate([self.train_kdd_numeric, self.train_kdd_binary, self.train_kdd_nominal], axis=1)
        kdd_test_data = np.concatenate([self.test_kdd_numeric, self.test_kdd_binary, self.test_kdd_nominal], axis=1)

        kdd_train_data = np.concatenate([kdd_train_data, self.train_kdd_label_2classes],axis=1)
        # kdd_test_data = np.concatenate([self.test_kdd_numeric, self.test_kdd_binary, self.test_kdd_nominal, self.test_kdd_label_2classes], axis=1)
        kdd_test_data = np.concatenate([kdd_test_data, self.test_kdd_label_2classes], axis=1)
        self.X_train, self.X_test, y_train, y_test = kdd_train_data[:, :-1], kdd_test_data[:, :-1], kdd_train_data[:,-1], kdd_test_data[:, -1]

        data_pca = sklearnPCA(n_components=15)
        data_pca = data_pca.fit(self.X_train)
        # numeric_pca = numeric_pca.fit(np.concatenate((self.train_kdd_numeric, self.test_kdd_numeric), axis=0))
        self.X_train = data_pca.transform(self.X_train)
        self.X_test = data_pca.transform(self.X_test)

        self.y_train = np.array(list(map(int, y_train)))
        self.y_test = np.array(list(map(int, y_test)))

    def predicting(self, model, model_name):
        # Predict
        predicts = model.predict(self.X_test)
        print("Classifier:")
        accuracy = accuracy_score(self.y_test, predicts)
        print("Accuracy: ", accuracy)

        model_roc_auc = roc_auc_score(self.y_test, predicts)
        print("Auc: ", model_roc_auc)
        fpr1_gnb, tpr1_gnb, thresholds1_gnb = roc_curve(self.y_test, model.predict_proba(self.X_test)[:, 1])

        con_matrix = confusion_matrix(self.y_test, predicts, labels=[0, 1])
        # con_matrix = confusion_matrix(y_test, predicts, labels=["normal.", "abnormal."])
        print("confusion matrix:")
        print(con_matrix)
        # class 0 ("normal") is treated as the positive class below
        precision = con_matrix[0][0] / (con_matrix[0][0] + con_matrix[1][0])
        recall = con_matrix[0][0] / (con_matrix[0][0] + con_matrix[0][1])
        tpr = recall
        fpr = con_matrix[1][0] / (con_matrix[1][0] + con_matrix[1][1])
        print("Precision:", precision)
        print("Recall:", recall)
        print("TPR:", tpr)
        print("FPR:", fpr)
        # scores = cross_val_score(model, self.X_train, self.y_train, cv=5)
        # print('Cross validation Score:', scores)

        plt.figure()
        plt.plot(fpr1_gnb, tpr1_gnb, label='%s Model  (area = %0.2f)' %(model_name, model_roc_auc) )

        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic')
        plt.legend(loc="lower right")
        plt.savefig('./img_all_data_reduce_dim/Log_ROC_%s' %model_name)
        plt.show()
    def boost_predicting(self, models, model_name):
        # Predict
        plt.figure()
        for i in range(3):
            predicts = models[i].predict(self.X_test[:10000])
            print("Classifier:")
            accuracy = accuracy_score(self.y_test[:10000], predicts)
            print("Accuracy: ", accuracy)

            model_roc_auc = roc_auc_score(self.y_test[:10000], predicts)
            print("Auc: ", model_roc_auc)
            fpr1_gnb, tpr1_gnb, thresholds1_gnb = roc_curve(self.y_test[:10000], models[i].predict_proba(self.X_test[:10000])[:, 1])

            con_matrix = confusion_matrix(self.y_test[:10000], predicts, labels=[0, 1])
            # con_matrix = confusion_matrix(y_test, predicts, labels=["normal.", "abnormal."])
            print("confusion matrix:")
            print(con_matrix)
            precision = con_matrix[0][0] / (con_matrix[0][0] + con_matrix[1][0])
            recall = con_matrix[0][0] / (con_matrix[0][0] + con_matrix[0][1])
            tpr = recall
            fpr = con_matrix[1][0] / (con_matrix[1][0] + con_matrix[1][1])
            print("Precision:", precision)
            print("Recall:", recall)
            print("TPR:", tpr)
            print("FPR:", fpr)
        # scores = cross_val_score(model, self.X_train, self.y_train, cv=5)
        # print('Cross validation Score:', scores)

            plt.plot(fpr1_gnb, tpr1_gnb, label='%s Model  (area = %0.2f)' %(model_name[i], model_roc_auc) )

        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic')
        plt.legend(loc="lower right")
        # plt.savefig('Log_ROC_boost')
        # plt.savefig('Log_ROC_%s' %model_name)
        plt.show()
    def SGD_Classifier(self):

        # Create a model; loss="log" gives logistic regression
        # (newer scikit-learn versions spell this loss "log_loss")
        model = SGDClassifier(loss="log")
        model.fit(self.X_train, self.y_train)

        # Predict
        self.predicting(model, "SGD")
    def bayes_classifier(self):
        # model_list = [GaussianNB, MultinomialNB, BaseDiscreteNB, BaseNB, BernoulliNB,ComplementNB]
        # Create a model
        model = GaussianNB()
        #Load classifier from Pickle
        # model=pickle.load(open("naivebayes.pickle", "rb"))
        # Train the model using the training sets
        model.fit(self.X_train, self.y_train)
        # with open('naivebayes.pickle','wb') as f:
        #     pickle.dump(model,f)

        # Predict
        self.predicting(model, "GNB")

    def knn_classifier(self):

        #Load classifier from Pickle
        # model=pickle.load(open("knearestneighbor.pickle", "rb"))
        model = neighbors.KNeighborsClassifier(n_neighbors=3)
        model.fit(self.X_train, self.y_train)
        with open('knearestneighbor.pickle','wb') as f:
            pickle.dump(model,f)
        print('model trained')

        # predict
        self.predicting(model, "KNN_3")

    def svm_classifier(self):
        # Create SVM classification object
        model = svm.SVC(kernel='rbf', C=0.1, verbose= True, probability=True)
        # model = svm.SVC(kernel='rbf', C=0.8, gamma=20, decision_function_shape='ovr', probability=True)
        model.fit(self.X_train[:50000], self.y_train[:50000])

        # Predict Output
        self.predicting(model, 'SVM')

    def decision_tree_classifier(self):
        # model = tree.DecisionTreeClassifier()
        model = tree.DecisionTreeClassifier(criterion="entropy")
        # print(tree.DecisionTreeClassifier.get_params(model))
        model.fit(self.X_train, self.y_train)
        self.predicting(model, "DTC")
    def random_forest_classifier(self):
        # model = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=35, criterion="entropy")
        model = ensemble.RandomForestClassifier()
        model.fit(self.X_train, self.y_train)
        self.predicting(model, "RFC")
    def adaboost_classifier(self):
        model = ensemble.AdaBoostClassifier()
        print(ensemble.AdaBoostClassifier.get_params(model))
        model.fit(self.X_train, self.y_train)
        self.predicting(model, "AdaBoost")
    def bagging_classifier(self):
        model = ensemble.BaggingClassifier()
        print(ensemble.BaggingClassifier.get_params(model))
        model.fit(self.X_train, self.y_train)
        self.predicting(model, "bagging")
    def XGBoost(self):
        model = XGBClassifier()
        model.fit(self.X_train, self.y_train)
        self.predicting(model, "XGBoost")
    def gradient_boosting_classifier(self):
        model = ensemble.GradientBoostingClassifier()
        model.fit(self.X_train, self.y_train)
        self.predicting(model, "grad_boost")
    def xgb_rf_classifier(self):
        model = XGBRFClassifier()
        model.fit(self.X_train, self.y_train)
        self.predicting(model, "XGBRF")
    def Boost(self):
        model1 = XGBClassifier()
        model1.fit(self.X_train, self.y_train)
        model2 = ensemble.GradientBoostingClassifier()
        model2.fit(self.X_train, self.y_train)
        model3 = XGBRFClassifier()
        model3.fit(self.X_train, self.y_train)

        self.boost_predicting([model1, model2, model3], ["XGBoost", "grad_boost", "XGBRF"])
    def voting(self):
        rfc = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=35, criterion="entropy")
        ada = ensemble.AdaBoostClassifier(n_estimators=75, learning_rate=1.5)
        etc = ensemble.GradientBoostingClassifier()
        model = ensemble.VotingClassifier(estimators=[('ada', ada), ('rfc', rfc), ('etc', etc)], voting='soft', weights=[2, 1, 3],n_jobs=1)
        model.fit(self.X_train, self.y_train)
        self.predicting(model, "Voting")
def main():
    # Data path
    cwd = os.getcwd()  # current directory path
    kdd_data_path_train = cwd + "/kddcup.data_10_percent_corrected.csv"
    kdd_data_path_test = cwd + "/corrected.csv"

    i_detector = IntrusionDetector(kdd_data_path_train, kdd_data_path_test)
    i_detector.preprocessor()
    # Two alternatives for per-feature-group dimensionality reduction:
    # i_detector.feature_reduction_ICA()
    # i_detector.feature_reduction_PCA()
    # format_data() below concatenates the groups, reduces all features
    # together with PCA, and converts the labels to integers
    i_detector.format_data()

    while (True):
        print("\n\n")
        print("0. SGD Classifier")
        print("1. Naive Bayes Classifier")
        print("2. SVM Classifier")
        print("3. KNN Classifier")
        print("4. Decision Tree Classifier")
        print("5. Random Forest Classifier")
        print("6. AdaBoost Classifier")
        print("7. Bagging Classifier")
        print("8. XGBoost Classifier")
        print("9. Gradient Boosting Classifier")
        print("10. XGB RF Classifier")
        print("11. Three Classifier")
        print("12. Voting Classifier")
        print("13. Quit")
        option = input("Please enter a value:")
        import time
        time1 = time.time()
        if option == "0":
            i_detector.SGD_Classifier()
        elif option == "1":
            # for j in range(1, 39):
            #     print("============================================")
            #     print("Now the %d-th features" %j)
            #     i_detector.format_data(j)
            #     i_detector.bayes_classifier()
            #     print("============================================")
            i_detector.bayes_classifier()
        elif option == "2":
            i_detector.svm_classifier()
        elif option == "3":
            i_detector.knn_classifier()
        elif option == "4":
            i_detector.decision_tree_classifier()
        elif option == "5":
            i_detector.random_forest_classifier()
        elif option == "6":
            i_detector.adaboost_classifier()
        elif option == "7":
            i_detector.bagging_classifier()
        elif option == "8":
            i_detector.XGBoost()
        elif option == "9":
            i_detector.gradient_boosting_classifier()
        elif option == "10":
            i_detector.xgb_rf_classifier()
        elif option == "11":
            i_detector.Boost()
        elif option == "12":
            i_detector.voting()
        elif option == "13":
            break
        print(time.time() - time1)


if __name__ == '__main__':
    main()

Sample results of the code (shown for the decision tree; the three-class results are similar).

The three-class version of this code is in the repository linked at the end; the principle is the same. Note that the SVM hyperparameters above are not well chosen, so its results are rather poor.
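
If you want to tune the SVM instead, here is a hedged sketch of a small grid search over C and gamma (the grid values are my own assumptions, not from this post; X_train and y_train stand for the arrays produced by format_data()):

from sklearn import svm
from sklearn.model_selection import GridSearchCV

# Illustrative grid; SVM training is slow, so search on a subsample
param_grid = {'C': [0.1, 1, 10], 'gamma': ['scale', 0.01, 0.1]}
search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=3, n_jobs=-1)
search.fit(X_train[:20000], y_train[:20000])
print(search.best_params_, search.best_score_)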

three_class_nn.py

# Three-class classification with a feed-forward neural network (DNN)
import pandas
from pylab import *
import numpy as np
import matplotlib.pyplot as plt
import keras as ks
from keras.layers import Dense, Activation
from keras.optimizers import Adam, SGD
from keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_curve, auc, roc_auc_score,
                             confusion_matrix)
from sklearn.preprocessing import label_binarize, StandardScaler

# save loss and acc
class LossHistory(ks.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.losses = {'batch': [], 'epoch': []}
        self.accuracy = {'batch': [], 'epoch': []}
        self.val_loss = {'batch': [], 'epoch': []}
        self.val_acc = {'batch': [], 'epoch': []}

    def on_batch_end(self, batch, logs={}):
        self.losses['batch'].append(logs.get('loss'))
        self.accuracy['batch'].append(logs.get('acc'))
        self.val_loss['batch'].append(logs.get('val_loss'))
        self.val_acc['batch'].append(logs.get('val_acc'))

    def on_epoch_end(self, batch, logs={}):
        self.losses['epoch'].append(logs.get('loss'))
        self.accuracy['epoch'].append(logs.get('acc'))
        self.val_loss['epoch'].append(logs.get('val_loss'))
        self.val_acc['epoch'].append(logs.get('val_acc'))

    def loss_plot(self, loss_type):
        iters = range(len(self.losses[loss_type]))
        # sample ~200 evenly spaced integer indices for plotting
        iindex = np.linspace(0, len(self.losses[loss_type]) - 1, 200).astype(int)
        iters = np.array(iters)
        plt.figure()
        # acc
        plt.plot(iters[iindex], np.array(self.accuracy[loss_type])[iindex], 'r', label='train acc')
        # loss
        plt.plot(iters[iindex], np.array(self.losses[loss_type])[iindex], 'g', label='train loss')
        if loss_type == 'epoch':
            # val_acc
            plt.plot(iters[iindex], np.array(self.val_acc[loss_type])[iindex], 'b', label='val acc')
            # val_loss
            plt.plot(iters[iindex], np.array(self.val_loss[loss_type])[iindex], 'k', label='val loss')
        plt.grid(True)
        plt.xlabel(loss_type)
        plt.ylabel('acc-loss')
        ylim(0, 1.5)
        plt.legend(loc="upper right")
        plt.savefig('fig.png')

        plt.show()

col_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]
kdd_data_10percent = pandas.read_csv("three_kddcup.data_10_percent_corrected.csv", header=None, names = col_names)
kdd_data_test = pandas.read_csv("three_corrected.csv", header=None, names = col_names)
kdd_data_10percent.describe()

features = kdd_data_10percent[col_names[:-1]].astype(float)
np_features = np.array(features)
test_features = kdd_data_test[col_names[:-1]].astype(float)
test_np_features = np.array(test_features)

# labels
labels = kdd_data_10percent['label'].copy()
np_labels = np.array(labels, dtype=float)  # np.float is deprecated; use float

test_labels = kdd_data_test['label'].copy()
test_np_labels = np.array(test_labels, dtype=float)


scaler = StandardScaler().fit(np_features)  # fit on the training data only
np_features = scaler.transform(np_features)
test_np_features = scaler.transform(test_np_features)


# one-hot (binarized) labels for the NN
labels_for_nn = label_binarize(np_labels, classes=[0, 1, 2])
np_labels_test_nn = label_binarize(test_np_labels, classes=[0, 1, 2])


train_data_num = int(len(np_features) * 0.9)


model = ks.models.Sequential()
model.add(Dense(16, input_dim=41))
model.add(Activation('relu'))
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dense(32))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(3))
model.add(Activation('softmax'))
history = LossHistory()

model.compile(loss='categorical_crossentropy',optimizer='adadelta',metrics=['accuracy'])
# model.fit(x=data_train,y=labels_train,batch_size=128,nb_epoch=5000,verbose=1,validation_data=(data_test,labels_test),callbacks=[history])
model.fit(x=np_features[:train_data_num], y=labels_for_nn[:train_data_num], batch_size=100, epochs=2, verbose=1, validation_data=(np_features[train_data_num:], labels_for_nn[train_data_num:]), callbacks=[history])
# history.loss_plot('epoch')

predicts = model.predict(test_np_features)
np_labels_test_nn = np.array(np_labels_test_nn)

fpr, tpr, roc_dict = {}, {}, {}
for i in range(3):
    fpr[i], tpr[i], _ = roc_curve(np_labels_test_nn[:, i], predicts[:, i])
    roc_dict[i] = auc(fpr[i], tpr[i])

plt.figure()
colors = ['aqua', 'darkorange', 'cornflowerblue']
classes = ['normal', 'smurf', 'others']
for i, color in zip(range(3), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,label='ROC curve of {0} (area = {1:0.2f})'.format(classes[i], roc_dict[i]))

plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
plt.savefig('3_Log_ROC_DNN')
plt.show()

The resulting ROC plot of the above code is shown below.

Full code: KDD-Cup99

ROC curve reference: "Plotting binary and multi-class ROC curves in Python"
