KDD Cup 99 Intrusion Detection in Python: Binary and Three-Class Classification

Only binary and three-class classification are implemented. The algorithms involved include KNN, SVM, AdaBoost, DNN, random forest, XGBoost, decision tree, and naive Bayes.

Dataset

The training set is kddcup.data_10_percent_corrected, and the test set is 3-corrected, a file of the same order of magnitude. The training set contains 494,021 records; the test set contains 311,029.
For the three-class task, this experiment only distinguishes "normal", "smurf", and everything else ("others"); for the binary task, "smurf" and the other attack categories are merged into a single "attack" class. The class distribution in the training set is shown in the table below:

Type     normal    smurf     others    total
Count    97278     280790    115953    494021
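
These counts can be verified directly from the raw training file with pandas (a minimal sketch; the file path is an assumption and should match your local copy):

import pandas as pd

# Column 41 of the raw 10% file holds the label string, e.g. 'normal.' or 'smurf.'
df = pd.read_csv('kddcup.data_10_percent_corrected', header=None)
labels = df[41]
normal = (labels == 'normal.').sum()
smurf = (labels == 'smurf.').sum()
print(normal, smurf, len(labels) - normal - smurf, len(labels))
# expected: 97278 280790 115953 494021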

For more background on the KDD dataset, see the linked reference.

Data Preprocessing

Because the dataset contains string-valued features, they first need to be mapped to discrete numeric values. To make it easier to build the DNN with a framework, one-hot encoding is also applied, but only for the DNN, so it is handled separately in the DNN code. Beyond that, data normalization and dimensionality reduction are also performed, both inside the classification code itself.
The preprocessing code is adapted from a reference implementation and modified to fit my needs.
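
For the DNN's one-hot encoding, here is a minimal sketch with pandas.get_dummies (the column indices follow the KDD layout; this is an illustration, not the exact code used in the DNN script):

import pandas as pd

df = pd.read_csv('kddcup.data_10_percent_corrected', header=None)
# Columns 1-3 are the symbolic features protocol_type, service, flag;
# get_dummies expands each into one indicator column per category.
df = pd.get_dummies(df, columns=[1, 2, 3])
print(df.shape)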

utils.py

import numpy as np
import csv
import time

def preHandel_data():
    source_file = './data/1-kddcup.data_10_percent_corrected'
    # source_file = '3-corrected.txt'
    handled_file = './data/three_kddcup.data_10_percent_corrected.csv'
    data_file = open(handled_file, 'w', newline='')
    with open(source_file, 'r') as data_source:
        csv_reader = csv.reader(data_source)
        csv_writer = csv.writer(data_file)
        count = 0  # row counter
        for row in csv_reader:
            temp_line = np.array(row)
            temp_line[1] = handleProtocol(row)   # protocol_type -> integer
            temp_line[2] = handleService(row)    # service -> integer
            temp_line[3] = handleFlag(row)       # flag -> integer
            temp_line[41] = handleLabel(row)     # label -> 0 / 1 / 2
            csv_writer.writerow(temp_line)
            count += 1

            print(count, 'status:', temp_line[1], temp_line[2], temp_line[3], temp_line[41])
        data_file.close()

# Map the 3 protocol types to integers
def handleProtocol(input):
    protocol_list = ['tcp', 'udp', 'icmp']
    if input[1] in protocol_list:
        return protocol_list.index(input[1])

# Map the 71 network service types to integers (the trailing 'icmp' covers a value that appears in the test data)
def handleService(input):
    service_list = ['aol', 'auth', 'bgp', 'courier', 'csnet_ns', 'ctf', 'daytime', 'discard', 'domain', 'domain_u',
                    'echo', 'eco_i', 'ecr_i', 'efs', 'exec', 'finger', 'ftp', 'ftp_data', 'gopher', 'harvest',
                    'hostnames',
                    'http', 'http_2784', 'http_443', 'http_8001', 'imap4', 'IRC', 'iso_tsap', 'klogin', 'kshell',
                    'ldap',
                    'link', 'login', 'mtp', 'name', 'netbios_dgm', 'netbios_ns', 'netbios_ssn', 'netstat', 'nnsp',
                    'nntp',
                    'ntp_u', 'other', 'pm_dump', 'pop_2', 'pop_3', 'printer', 'private', 'red_i', 'remote_job', 'rje',
                    'shell',
                    'smtp', 'sql_net', 'ssh', 'sunrpc', 'supdup', 'systat', 'telnet', 'tftp_u', 'tim_i', 'time',
                    'urh_i', 'urp_i',
                    'uucp', 'uucp_path', 'vmnet', 'whois', 'X11', 'Z39_50', 'icmp']
    if input[2] in service_list:
        return service_list.index(input[2])
    else:
        print("2", input)  # debug output for a service value missing from the list


# Map the 11 connection flag states to integers
def handleFlag(input):
    flag_list = ['OTH', 'REJ', 'RSTO', 'RSTOS0', 'RSTR', 'S0', 'S1', 'S2', 'S3', 'SF', 'SH']
    if input[3] in flag_list:
        return flag_list.index(input[3])
    else:
        print("3", input[3])  # debug output for an unknown flag value


# Map attack-type labels to numbers: 'normal.' -> 0, 'smurf.' -> 1, everything else -> 2
def handleLabel(input):
    label_list = ['normal.', 'buffer_overflow.', 'loadmodule.', 'perl.', 'neptune.', 'smurf.',
                  'guess_passwd.', 'pod.', 'teardrop.', 'portsweep.', 'ipsweep.', 'land.', 'ftp_write.',
                  'back.', 'imap.', 'satan.', 'phf.', 'nmap.', 'multihop.', 'warezmaster.', 'warezclient.',
                  'spy.', 'rootkit.']
    if input[41] == label_list[0]:
        return 0
    elif input[41] == label_list[5]:
        return 1
    else:
        return 2

if __name__ == '__main__':
    start_time = time.perf_counter()  # time.clock() was removed in Python 3.8
    preHandel_data()
    end_time = time.perf_counter()
    print("Running time:", (end_time - start_time))

Classification

The algorithms themselves are not explained here (in part because I do not fully understand them all); they are simply applied. The code depends on the sklearn and numpy packages, which you will need to install.

Evaluation metrics: accuracy, recall, precision, and the ROC curve.
The classification code is based on a reference implementation, modified to what I needed.
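
As a quick refresher on these metrics, a minimal sklearn sketch on toy labels (note that the predicting() method below computes precision and recall with class 0, i.e. "normal", as the positive class):

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

y_true = [0, 0, 1, 1, 1]
y_pred = [0, 1, 1, 1, 0]
print(accuracy_score(y_true, y_pred))    # (TP + TN) / total
print(precision_score(y_true, y_pred))   # TP / (TP + FP)
print(recall_score(y_true, y_pred))      # TP / (TP + FN)
print(confusion_matrix(y_true, y_pred, labels=[0, 1]))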

11_intrusion_detection.py

# Binary classification
# For dimensionality reduction, the feature groups can be reduced separately or all features can be reduced together; both ICA and PCA are implemented
import pandas as pd
import numpy as np
import time
import os
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler  # install scipy package
from sklearn.metrics import accuracy_score, roc_curve, auc, roc_auc_score
from sklearn.model_selection import cross_validate
from sklearn import neighbors
from sklearn.metrics import confusion_matrix
from sklearn import svm
from sklearn import tree
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
import pickle
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.decomposition import FastICA
from xgboost import XGBClassifier, XGBRFClassifier

class IntrusionDetector:

    def __init__(self, train_data_path, test_kdd_path):
        self.train_kdd_path = train_data_path
        self.test_kdd_path = test_kdd_path

        self.train_kdd_data = []
        self.test_kdd_data = []

        self.train_kdd_numeric = []
        self.test_kdd_numeric = []

        self.train_kdd_binary = []
        self.test_kdd_binary = []

        self.train_kdd_nominal = []
        self.test_kdd_nominal = []

        self.train_kdd_label_2classes = []
        self.test_kdd_label_2classes = []
        #read data from file
        self.get_data()


    def get_data(self):
        col_names = ["duration","protocol_type","service","flag","src_bytes",
            "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
            "logged_in","num_compromised","root_shell","su_attempted","num_root",
            "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
            "is_host_login","is_guest_login","count","srv_count","serror_rate",
            "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
            "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
            "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
            "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
            "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]
        self.train_kdd_data = pd.read_csv(self.train_kdd_path, header=None, names = col_names)
        self.test_kdd_data = pd.read_csv(self.test_kdd_path, header=None, names = col_names)
        self.train_kdd_data.describe()

    # Extract the (already numeric) labels for the two-class "normal" vs "abnormal" task
    def get_2classes_labels(self):
        label_2class = self.train_kdd_data['label'].copy()
        self.train_kdd_label_2classes = label_2class.values.reshape((label_2class.shape[0], 1))

        label_2class = self.test_kdd_data['label'].copy()
        self.test_kdd_label_2classes = label_2class.values.reshape((label_2class.shape[0], 1))

    def preprocessor(self):
        # prepare 2 classes label for "abnormal" and "normal"
        self.get_2classes_labels()

        nominal_features = ["protocol_type", "service", "flag"]  # [1, 2, 3]
        binary_features = ["land", "logged_in", "root_shell", "su_attempted", "is_host_login", "is_guest_login",]  # [6, 11, 13, 14, 20, 21]
        numeric_features = [
            "duration", "src_bytes",
            "dst_bytes", "wrong_fragment", "urgent", "hot",
            "num_failed_logins", "num_compromised", "num_root",
            "num_file_creations", "num_shells", "num_access_files",
            "num_outbound_cmds", "count", "srv_count", "serror_rate",
            "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
            "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
            "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
            "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
            "dst_host_rerror_rate", "dst_host_srv_rerror_rate"
        ]

        #convert nominal features to numeric features
        #nominal features: ["protocol_type", "service", "flag"]
        self.train_kdd_nominal = self.train_kdd_data[nominal_features].astype(float)
        self.test_kdd_nominal = self.test_kdd_data[nominal_features].astype(float)
        # normalize
        # self.train_kdd_nominal = StandardScaler().fit_transform(self.train_kdd_nominal)
        # self.test_kdd_nominal = StandardScaler().fit_transform(self.test_kdd_nominal)

        self.train_kdd_binary = self.train_kdd_data[binary_features].astype(float)
        self.test_kdd_binary = self.test_kdd_data[binary_features].astype(float)
        # normalize
        # self.train_kdd_binary = StandardScaler().fit_transform(self.train_kdd_binary)
        # self.test_kdd_binary = StandardScaler().fit_transform(self.test_kdd_binary)

        # Standardizing and scaling numeric features
        self.train_kdd_numeric = self.train_kdd_data[numeric_features].astype(float)
        self.test_kdd_numeric = self.test_kdd_data[numeric_features].astype(float)
        # normalize
        self.train_kdd_numeric = StandardScaler().fit_transform(self.train_kdd_numeric)
        self.test_kdd_numeric = StandardScaler().fit_transform(self.test_kdd_numeric)

    def feature_reduction_ICA(self):

        numeric_ica = FastICA(n_components=11)
        numeric_ica = numeric_ica.fit(self.train_kdd_numeric)
        # numeric_pca = numeric_pca.fit(np.concatenate((self.train_kdd_numeric, self.test_kdd_numeric), axis=0))
        self.train_kdd_numeric = numeric_ica.transform(self.train_kdd_numeric)
        self.test_kdd_numeric = numeric_ica.transform(self.test_kdd_numeric)

        binary_features_ica = FastICA(n_components=5)
        # binary_features_pca = binary_features_pca.fit(np.concatenate((self.train_kdd_binary, self.test_kdd_binary), axis=0))
        # self.train_kdd_binary = binary_features_pca.transform(self.train_kdd_binary)
        # self.test_kdd_binary = binary_features_pca.transform(self.test_kdd_binary)
        self.train_kdd_binary = binary_features_ica.fit_transform(self.train_kdd_binary)
        self.test_kdd_binary = binary_features_ica.fit_transform(self.test_kdd_binary)

        nominal_features_ica = FastICA(n_components=2)
        self.train_kdd_nominal = nominal_features_ica.fit_transform(self.train_kdd_nominal)
        self.test_kdd_nominal = nominal_features_ica.fit_transform(self.test_kdd_nominal)
    def feature_reduction_PCA(self):

        numeric_pca = sklearnPCA(n_components=14)
        numeric_pca = numeric_pca.fit(self.train_kdd_numeric)
        # numeric_pca = numeric_pca.fit(np.concatenate((self.train_kdd_numeric, self.test_kdd_numeric), axis=0))
        self.train_kdd_numeric = numeric_pca.transform(self.train_kdd_numeric)
        self.test_kdd_numeric = numeric_pca.transform(self.test_kdd_numeric)

        # binary_features_pca = sklearnPCA(n_components=5)
        # binary_features_pca = binary_features_pca.fit(np.concatenate((self.train_kdd_binary, self.test_kdd_binary), axis=0))
        # self.train_kdd_binary = binary_features_pca.transform(self.train_kdd_binary)
        # self.test_kdd_binary = binary_features_pca.transform(self.test_kdd_binary)
        # self.train_kdd_binary = binary_features_pca.fit_transform(self.train_kdd_binary)
        # self.test_kdd_binary = binary_features_pca.fit_transform(self.test_kdd_binary)

        # nominal_features_pca = sklearnPCA(n_components=2)
        # self.train_kdd_nominal = nominal_features_pca.fit_transform(self.train_kdd_nominal)
        # self.test_kdd_nominal = nominal_features_pca.fit_transform(self.test_kdd_nominal)

    def format_data(self):

        kdd_train_data = np.concatenate([self.train_kdd_numeric, self.train_kdd_binary, self.train_kdd_nominal], axis=1)
        kdd_test_data = np.concatenate([self.test_kdd_numeric, self.test_kdd_binary, self.test_kdd_nominal], axis=1)

        kdd_train_data = np.concatenate([kdd_train_data, self.train_kdd_label_2classes],axis=1)
        # kdd_test_data = np.concatenate([self.test_kdd_numeric, self.test_kdd_binary, self.test_kdd_nominal, self.test_kdd_label_2classes], axis=1)
        kdd_test_data = np.concatenate([kdd_test_data, self.test_kdd_label_2classes], axis=1)
        self.X_train, self.X_test, y_train, y_test = kdd_train_data[:, :-1], kdd_test_data[:, :-1], kdd_train_data[:,-1], kdd_test_data[:, -1]

        data_pca = sklearnPCA(n_components=15)
        data_pca = data_pca.fit(self.X_train)
        # numeric_pca = numeric_pca.fit(np.concatenate((self.train_kdd_numeric, self.test_kdd_numeric), axis=0))
        self.X_train = data_pca.transform(self.X_train)
        self.X_test = data_pca.transform(self.X_test)

        self.y_train = np.array(list(map(int, y_train)))
        self.y_test = np.array(list(map(np.int64, y_test)))

    def predicting(self, model, model_name):
        # Predict
        predicts = model.predict(self.X_test)
        print("Classifier:")
        accuracy = accuracy_score(self.y_test, predicts)
        print("Accuracy: ", accuracy)

        model_roc_auc = roc_auc_score(self.y_test, predicts)
        print("Auc: ", model_roc_auc)
        fpr1_gnb, tpr1_gnb, thresholds1_gnb = roc_curve(self.y_test, model.predict_proba(self.X_test)[:, 1])

        con_matrix = confusion_matrix(self.y_test, predicts, labels=[0, 1])
        # con_matrix = confusion_matrix(y_test, predicts, labels=["normal.", "abnormal."])
        print("confusion matrix:")
        print(con_matrix)
        precision = con_matrix[0][0] / (con_matrix[0][0] + con_matrix[1][0])
        recall = con_matrix[0][0] / (con_matrix[0][0] + con_matrix[0][1])
        tpr = recall
        fpr = con_matrix[1][0] / (con_matrix[1][0] + con_matrix[1][1])
        print("Precision:", precision)
        print("Recall:", recall)
        print("TPR:", tpr)
        print("FPR:", fpr)
        # scores = cross_val_score(model, self.X_train, self.y_train, cv=5)
        # print('Cross validation Score:', scores)

        plt.figure()
        plt.plot(fpr1_gnb, tpr1_gnb, label='%s Model  (area = %0.2f)' %(model_name, model_roc_auc) )

        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic')
        plt.legend(loc="lower right")
        plt.savefig('./img_all_data_reduce_dim/Log_ROC_%s' % model_name)  # this output directory must already exist
        plt.show()
    def boost_predicting(self, models, model_name):
        # Predict
        plt.figure()
        for i in range(3):
            predicts = models[i].predict(self.X_test[:10000])
            print("Classifier:")
            accuracy = accuracy_score(self.y_test[:10000], predicts)
            print("Accuracy: ", accuracy)

            model_roc_auc = roc_auc_score(self.y_test[:10000], predicts)
            print("Auc: ", model_roc_auc)
            fpr1_gnb, tpr1_gnb, thresholds1_gnb = roc_curve(self.y_test[:10000], models[i].predict_proba(self.X_test[:10000])[:, 1])

            con_matrix = confusion_matrix(self.y_test[:10000], predicts, labels=[0, 1])
            # con_matrix = confusion_matrix(y_test, predicts, labels=["normal.", "abnormal."])
            print("confusion matrix:")
            print(con_matrix)
            precision = con_matrix[0][0] / (con_matrix[0][0] + con_matrix[1][0])
            recall = con_matrix[0][0] / (con_matrix[0][0] + con_matrix[0][1])
            tpr = recall
            fpr = con_matrix[1][0] / (con_matrix[1][0] + con_matrix[1][1])
            print("Precision:", precision)
            print("Recall:", recall)
            print("TPR:", tpr)
            print("FPR:", fpr)
        # scores = cross_val_score(model, self.X_train, self.y_train, cv=5)
        # print('Cross validation Score:', scores)

            plt.plot(fpr1_gnb, tpr1_gnb, label='%s Model  (area = %0.2f)' %(model_name[i], model_roc_auc) )

        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic')
        plt.legend(loc="lower right")
        # plt.savefig('Log_ROC_boost')
        # plt.savefig('Log_ROC_%s' %model_name)
        plt.show()
    def SGD_Classifier(self):

        # Create a model (logistic regression trained with SGD)
        model = SGDClassifier(loss="log")  # use loss="log_loss" on scikit-learn >= 1.1
        model.fit(self.X_train, self.y_train)

        # Predict
        self.predicting(model, "SGD")
    def bayes_classifier(self):
        # model_list = [GaussianNB, MultinomialNB, BaseDiscreteNB, BaseNB, BernoulliNB,ComplementNB]
        # Create a model
        model = GaussianNB()
        #Load classifier from Pickle
        # model=pickle.load(open("naivebayes.pickle", "rb"))
        # Train the model using the training sets
        model.fit(self.X_train, self.y_train)
        # with open('naivebayes.pickle','wb') as f:
        #     pickle.dump(model,f)

        # Predict
        self.predicting(model, "CNB")

    def knn_classifier(self):

        #Load classifier from Pickle
        # model=pickle.load(open("knearestneighbor.pickle", "rb"))
        model = neighbors.KNeighborsClassifier(n_neighbors=3)
        model.fit(self.X_train, self.y_train)
        with open('knearestneighbor.pickle','wb') as f:
            pickle.dump(model,f)
        print('model trained')

        # predict
        self.predicting(model, "KNN_3")

    def svm_classifier(self):
        # Create SVM classification object
        model = svm.SVC(kernel='rbf', C=0.1, verbose=True, probability=True)
        # model = svm.SVC(kernel='rbf', C=0.8, gamma=20, decision_function_shape='ovr', probability=True)
        model.fit(self.X_train[:50000], self.y_train[:50000])

        # Predict Output
        self.predicting(model, 'SVM')

    def decision_tree_classifier(self):
        # model = tree.DecisionTreeClassifier()
        model = tree.DecisionTreeClassifier(criterion="entropy")
        # print(tree.DecisionTreeClassifier.get_params(model))
        model.fit(self.X_train, self.y_train)
        self.predicting(model, "DTC")
    def random_forest_classifier(self):
        # model = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=35, criterion="entropy")
        model = ensemble.RandomForestClassifier()
        model.fit(self.X_train, self.y_train)
        self.predicting(model, "RFC")
    def adaboost_classifier(self):
        model = ensemble.AdaBoostClassifier()
        print(ensemble.AdaBoostClassifier.get_params(model))
        model.fit(self.X_train, self.y_train)
        self.predicting(model, "AdaBoost")
    def bagging_classifier(self):
        model = ensemble.BaggingClassifier()
        print(ensemble.BaggingClassifier.get_params(model))
        model.fit(self.X_train, self.y_train)
        self.predicting(model, "bagging")
    def XGBoost(self):
        model = XGBClassifier()
        model.fit(self.X_train, self.y_train)
        self.predicting(model, "XGBoost")
    def gradient_boosting_classifier(self):
        model = ensemble.GradientBoostingClassifier()
        model.fit(self.X_train, self.y_train)
        self.predicting(model, "grad_boost")
    def xgb_rf_classifier(self):
        model = XGBRFClassifier()
        model.fit(self.X_train, self.y_train)
        self.predicting(model, "XGBRF")
    def Boost(self):
        model1 = XGBClassifier()
        model1.fit(self.X_train, self.y_train)
        model2 = ensemble.GradientBoostingClassifier()
        model2.fit(self.X_train, self.y_train)
        model3 = XGBRFClassifier()
        model3.fit(self.X_train, self.y_train)

        self.boost_predicting([model1, model2, model3], ["XGBoost", "grad_boost", "XGBRF"])
    def voting(self):
        rfc = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=35, criterion="entropy")
        ada = ensemble.AdaBoostClassifier(n_estimators=75, learning_rate=1.5)
        etc = ensemble.GradientBoostingClassifier()
        model = ensemble.VotingClassifier(estimators=[('ada', ada), ('rfc', rfc), ('etc', etc)], voting='soft', weights=[2, 1, 3],n_jobs=1)
        model.fit(self.X_train, self.y_train)
        self.predicting(model, "Voting")
def main():
    # Data path
    cwd = os.getcwd()  # current directory path
    kdd_data_path_train = cwd + "/kddcup.data_10_percent_corrected.csv"
    kdd_data_path_test = cwd + "/corrected.csv"

    i_detector = IntrusionDetector(kdd_data_path_train, kdd_data_path_test)
    i_detector.preprocessor()
    # Optional per-group dimensionality reduction (reduce each feature type separately):
    # i_detector.feature_reduction_ICA()
    # i_detector.feature_reduction_PCA()
    # format_data() concatenates the groups and applies PCA to all features together
    i_detector.format_data()

    while (True):
        print("\n\n")
        print("0. SGD Classifier")
        print("1. Naive Bayes Classifier")
        print("2. SVM Classifier")
        print("3. KNN Classifier")
        print("4. Decision Tree Classifier")
        print("5. Random Forest Classifier")
        print("6. AdaBoost Classifier")
        print("7. Bagging Classifier")
        print("8. XGBoost Classifier")
        print("9. Gradient Boosting Classifier")
        print("10. XGB RF Classifier")
        print("11. Three Classifier")
        print("12. Voting Classifier")
        print("13. Quit")
        option = input("Please enter a value:")
        time1 = time.time()
        if option == "0":
            i_detector.SGD_Classifier()
        elif option == "1":
            # for j in range(1, 39):
            #     print("============================================")
            #     print("Now the %d-th features" %j)
            #     i_detector.format_data(j)
            #     i_detector.bayes_classifier()
            #     print("============================================")
            i_detector.bayes_classifier()
        elif option == "2":
            i_detector.svm_classifier()
        elif option == "3":
            i_detector.knn_classifier()
        elif option == "4":
            i_detector.decision_tree_classifier()
        elif option == "5":
            i_detector.random_forest_classifier()
        elif option == "6":
            i_detector.adaboost_classifier()
        elif option == "7":
            i_detector.bagging_classifier()
        elif option == "8":
            i_detector.XGBoost()
        elif option == "9":
            i_detector.gradient_boosting_classifier()
        elif option == "10":
            i_detector.xgb_rf_classifier()
        elif option == "11":
            i_detector.Boost()
        elif option == "12":
            i_detector.voting()
        elif option == "13":
            break
        print(time.time() - time1)


if __name__ == '__main__':
    main()

Sample output of the code (shown for the decision tree; the three-class results look similar):

The three-class version of this code is in the repository linked at the end; the approach is identical. The SVM hyperparameters were not chosen well, so its results are not ideal.
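
If you want better SVM results, the usual fix is a small hyperparameter search. A hedged sketch, not what this post actually ran (the grid values are assumptions, and it reuses the i_detector instance from main()):

from sklearn import svm
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.1, 1, 10], 'gamma': ['scale', 0.01, 0.1]}
search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=3, n_jobs=-1)
# Fit on a subsample, as svm_classifier() does, to keep the runtime manageable.
search.fit(i_detector.X_train[:50000], i_detector.y_train[:50000])
print(search.best_params_, search.best_score_)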

three_class_nn.py

# Three-class classification with an artificial neural network
import pandas
from pylab import *
import keras as ks
from keras.layers import Dense, Activation
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.metrics import accuracy_score, roc_curve, auc, roc_auc_score, confusion_matrix
from sklearn.preprocessing import label_binarize, StandardScaler
from keras.optimizers import Adam, SGD  # the lowercase 'sgd' import is not valid in newer Keras
from keras.models import load_model

# save loss and acc during training (newer Keras logs 'accuracy'/'val_accuracy' instead of 'acc'/'val_acc')
class LossHistory(ks.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.losses = {'batch': [], 'epoch': []}
        self.accuracy = {'batch': [], 'epoch': []}
        self.val_loss = {'batch': [], 'epoch': []}
        self.val_acc = {'batch': [], 'epoch': []}

    def on_batch_end(self, batch, logs={}):
        self.losses['batch'].append(logs.get('loss'))
        self.accuracy['batch'].append(logs.get('acc'))
        self.val_loss['batch'].append(logs.get('val_loss'))
        self.val_acc['batch'].append(logs.get('val_acc'))

    def on_epoch_end(self, batch, logs={}):
        self.losses['epoch'].append(logs.get('loss'))
        self.accuracy['epoch'].append(logs.get('acc'))
        self.val_loss['epoch'].append(logs.get('val_loss'))
        self.val_acc['epoch'].append(logs.get('val_acc'))

    def loss_plot(self, loss_type):
        iters = np.arange(len(self.losses[loss_type]))
        # sample ~200 evenly spaced points so dense batch curves stay readable
        # (np.arange with a float step produces float indices, which cannot index arrays)
        iindex = np.linspace(0, len(self.losses[loss_type]) - 1, num=200, dtype=int)
        plt.figure()
        # acc
        plt.plot(iters[iindex], np.array(self.accuracy[loss_type])[iindex], 'r', label='train acc')
        # loss
        plt.plot(iters[iindex], np.array(self.losses[loss_type])[iindex], 'g', label='train loss')
        if loss_type == 'epoch':
            # val_acc
            plt.plot(iters[iindex], np.array(self.val_acc[loss_type])[iindex], 'b', label='val acc')
            # val_loss
            plt.plot(iters[iindex], np.array(self.val_loss[loss_type])[iindex], 'k', label='val loss')
        plt.grid(True)
        plt.xlabel(loss_type)
        plt.ylabel('acc-loss')
        ylim(0, 1.5)
        plt.legend(loc="upper right")
        plt.savefig('fig.png')

        plt.show()

col_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]
kdd_data_10percent = pandas.read_csv("three_kddcup.data_10_percent_corrected.csv", header=None, names = col_names)
kdd_data_test = pandas.read_csv("three_corrected.csv", header=None, names = col_names)
kdd_data_10percent.describe()

features = kdd_data_10percent[col_names[:-1]].astype(float)
np_features = np.array(features)
test_features = kdd_data_test[col_names[:-1]].astype(float)
test_np_features = np.array(test_features)

# labels
labels = kdd_data_10percent['label'].copy()
np_labels = np.array(labels)
np_labels = np.array(np_labels, dtype=float)  # np.float was removed from NumPy 1.24+

test_labels = kdd_data_test['label'].copy()
test_np_labels = np.array(test_labels)
test_np_labels = np.array(test_np_labels, dtype=float)


np_features = StandardScaler().fit_transform(np_features)
test_np_features = StandardScaler().fit_transform(test_np_features)


# data for Testing
labels_for_nn = label_binarize(np_labels, classes=[0, 1, 2])
np_labels_test_nn = label_binarize(test_np_labels, classes=[0, 1, 2])


train_data_num = int(len(np_features) * 0.9)


model = ks.models.Sequential()
model.add(Dense(16, input_dim=41))
model.add(Activation('relu'))
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dense(32))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(3))
model.add(Activation('softmax'))
history = LossHistory()

model.compile(loss='categorical_crossentropy',optimizer='adadelta',metrics=['accuracy'])
# model.fit(x=data_train,y=labels_train,batch_size=128,nb_epoch=5000,verbose=1,validation_data=(data_test,labels_test),callbacks=[history])
model.fit(x=np_features[:train_data_num], y=labels_for_nn[:train_data_num], batch_size=100, epochs=2, verbose=1, validation_data=(np_features[train_data_num:], labels_for_nn[train_data_num:]), callbacks=[history])  # 'nb_epoch' was renamed 'epochs'
# history.loss_plot('epoch')

predicts = model.predict(test_np_features)
np_labels_test_nn = np.array(np_labels_test_nn)

fpr, tpr, roc_dict = {}, {}, {}
for i in range(3):
    fpr[i], tpr[i], _ = roc_curve(np_labels_test_nn[:, i], predicts[:, i])
    roc_dict[i] = auc(fpr[i], tpr[i])

plt.figure()
colors = ['aqua', 'darkorange', 'cornflowerblue']
classes = ['normal', 'smurf', 'others']  # the third class groups all remaining attack types
for i, color in zip(range(3), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,label='ROC curve of {0} (area = {1:0.2f})'.format(classes[i], roc_dict[i]))

plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
plt.savefig('3_Log_ROC_DNN')
plt.show()

The resulting ROC plot for the DNN is shown below:

Full code: KDD-Cup99

ROC curve reference: plotting ROC curves for binary and multi-class classification in Python
