Implementing SVM classification and prediction models with sklearn

  • Classification model:

  • Linearly separable dataset

from sklearn import svm
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt

def loadDataSet(filename, delim="\t"):
    with open(filename) as f:   # close the file when done
        stringArr = [line.strip().split(delim) for line in f.readlines()]
    datArr = [list(map(float, line)) for line in stringArr]
    return np.array(datArr)     # plain ndarray, so label slices come out 1-D

def svc_classification(dataMat):
    x = dataMat[:, :2]
    y = dataMat[:, 2]

    # Split into training and test sets; the test set is 30% of the samples
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    clf = svm.SVC(kernel="linear")  # linear kernel; the SVC constructor builds the classifier
    clf.fit(x_train, y_train)       # train on the training set
    pre_train = clf.predict(x_train)    # predicted labels for the training set
    pre_test = clf.predict(x_test)      # predicted labels for the test set

    # Accuracy: the fraction of correctly classified samples in the training or test set
    print("Train Accuracy:%.4f\n" % metrics.accuracy_score(y_train, pre_train))
    print("Test  Accuracy:%.4f\n" % metrics.accuracy_score(y_test, pre_test))

    # Only the linear kernel exposes coef_, the coefficient vector w of the hyperplane w^T x + b = 0
    return x_train, y_train, clf.support_vectors_, clf.coef_, clf.intercept_

if __name__=="__main__":
    dataMat = loadDataSet("testSet.txt")
    x_train, y_train, support_vectors, w, b = svc_classification(dataMat)
    x = np.linspace(1, 8, 1000)
    y = -(w[0, 0] / w[0, 1]) * x - b / w[0, 1]   # separating hyperplane: w0*x + w1*y + b = 0  =>  y = -(w0/w1)*x - b/w1

    # Margin boundaries, where w^T x + b = +1 and w^T x + b = -1
    y1 = -(w[0, 0] / w[0, 1]) * x - b / w[0, 1] + 1 / w[0, 1]
    y_1 = -(w[0, 0] / w[0, 1]) * x - b / w[0, 1] - 1 / w[0, 1]

    plt.rcParams['xtick.direction'] = 'in'
    plt.rcParams['ytick.direction'] = 'in'

    for i in range(len(x_train)):
        if y_train[i] == 1:
            plt.scatter(x_train[i, 0], x_train[i, 1], marker="o", color="royalblue", s=80)
        else:
            plt.scatter(x_train[i, 0], x_train[i, 1], marker="s", color="royalblue", s=80)

    # Highlight the support vectors
    for j in range(len(support_vectors)):
        plt.scatter(support_vectors[j, 0], support_vectors[j, 1], marker="o", edgecolors="r", color="orange", s=80)

    plt.plot(x, y, linewidth=3, color="r")
    plt.plot(x, y1, linewidth=3, color="k", ls="--")
    plt.plot(x, y_1, linewidth=3, color="k", ls="--")

    ax = plt.gca()
    ax.spines['bottom'].set_linewidth(3)
    ax.spines['left'].set_linewidth(3)
    ax.spines['right'].set_linewidth(3)
    ax.spines['top'].set_linewidth(3)

    plt.show()

[Figure: training samples, support vectors highlighted in orange, the separating hyperplane (red solid) and the two margins (black dashed)]

Train Accuracy:1.0000

Test  Accuracy:1.0000
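
Because the linear kernel exposes w and b directly, the geometric margin between the two dashed lines can be read off as 2/||w||. A minimal sketch, reusing the w returned by svc_classification above:

margin = 2 / np.linalg.norm(w)   # distance between the two margin lines
print("margin width: %.4f" % margin)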
  • Nonlinearly separable dataset

A linear kernel is no longer appropriate here: on data that is not linearly separable its error rate rises sharply. Use the RBF (radial basis function) kernel instead.
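
To see the gap concretely, here is a minimal sketch on sklearn's synthetic make_circles data (not the testSetRBF2.txt set used below), comparing both kernels with default parameters:

from sklearn import svm
from sklearn.datasets import make_circles
from sklearn.model_selection import train_test_split

# Two concentric circles: linearly inseparable by construction
X, y = make_circles(n_samples=200, noise=0.1, factor=0.4, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

for kernel in ("linear", "rbf"):
    clf = svm.SVC(kernel=kernel).fit(X_train, y_train)
    print(kernel, "test accuracy: %.4f" % clf.score(X_test, y_test))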

from sklearn import svm
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt

def loadDataSet(filename, delim="\t"):
    with open(filename) as f:   # close the file when done
        stringArr = [line.strip().split(delim) for line in f.readlines()]
    datArr = [list(map(float, line)) for line in stringArr]
    return np.array(datArr)     # plain ndarray, so label slices come out 1-D

def svc_classification(dataMat):
    x = dataMat[:, :2]
    y = dataMat[:, 2]

    # Split into training and test sets; the test set is 30% of the samples
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    clf = svm.SVC(kernel="rbf")   # RBF (radial basis function) kernel
    clf.fit(x_train, y_train)     # train on the training set
    pre_train = clf.predict(x_train)    # predicted labels for the training set
    pre_test = clf.predict(x_test)      # predicted labels for the test set

    # Accuracy: the fraction of correctly classified samples in the training or test set
    print("Train Accuracy:%.4f\n" % metrics.accuracy_score(y_train, pre_train))
    print("Test  Accuracy:%.4f\n" % metrics.accuracy_score(y_test, pre_test))

    # coef_ exists only for the linear kernel, so here only the support vectors are returned
    return x_train, y_train, clf.support_vectors_

if __name__=="__main__":
    dataMat = loadDataSet("testSetRBF2.txt")
    x_train, y_train, support_vectors = svc_classification(dataMat)

    plt.rcParams['xtick.direction'] = 'in'
    plt.rcParams['ytick.direction'] = 'in'

    for i in range(len(x_train)):
        if y_train[i] == 1:
            plt.scatter(x_train[i, 0], x_train[i, 1], marker="^", color="royalblue", s=80)
        else:
            plt.scatter(x_train[i, 0], x_train[i, 1], marker="s", color="royalblue", s=80)

    # Highlight the support vectors
    for j in range(len(support_vectors)):
        plt.scatter(support_vectors[j, 0], support_vectors[j, 1], marker="o", edgecolors="r", color="orange", s=30)

    ax = plt.gca()
    ax.spines['bottom'].set_linewidth(3)
    ax.spines['left'].set_linewidth(3)
    ax.spines['right'].set_linewidth(3)
    ax.spines['top'].set_linewidth(3)

    plt.show()

[Figure: training samples of the nonlinearly separable set, with support vectors highlighted in orange]

Train Accuracy:0.9857

Test  Accuracy:0.9667
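
The RBF kernel's accuracy depends heavily on gamma and C, which the code above leaves at their defaults. A sketch of tuning them with GridSearchCV, assuming the x_train and y_train returned above (the grid values are illustrative):

from sklearn import svm
from sklearn.model_selection import GridSearchCV

param_grid = {"C": [0.1, 1, 10, 100], "gamma": [0.01, 0.1, 1, 10]}
search = GridSearchCV(svm.SVC(kernel="rbf"), param_grid, cv=5)
search.fit(x_train, y_train)   # x_train, y_train from svc_classification above
print("best params:", search.best_params_)
print("best CV accuracy: %.4f" % search.best_score_)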
  • Training and testing with 10-fold cross-validation

from sklearn import svm
from sklearn.model_selection import KFold
import numpy as np
from sklearn import metrics

def loadDataSet(filename, delim="\t"):
    with open(filename) as f:   # close the file when done
        stringArr = [line.strip().split(delim) for line in f.readlines()]
    datArr = [list(map(float, line)) for line in stringArr]
    return np.array(datArr)     # plain ndarray, so label slices come out 1-D

def svc_classification(dataMat):
    fold = 1          # fold counter (avoid shadowing the built-in name "iter")
    x = dataMat[:, :2]
    y = dataMat[:, 2]

    clf = svm.SVC(kernel="rbf")
    kf = KFold(n_splits=10)
    for train, test in kf.split(dataMat):
        clf.fit(x[train, :], y[train])        # train on this fold's training split
        pre_train = clf.predict(x[train, :])  # predicted labels for the training split
        pre_test = clf.predict(x[test, :])    # predicted labels for the held-out split

        # Accuracy: the fraction of correctly classified samples in each split
        print("Fold " + str(fold) + ":")
        print("Train Accuracy:%.4f" % metrics.accuracy_score(y[train], pre_train) + \
              "\tTest  Accuracy:%.4f\n" % metrics.accuracy_score(y[test], pre_test))
        fold = fold + 1

if __name__=="__main__":
    dataMat = loadDataSet("testSetRBF2.txt")
    svc_classification(dataMat)
Fold 1:
Train Accuracy:0.9556	Test  Accuracy:1.0000

Fold 2:
Train Accuracy:0.9778	Test  Accuracy:1.0000

Fold 3:
Train Accuracy:0.9778	Test  Accuracy:1.0000

Fold 4:
Train Accuracy:0.9667	Test  Accuracy:0.9000

Fold 5:
Train Accuracy:0.9667	Test  Accuracy:0.9000

Fold 6:
Train Accuracy:0.9667	Test  Accuracy:0.9000

Fold 7:
Train Accuracy:0.9444	Test  Accuracy:1.0000

Fold 8:
Train Accuracy:0.9778	Test  Accuracy:1.0000

Fold 9:
Train Accuracy:0.9778	Test  Accuracy:0.8000

Fold 10:
Train Accuracy:0.9667	Test  Accuracy:0.9000
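
The manual KFold loop above can be condensed with cross_val_score, which performs the splitting, fitting, and scoring in one call. A minimal equivalent sketch (test accuracy only), assuming x and y as built in svc_classification; note that for classifiers an integer cv uses stratified folds, so the splits differ slightly from the plain KFold above:

from sklearn import svm
from sklearn.model_selection import cross_val_score
import numpy as np

scores = cross_val_score(svm.SVC(kernel="rbf"), x, y, cv=10)
print("per-fold test accuracy:", np.round(scores, 4))
print("mean: %.4f (std: %.4f)" % (scores.mean(), scores.std()))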
  • Prediction model:

from sklearn import svm
import numpy as np
import matplotlib.pyplot as plt

X = np.sort(5 * np.random.rand(40, 1), axis=0)
y = np.sin(X).ravel()

# Add noise to every 5th target value, starting from the first (8 values in total)
y[::5] += 3 * (0.5 - np.random.rand(8))

# epsilon is the tolerance: errors within the epsilon-tube are not penalized;
# C is the regularization penalty on errors outside the tube. A larger C fits
# the training data more tightly, at the risk of overfitting.
# degree is the polynomial degree for the "poly" kernel.
svr_rbf = svm.SVR(kernel="rbf", C=100, gamma=0.1, epsilon=.1)
svr_lin = svm.SVR(kernel="linear", C=100, gamma="auto")
svr_poly = svm.SVR(kernel="poly", C=100, gamma="auto", degree=3,\
                   epsilon=.1, coef0=1)

lw = 2   # line width
svrs = [svr_rbf, svr_lin, svr_poly]
kernel_label = ['RBF', 'Linear', 'Polynomial']
model_color = ['m', 'c', 'g']

# sharey=True makes the subplots share the y-axis (default is False)
# subplots returns the figure (fig) and the array of axes (axes)
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 10), sharey=True)
for ix, svr in enumerate(svrs):
    axes[ix].plot(X, svr.fit(X, y).predict(X), color=model_color[ix], lw=lw,
                  label='{} model'.format(kernel_label[ix]))

    # support_ holds the indices of the support vectors
    axes[ix].scatter(X[svr.support_], y[svr.support_], facecolor="none",
                     edgecolor=model_color[ix], s=50,
                     label='{} support vectors'.format(kernel_label[ix]))

    # the remaining training points, i.e. the non-support vectors
    axes[ix].scatter(X[np.setdiff1d(np.arange(len(X)), svr.support_)],
                     y[np.setdiff1d(np.arange(len(X)), svr.support_)],
                     facecolor="none", edgecolor="k", s=50,
                     label='other training data')
    axes[ix].legend(loc='upper center', bbox_to_anchor=(0.5, 1.1),
                    ncol=1, fancybox=True, shadow=True)

fig.text(0.5, 0.04, 'data', ha='center', va='center')
fig.text(0.06, 0.5, 'target', ha='center', va='center', rotation='vertical')
fig.suptitle("Support Vector Regression", fontsize=14)
plt.show()

[Figure: SVR fits with the RBF, linear, and polynomial kernels; support vectors circled in each panel]
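
The visual comparison can be complemented with numeric scores. A short sketch, assuming the svrs and kernel_label lists from the block above (scored on the training data, since this toy example has no held-out set):

from sklearn.metrics import mean_squared_error, r2_score

for name, svr in zip(kernel_label, svrs):
    pred = svr.predict(X)   # the models were already fitted inside the plotting loop
    print("%-10s MSE: %.4f  R^2: %.4f" % (name, mean_squared_error(y, pred), r2_score(y, pred)))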
