Coursera NG 機器學習第六週 SVM分類 Spam Classifier Python 實現

原創

2020-06-09 03:28

SVM 分類

ex6.py

import numpy as np
from scipy.io import loadmat
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from ex6modules import *
 
# Example 1: linear SVM on the first 2-D data set
data = loadmat('ex6data1.mat')
X = data['X']
y = data['y'].ravel()
plotData(X, y)
plt.show()

# Plot the decision boundary for C=1
clf = SVC(C=1, kernel='linear')
clf.fit(X, y)
visualizeBoundaryLinear(X, y, clf)

# Plot the decision boundary for C=100
clf = SVC(C=100, kernel='linear')
clf.fit(X, y)
visualizeBoundaryLinear(X, y, clf)

# Examining the Gaussian kernel on a fixed pair of vectors
sim = gaussianKernel(np.array([1, 2, 1]), np.array([0, 4, -1]), sigma=2)
print("similarity :", sim)

# Example 2: RBF-kernel SVM on the second data set
data = loadmat('ex6data2.mat')
X = data['X']
y = data['y'].ravel()
plotData(X, y)
plt.show()

# SVC's gamma corresponds to 1/(2*sigma**2); gamma=50 is sigma=0.1
clf = SVC(C=1, kernel='rbf', gamma=50)
clf.fit(X, y)
visualizeBoundary(X, y, clf)

# Example 3: choose C and gamma using the validation split
data = loadmat('ex6data3.mat')
X = data['X']
y = data['y'].ravel()
Xval = data['Xval']
# Flatten the validation labels the same way as the training labels.
yval = data['yval'].ravel()
plotData(X, y)
plt.show()

Csteps = np.array([.01, .03, .1, .3, 1, 3, 10, 30])
# Derive the gamma grid from a sigma grid so the gamma = 1/(2*sigma**2)
# relationship is stated once instead of being expanded by hand.
sigmasteps = np.array([.01, .03, .1, .3, 1, 3, 10, 30])
gammasteps = 1 / (2 * sigmasteps ** 2)

Cmin, gammamin = findBest(Csteps, gammasteps, X, y, Xval, yval)

clf = SVC(C=Cmin, kernel='rbf', gamma=gammamin)
clf.fit(X, y)
visualizeBoundary(X, y, clf)

ex6modules.py

import matplotlib.pyplot as plt
import numpy as np
from sklearn.svm import SVC
 
def plotData(X,y):
    """Scatter-plot the first two columns of X, styled by label.

    Samples whose label equals 1 are drawn as black '+' markers; samples
    whose label equals 0 are drawn as yellow circles with a black edge.
    Nothing is shown here; the caller decides when to call ``plt.show()``.
    """
    marker_styles = (
        (1, {'c': 'k', 'marker': '+'}),
        (0, {'c': 'y', 'marker': 'o', 'edgecolors': 'k'}),
    )
    for label, style in marker_styles:
        rows = np.where(y == label)
        plt.scatter(X[rows, 0], X[rows, 1], **style)
 
def visualizeBoundary(X,y,clf):
    """Plot the data and the classifier's decision boundary, then show it.

    A 100x100 grid spanning the range of the two features in X is
    classified with ``clf.predict``; the boundary is the 0.5 contour of
    the predicted labels, drawn in blue.
    """
    plotData(X,y)
    lo = X.min(axis=0)
    hi = X.max(axis=0)
    X1, X2 = np.meshgrid(np.linspace(lo[0], hi[0], 100),
                         np.linspace(lo[1], hi[1], 100))
    # Classify every grid point in one call, then fold the labels back
    # into the grid's shape (stored as floats for contouring).
    grid_points = np.column_stack((X1.ravel(), X2.ravel()))
    vals = clf.predict(grid_points).reshape(X1.shape).astype(float)
    plt.contour(X1, X2, vals,[0.5],colors='blue')
    plt.show()
 
def visualizeBoundaryLinear(X,y,clf):
    """Plot the data and the straight decision boundary of a linear SVM.

    Parameters:
        X: sample matrix; columns 0 and 1 are the two plotted features.
        y: labels passed through to ``plotData``.
        clf: fitted classifier exposing ``coef_`` and ``intercept_``
            (for ``SVC`` these exist only with ``kernel='linear'``).

    The boundary satisfies w0*x1 + w1*x2 + b = 0, so it is drawn as
    x2 = -(w0*x1 + b) / w1 over the observed range of the first feature.
    """
    w = clf.coef_[0]
    b = clf.intercept_[0]
    # Both ends of the x-range must come from column 0; taking the upper
    # bound from column 1 would stretch or truncate the line.
    xp = np.linspace(np.min(X[:, 0]), np.max(X[:, 0]), 100)
    yp = -(w[0] * xp + b) / w[1]
    plotData(X, y)
    plt.plot(xp, yp, c='b', linewidth=0.5)
    plt.show()
 
def gaussianKernel(x1,x2,sigma):
    """Return the Gaussian (RBF) similarity exp(-||x1 - x2||^2 / (2*sigma^2)).

    Parameters:
        x1, x2: array-likes of equal shape (NumPy arrays, lists or tuples).
        sigma: kernel bandwidth.

    Returns:
        The similarity as a NumPy float; 1.0 when x1 equals x2.
    """
    # Convert to float first so plain sequences are accepted and
    # unsigned-integer inputs cannot wrap around when subtracted.
    diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
    return np.exp(-np.sum(np.square(diff)) / (2 * sigma ** 2))
 
def findBest(Csteps,gammasteps,X,y,Xval,yval):
    """Grid-search C and gamma for an RBF SVC using a validation set.

    For every (C, gamma) pair an ``SVC`` is fitted on (X, y) and its
    misclassification rate on (Xval, yval) is recorded.

    Parameters:
        Csteps: sequence of candidate C values.
        gammasteps: sequence of candidate gamma values.
        X, y: training samples and labels.
        Xval, yval: validation samples and labels (yval may be a column
            vector; it is flattened here).

    Returns:
        (Cmin, gammamin): the pair with the lowest validation error; on
        ties the first pair in row-major (C-outer, gamma-inner) order.
    """
    yval = np.ravel(yval)
    errors = np.zeros((len(Csteps), len(gammasteps)))
    for i, c_value in enumerate(Csteps):
        for j, gamma_value in enumerate(gammasteps):
            clf = SVC(C=c_value, kernel='rbf', gamma=gamma_value)
            clf.fit(X, y)
            errors[i, j] = 1 - clf.score(Xval, yval)
    # Decode the flat argmin with the array's real shape; dividing by the
    # number of C values is only right when both grids have equal length.
    best_i, best_j = np.unravel_index(np.argmin(errors), errors.shape)
    Cmin = Csteps[best_i]
    gammamin = gammasteps[best_j]
    return Cmin,gammamin

這裏要注意的是：sklearn.svm.SVC函數的參數gamma=1/(2*sigma**2) 。

C=1 時，相當於正則化參數 lambda 較大，可以緩解 overfitting；C=100 時，相當於 lambda 較小，容易出現 overfitting 問題。

利用高斯核來分離非線性數據；通過設置不同的 C 和 sigma，找到最優的分類器。

Spam Classifier 垃圾郵件分類

作業裏面原理很簡單的一個簡化版spam classifier，樸素貝葉斯+SVM，高級的還是得rnn。

由於郵件預處理需要調用課件裏面的porterStemmer處理派生詞的函數，重寫這個函數有點難度，所以就直接把處理後的郵件輸出到一個txt裏面，再用python讀。

porterStemmer.m

% Preprocess one raw email and write its stemmed tokens, one per line,
% to a text file that can be read back from Python.
email_contents = readFile('spamSample2.txt');

% Load Vocabulary
vocabList = getVocabList();

% Process Email: lower-case, then replace HTML tags, numbers, URLs,
% email addresses and dollar signs with placeholders
email_contents = lower(email_contents);
email_contents = regexprep(email_contents, '<[^<>]+>', ' ');
email_contents = regexprep(email_contents, '[0-9]+', 'number');
email_contents = regexprep(email_contents, ...
                           '(http|https)://[^\s]*', 'httpaddr');
email_contents = regexprep(email_contents, '[^\s]+@[^\s]+', 'emailaddr');
email_contents = regexprep(email_contents, '[$]+', 'dollar');

fprintf('\n==== Processed Email ====\n\n');

% Output cell array, grown one token at a time. It starts empty so that
% it only ever contains real tokens (no placeholder entries).
ProcessedEmail = {};

while ~isempty(email_contents)

    % Tokenize and also get rid of any punctuation
    [str, email_contents] = ...
       strtok(email_contents, ...
              [' @$/#.-:&*+=[]?!(){},''">_<;%' char(10) char(13)]);

    % Remove any non alphanumeric characters
    str = regexprep(str, '[^a-zA-Z0-9]', '');

    % Stem the word
    % (the porterStemmer sometimes has issues, so we use a try catch block)
    try
        str = porterStemmer(strtrim(str));
    catch
        str = '';
        continue;
    end

    % Skip the word if it is too short
    if length(str) < 1
       continue;
    end
    ProcessedEmail{end+1, 1} = str;
end

% Write processed Email, one token per line
fid = fopen('PYspamSample2.txt', 'w');
if fid == -1
    error('Could not open PYspamSample2.txt for writing.');
end
for i = 1:numel(ProcessedEmail)
    fprintf(fid, '%s\n', ProcessedEmail{i});
end
% Close the file so the output is flushed and the handle is released
fclose(fid);

ex6spam.py

from ex6spamModules import *
from scipy.io import loadmat
from sklearn.svm import SVC

# Convert each preprocessed sample email into a 1 x n feature row
x_emailSample1 = EmailToFeatures('PYemailSample1.txt')
x_emailSample2 = EmailToFeatures('PYemailSample2.txt')
x_spamSample1 = EmailToFeatures('PYspamSample1.txt')
x_spamSample2 = EmailToFeatures('PYspamSample2.txt')

# Load the training and test sets; labels are flattened to 1-D
dataTrain, dataTest = loadmat('spamTrain.mat'), loadmat('spamTest.mat')
X, y = dataTrain['X'], dataTrain['y'].ravel()
Xtest, ytest = dataTest['Xtest'], dataTest['ytest'].ravel()

# Linear SVM; fit() returns the fitted estimator itself
clf = SVC(C=0.1, kernel='linear').fit(X, y)

print("Traning Accuracy: ", clf.score(X, y))
print("Test Accuracy: ", clf.score(Xtest, ytest))

# Predicted label for each of the four sample emails, in the same order
for sample in (x_emailSample1, x_emailSample2, x_spamSample1, x_spamSample2):
    print(clf.predict(sample))

ex6spamModules.py

import re
import numpy as np

def processVocab():
    """Read the vocabulary words from ``vocab.txt`` in file order.

    Each non-blank line is expected to hold a numeric index, a TAB, and
    the word. Only the word (the second TAB-separated field) is kept, so
    words that themselves contain digits are preserved. Blank lines are
    skipped.

    Returns:
        list of str: the vocabulary words.

    Raises:
        IndexError: if a non-blank line has no TAB-separated word field.
    """
    vocab = []
    with open('vocab.txt') as f:
        for line in f:
            line = line.rstrip('\r\n')
            if not line:
                continue
            vocab.append(line.split('\t')[1])
    return vocab

def processEmail(filename):
    """Read a preprocessed email (one token per line, as written by MATLAB).

    Lines consisting only of a newline are skipped; every other line is
    returned with its trailing newline removed.

    Returns:
        list of str: the tokens in file order.
    """
    with open(filename) as handle:
        rows = handle.readlines()
    return [row.split('\n')[0] for row in rows if row != '\n']

def findWordIndex(words,vocabs):
    """Map each email word to its position in the vocabulary.

    Parameters:
        words: iterable of words from the email (list or NumPy array).
        vocabs: sequence of vocabulary words (list or NumPy array).

    Returns:
        list of int: for every word found in ``vocabs``, in email order,
        the index of its first occurrence in ``vocabs``. Words missing
        from the vocabulary are skipped.
    """
    # Build the word -> index table once instead of scanning the whole
    # vocabulary for every word; setdefault keeps the first occurrence.
    position_of = {}
    for position, vocab_word in enumerate(vocabs):
        position_of.setdefault(vocab_word, position)
    return [position_of[word] for word in words if word in position_of]

def emailFeatures(word_indices,vocabs):
    """Build a binary bag-of-words row vector for one email.

    Parameters:
        word_indices: vocabulary indices of the words present in the email.
        vocabs: NumPy array of vocabulary words; its length sets the
            feature count n.

    Returns:
        A float array of shape (1, n) with 1 at every listed index and 0
        elsewhere.
    """
    vocab_size = vocabs.shape[0]
    features = np.zeros((1, vocab_size))
    for position in word_indices:
        features[0, position] = 1
    return features

def EmailToFeatures(filename):
    """Turn a preprocessed email file into a (1, n) binary feature vector.

    Reads the email tokens from ``filename`` and the vocabulary via
    ``processVocab``, looks up each token's vocabulary index, and sets
    the matching feature entries to 1.
    """
    email_tokens = np.array(processEmail(filename))
    vocabulary = np.array(processVocab())
    return emailFeatures(findWordIndex(email_tokens, vocabulary), vocabulary)

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

Coursera NG 機器學習第六週 SVM分類 Spam Classifier Python 實現

探究職業發展的關鍵：能力模型解讀

如何在低代碼平臺中引用 JavaScript ？

高效率使用windows

智能決策新時代：可視化大屏是否能夠超越傳統白板？

解密Prompt系列28. LLM Agent之金融領域摸索：FinMem & FinAgent

分享幾個.NET開源的AI和LLM相關項目框架

Coursera NG 機器學習第六週 SVM分類 Spam Classifier Python 實現

Coursera NG 機器學習第八週異常檢測推薦系統 Python實現

Coursera NG 機器學習第三週手寫識別邏輯迴歸神經網 Python實現

cs224n Lecture 2 ：Word2Vec Skip-Gram CBOW Negative Sampling 總結

Coursera NG 機器學習第七週 KMeans PCA 圖像壓縮 Python實現

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結

Coursera NG 機器學習 第六週 SVM分類 Spam Classifier Python 實現

Coursera NG 機器學習第六週 SVM分類 Spam Classifier Python 實現