Coursera NG 機器學習 第六週 SVM分類 Spam Classifier Python 實現

SVM 分類

ex6.py

import numpy as np
from scipy.io import loadmat
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from ex6modules import *
 
# Example 1: linear-kernel SVM on ex6data1.mat
data = loadmat('ex6data1.mat')
X = data['X']
y = data['y'].ravel()
plotData(X, y)
plt.show()

# Plot the decision boundary obtained with C=1
clf = SVC(C=1, kernel='linear')
clf.fit(X, y)
visualizeBoundaryLinear(X, y, clf)

# Plot the decision boundary obtained with C=100
clf = SVC(C=100, kernel='linear')
clf.fit(X, y)
visualizeBoundaryLinear(X, y, clf)

# Examine the Gaussian kernel on a fixed pair of vectors
sim = gaussianKernel(np.array([1, 2, 1]), np.array([0, 4, -1]), sigma=2)
print("similarity :", sim)

# Example 2: RBF-kernel SVM on ex6data2.mat
data = loadmat('ex6data2.mat')
X = data['X']
y = data['y'].ravel()
plotData(X, y)
plt.show()

# gamma = 1/(2*sigma**2), so gamma=50 corresponds to sigma=0.1
clf = SVC(C=1, kernel='rbf', gamma=50)
clf.fit(X, y)
visualizeBoundary(X, y, clf)

# Example 3: choose C and gamma with the validation split of ex6data3.mat
data = loadmat('ex6data3.mat')
X = data['X']
y = data['y'].ravel()
Xval = data['Xval']
# Flatten the validation labels the same way as the training labels.
yval = data['yval'].ravel()
plotData(X, y)
plt.show()

Csteps = np.array([.01, .03, .1, .3, 1, 3, 10, 30])
# Candidate sigma values; each is converted to the gamma expected by SVC.
sigmasteps = [.01, .03, .1, .3, 1, 3, 10, 30]
gammasteps = np.array([1 / (2 * sigma ** 2) for sigma in sigmasteps])

Cmin, gammamin = findBest(Csteps, gammasteps, X, y, Xval, yval)

# Refit with the selected hyper-parameters and show the resulting boundary
clf = SVC(C=Cmin, kernel='rbf', gamma=gammamin)
clf.fit(X, y)
visualizeBoundary(X, y, clf)

ex6modules.py 

import matplotlib.pyplot as plt
import numpy as np
from sklearn.svm import SVC
 
def plotData(X, y):
    """Scatter the first two columns of X, styled by the label in y.

    Rows whose label equals 1 are drawn as black '+' markers; rows whose
    label equals 0 are drawn as yellow circles with a black edge. The
    figure is not shown here; the caller decides when to call plt.show().
    """
    marker_styles = (
        (1, {'c': 'k', 'marker': '+'}),
        (0, {'c': 'y', 'marker': 'o', 'edgecolors': 'k'}),
    )
    for label, style in marker_styles:
        rows = np.where(y == label)
        plt.scatter(X[rows, 0], X[rows, 1], **style)
 
def visualizeBoundary(X, y, clf):
    """Plot the data and the decision boundary of a fitted classifier.

    A 100x100 grid spanning the min/max of the first two columns of X is
    classified column by column with clf.predict, and the 0.5 level of the
    predictions is drawn as a blue contour. The figure is then shown.
    """
    plotData(X, y)

    axis1 = np.linspace(np.min(X[:, 0]), np.max(X[:, 0]), 100)
    axis2 = np.linspace(np.min(X[:, 1]), np.max(X[:, 1]), 100)
    grid1, grid2 = np.meshgrid(axis1, axis2)

    predicted = np.zeros(grid1.shape)
    # Walk the grid one column at a time (transposing yields the columns).
    for col, (col1, col2) in enumerate(zip(grid1.T, grid2.T)):
        predicted[:, col] = clf.predict(np.column_stack((col1, col2)))

    plt.contour(grid1, grid2, predicted, [0.5], colors='blue')
    plt.show()
 
def visualizeBoundaryLinear(X, y, clf):
    """Plot the data and the straight decision boundary of a linear classifier.

    The boundary is the line w1*x1 + w2*x2 + b = 0, where (w1, w2) is the
    first row of clf.coef_ and b is the first entry of clf.intercept_.
    It is drawn across the range of the first feature and the figure is
    then shown.

    Note: w2 is used as a divisor, so a boundary with w2 == 0 (a vertical
    line) cannot be drawn by this function.
    """
    w1, w2 = clf.coef_[0][0], clf.coef_[0][1]
    b = clf.intercept_[0]
    # Both ends of the x-range come from the first feature column; the
    # upper bound previously used the second column by mistake.
    xp = np.linspace(np.min(X[:, 0]), np.max(X[:, 0]), 100)
    yp = -(w1 * xp + b) / w2
    plotData(X, y)
    plt.plot(xp, yp, c='b', linewidth=0.5)
    plt.show()
 
def gaussianKernel(x1, x2, sigma):
    """Return the Gaussian (RBF) similarity exp(-||x1 - x2||^2 / (2*sigma^2))."""
    difference = x1 - x2
    squared_distance = np.sum(np.square(difference))
    bandwidth = 2 * sigma ** 2
    return np.exp(-squared_distance / bandwidth)
 
def findBest(Csteps, gammasteps, X, y, Xval, yval):
    """Grid-search C and gamma for an RBF-kernel SVC on a validation set.

    For every (C, gamma) pair an SVC is fitted on (X, y) and its error
    rate, 1 - accuracy, is measured on (Xval, yval).

    Parameters:
        Csteps: sequence of candidate C values.
        gammasteps: sequence of candidate gamma values.
        X, y: training samples and labels.
        Xval, yval: validation samples and labels.

    Returns:
        (Cmin, gammamin): the pair with the lowest validation error. Ties
        go to the first pair in row-major (C outer, gamma inner) order.
    """
    errors = np.zeros((len(Csteps), len(gammasteps)))
    # Index by position rather than by value lookup, so duplicate or
    # inexactly-equal grid values cannot write to the wrong cell.
    for i, C in enumerate(Csteps):
        for j, gamma in enumerate(gammasteps):
            clf = SVC(C=C, kernel='rbf', gamma=gamma)
            clf.fit(X, y)
            errors[i, j] = 1 - clf.score(Xval, yval)
    # argmin returns a flat index; unravel it with the real grid shape so
    # grids with different numbers of C and gamma values decode correctly.
    best_i, best_j = np.unravel_index(np.argmin(errors), errors.shape)
    Cmin = Csteps[best_i]
    gammamin = gammasteps[best_j]
    return Cmin, gammamin

這裏要注意的是:sklearn.svm.SVC 的參數 gamma 與高斯核的 sigma 之間的關係為 gamma = 1/(2*sigma**2)。

左圖:C=1 時,相當於正則化參數 lambda 很大(C 的作用類似 1/lambda),可以緩解 overfitting。右圖:C=100 時,存在 overfitting 問題。

 

左圖:利用高斯核來分離非線性數據。右圖:通過設置不同的 C 和 sigma,找到最優的分類器。


Spam Classifier 垃圾郵件分類

作業裏面是一個原理很簡單的簡化版 spam classifier:先用單詞表把郵件轉成 0/1 的詞袋特徵向量,再用線性 SVM 分類;更高級的做法還是得用 RNN。

由於郵件預處理需要調用課件裏面的porterStemmer處理派生詞的函數,重寫這個函數有點難度,所以就直接把處理後的郵件輸出到一個txt裏面,再用python讀。

porterStemmer.m

% Pre-process one e-mail and write its stemmed tokens, one per line, to a
% text file so that they can be read back from Python.
email_contents = readFile('spamSample2.txt');

% Load Vocabulary
vocabList = getVocabList();

% Process Email
% Lower-case everything
email_contents = lower(email_contents);
% Replace anything that looks like an HTML tag with a space
email_contents = regexprep(email_contents, '<[^<>]+>', ' ');
% Replace every run of digits with the word 'number'
email_contents = regexprep(email_contents, '[0-9]+', 'number');
% Replace http/https URLs with the word 'httpaddr'
email_contents = regexprep(email_contents, ...
                           '(http|https)://[^\s]*', 'httpaddr');
% Replace strings containing '@' with the word 'emailaddr'
email_contents = regexprep(email_contents, '[^\s]+@[^\s]+', 'emailaddr');
% Replace runs of '$' with the word 'dollar'
email_contents = regexprep(email_contents, '[$]+', 'dollar');

fprintf('\n==== Processed Email ====\n\n');

% Create output cell. It starts empty and grows by one entry per token, so
% no empty placeholder cells (blank output lines) end up in the file.
ProcessedEmail = cell(0, 1);

while ~isempty(email_contents)

    % Tokenize and also get rid of any punctuation
    [str, email_contents] = ...
       strtok(email_contents, ...
              [' @$/#.-:&*+=[]?!(){},''">_<;%' char(10) char(13)]);

    % Remove any non alphanumeric characters
    str = regexprep(str, '[^a-zA-Z0-9]', '');

    % Stem the word
    % (the porterStemmer sometimes has issues, so we use a try catch block)
    try
        str = porterStemmer(strtrim(str));
    catch
        str = '';
        continue;
    end

    % Skip the word if it is too short
    if length(str) < 1
       continue;
    end
    ProcessedEmail{end+1, 1} = str;
end

% Write processed Email
fid = fopen('PYspamSample2.txt', 'w');
if fid == -1
    error('Could not open PYspamSample2.txt for writing.');
end
for i = 1:size(ProcessedEmail, 1)
    fprintf(fid, '%s\n', ProcessedEmail{i});
end
% Close the file so that buffered output is flushed and the handle released
fclose(fid);

ex6spam.py

from ex6spamModules import *
from scipy.io import loadmat
from sklearn.svm import SVC

# Feature vectors for the four pre-processed sample messages.
x_emailSample1 = EmailToFeatures('PYemailSample1.txt')
x_emailSample2 = EmailToFeatures('PYemailSample2.txt')
x_spamSample1 = EmailToFeatures('PYspamSample1.txt')
x_spamSample2 = EmailToFeatures('PYspamSample2.txt')

# Training and test sets shipped with the exercise.
dataTrain = loadmat('spamTrain.mat')
dataTest = loadmat('spamTest.mat')
X, y = dataTrain['X'], dataTrain['y'].ravel()
Xtest, ytest = dataTest['Xtest'], dataTest['ytest'].ravel()

# Linear SVM fitted on the training set.
clf = SVC(kernel='linear', C=0.1)
clf.fit(X, y)

print("Traning Accuracy: ", clf.score(X, y))
print("Test Accuracy: ", clf.score(Xtest, ytest))

# Predicted label for each sample message, in the order they were loaded.
for sample in (x_emailSample1, x_emailSample2, x_spamSample1, x_spamSample2):
    print(clf.predict(sample))

ex6spamModules.py

import re
import numpy as np

def processVocab():
    """Read the vocabulary words from 'vocab.txt' in the working directory.

    Each non-blank line must hold an index and a word separated by a tab;
    the word (second tab-separated field) is kept, in file order. Blank
    lines are skipped. A non-blank line without a tab raises IndexError.

    Returns:
        list of str: the vocabulary words.
    """
    vocab = []
    with open('vocab.txt') as f:
        for line in f:
            line = line.rstrip('\n')
            if not line:
                continue
            # Take the word field as-is rather than deleting every digit
            # on the line, which would corrupt words that contain digits.
            vocab.append(line.split('\t')[1])
    return vocab

def processEmail(filename):
    """Read the tokens of an e-mail exported by the MATLAB script.

    The file holds one token per line. Lines that consist only of a
    newline are skipped; every other line contributes its text up to the
    first newline character.

    Returns:
        list of str: the tokens in file order.
    """
    with open(filename) as handle:
        return [raw.partition('\n')[0] for raw in handle if raw != '\n']

def findWordIndex(words, vocabs):
    """Map each word of an e-mail to its index in the vocabulary.

    Parameters:
        words: iterable of words (list or 1-D array).
        vocabs: sequence of vocabulary words (list or 1-D array).

    Returns:
        list of int: for every word that occurs in vocabs, the index of
        its first occurrence there, in the order of words. Words that are
        not in the vocabulary are dropped.
    """
    # Build the word -> index table once, so each lookup is O(1) instead
    # of scanning the whole vocabulary. setdefault keeps the first index
    # when a word is listed more than once.
    first_index = {}
    for position, vocab_word in enumerate(vocabs):
        first_index.setdefault(vocab_word, position)
    return [first_index[word] for word in words if word in first_index]

def emailFeatures(word_indices, vocabs):
    """Build a binary bag-of-words row vector.

    Parameters:
        word_indices: iterable of vocabulary indices present in the e-mail.
        vocabs: array whose first dimension is the vocabulary size.

    Returns:
        ndarray of shape (1, n) holding 1 at every listed index and 0
        elsewhere, where n = vocabs.shape[0].
    """
    vocab_size = vocabs.shape[0]
    features = np.zeros((1, vocab_size))
    for position in word_indices:
        features[0, position] = 1
    return features

def EmailToFeatures(filename):
    """Turn a pre-processed e-mail file into a (1, n) binary feature vector.

    The e-mail tokens are read with processEmail, the vocabulary with
    processVocab, and the tokens found in the vocabulary are marked with 1
    in the returned row vector.
    """
    email_tokens = np.array(processEmail(filename))
    vocabulary = np.array(processVocab())
    matched = findWordIndex(email_tokens, vocabulary)
    return emailFeatures(matched, vocabulary)

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章