SVM 分類
ex6.py
import numpy as np
from scipy.io import loadmat
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from ex6modules import *
# Example 1: load the first dataset and plot it
data = loadmat('ex6data1.mat')
X, y = data['X'], data['y'].ravel()
plotData(X, y)
plt.show()

# Fit a linear SVM and draw its decision boundary, first with C=1 and then
# with C=100, so the effect of the penalty parameter can be compared.
for penalty in (1, 100):
    clf = SVC(C=penalty, kernel='linear')
    clf.fit(X, y)
    visualizeBoundaryLinear(X, y, clf)

# Examine the Gaussian kernel on a fixed pair of vectors
sim = gaussianKernel(np.array([1, 2, 1]), np.array([0, 4, -1]), sigma=2)
print("similarity :", sim)
# Example 2: load the second dataset and plot it
data = loadmat('ex6data2.mat')
X, y = data['X'], data['y'].ravel()
plotData(X, y)
plt.show()

# RBF-kernel SVM; sklearn's gamma relates to the Gaussian width sigma by
# gamma = 1 / (2 * sigma**2).
clf = SVC(C=1, kernel='rbf', gamma=50)
clf.fit(X, y)
visualizeBoundary(X, y, clf)
# Example 3: choose C and gamma using the validation split stored in the file
data = loadmat('ex6data3.mat')
X = data['X']
y = data['y'].ravel()
Xval = data['Xval']
# Flatten the validation labels as well, so they have the same 1-D layout as
# the training labels passed to the classifier.
yval = data['yval'].ravel()
plotData(X, y)
plt.show()

# Candidate penalty values, and candidate kernel widths sigma converted to
# sklearn's parameterisation gamma = 1 / (2 * sigma**2).
Csteps = np.array([.01, .03, .1, .3, 1, 3, 10, 30])
sigmasteps = np.array([.01, .03, .1, .3, 1, 3, 10, 30])
gammasteps = 1 / (2 * sigmasteps ** 2)

# Pick the pair with the lowest validation error, then refit and plot.
Cmin, gammamin = findBest(Csteps, gammasteps, X, y, Xval, yval)
clf = SVC(C=Cmin, kernel='rbf', gamma=gammamin)
clf.fit(X, y)
visualizeBoundary(X, y, clf)
ex6modules.py
import matplotlib.pyplot as plt
import numpy as np
from sklearn.svm import SVC
def plotData(X, y):
    """Scatter the first two columns of X, styled by the label in y.

    Rows where ``y == 1`` are drawn as black '+' markers and rows where
    ``y == 0`` as yellow circles with a black edge. The points are added to
    the current matplotlib axes; the figure is not shown here.
    """
    styles = (
        (1, dict(c='k', marker='+')),
        (0, dict(c='y', marker='o', edgecolors='k')),
    )
    for label, style in styles:
        members = np.where(y == label)
        plt.scatter(X[members, 0], X[members, 1], **style)
def visualizeBoundary(X, y, clf):
    """Plot the data and the decision boundary of a fitted classifier.

    A 100 x 100 grid spanning the range of the first two columns of X is
    classified with ``clf.predict`` and the 0.5 level of the predictions is
    drawn as a blue contour on top of the scatter plot. The figure is shown
    before returning.

    Args:
        X: 2-D array; columns 0 and 1 are the plotted features.
        y: labels passed through to ``plotData``.
        clf: fitted classifier exposing ``predict`` on two-column input.
    """
    plotData(X, y)
    x1plot = np.linspace(np.min(X[:, 0]), np.max(X[:, 0]), 100)
    x2plot = np.linspace(np.min(X[:, 1]), np.max(X[:, 1]), 100)
    X1, X2 = np.meshgrid(x1plot, x2plot)
    # Classify every grid point in a single predict call instead of one call
    # per grid column; ravel/reshape keep the same (row, column) layout.
    grid = np.column_stack((X1.ravel(), X2.ravel()))
    vals = clf.predict(grid).reshape(X1.shape).astype(float)
    plt.contour(X1, X2, vals, [0.5], colors='blue')
    plt.show()
def visualizeBoundaryLinear(X, y, clf):
    """Plot the data and the straight-line boundary of a linear classifier.

    The line ``w[0] * x1 + w[1] * x2 + b = 0`` is drawn over the range of the
    first feature, using the first row of ``clf.coef_`` and the first entry
    of ``clf.intercept_``. The figure is shown before returning.

    Args:
        X: 2-D array; columns 0 and 1 are the plotted features.
        y: labels passed through to ``plotData``.
        clf: fitted linear model exposing ``coef_`` and ``intercept_``.
    """
    w = clf.coef_[0]
    b = clf.intercept_[0]
    # The horizontal axis is feature 0, so both ends of the range must come
    # from column 0 (the upper bound previously used column 1).
    xp = np.linspace(np.min(X[:, 0]), np.max(X[:, 0]), 100)
    # Solve w[0]*x1 + w[1]*x2 + b = 0 for x2. A vertical boundary
    # (w[1] == 0) is not handled.
    yp = -(w[0] * xp + b) / w[1]
    plotData(X, y)
    plt.plot(xp, yp, c='b', linewidth=0.5)
    plt.show()
def gaussianKernel(x1, x2, sigma):
    """Return the Gaussian (RBF) similarity between two vectors.

    Computes ``exp(-||x1 - x2||**2 / (2 * sigma**2))``.

    Args:
        x1: array-like of numbers.
        x2: array-like of numbers, broadcast-compatible with ``x1``.
        sigma: kernel width; must be non-zero.

    Returns:
        The similarity as a NumPy float; 1.0 when the inputs are identical.
    """
    # Work in floating point so that unsigned or narrow integer inputs
    # cannot wrap around in the subtraction or the square, and so that plain
    # Python sequences are accepted.
    diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
    return np.exp(-np.sum(np.square(diff)) / (2 * sigma ** 2))
def findBest(Csteps, gammasteps, X, y, Xval, yval):
    """Grid-search C and gamma for an RBF SVC using a validation set.

    For every (C, gamma) pair an ``SVC`` is fitted on (X, y) and its error
    ``1 - clf.score(Xval, yval)`` is recorded. The pair with the smallest
    error is returned; on ties the first pair in row-major order wins.

    Args:
        Csteps: 1-D array of candidate C values.
        gammasteps: 1-D array of candidate gamma values.
        X, y: training samples and labels.
        Xval, yval: validation samples and labels; ``yval`` may be a column
            vector and is flattened before scoring.

    Returns:
        Tuple ``(Cmin, gammamin)`` taken from the two candidate arrays.
    """
    yval = np.ravel(yval)
    errors = np.zeros((len(Csteps), len(gammasteps)))
    for i, Cstep in enumerate(Csteps):
        for j, gammastep in enumerate(gammasteps):
            clf = SVC(C=Cstep, kernel='rbf', gamma=gammastep)
            clf.fit(X, y)
            errors[i, j] = 1 - clf.score(Xval, yval)
    # Convert the flat argmin into (row, column) using the real grid shape;
    # dividing by the number of rows is only right for square grids.
    best_i, best_j = np.unravel_index(np.argmin(errors), errors.shape)
    return Csteps[best_i], gammasteps[best_j]
這裏要注意的是:sklearn.svm.SVC函數的參數gamma=1/(2*sigma**2) 。
C=1時,相當於正則化參數lambda較大,可以緩解overfitting;C=100時,正則化較弱,存在overfitting問題。
利用高斯核來分離非線性數據。通過設置不同的C和sigma,找到最優的分離器。
Spam Classifier 垃圾郵件分類
作業裏面原理很簡單的一個簡化版spam classifier,樸素貝葉斯+SVM,高級的還是得rnn。
由於郵件預處理需要調用課件裏面的porterStemmer處理派生詞的函數,重寫這個函數有點難度,所以就直接把處理後的郵件輸出到一個txt裏面,再用python讀。
porterStemmer.m
% Preprocess one email and write its stemmed tokens, one per line, to a
% text file for the Python side to read.
email_contents = readFile('spamSample2.txt');

% Load Vocabulary (vocabList is not referenced again in this script)
vocabList = getVocabList();

% Process Email: lower-case, then normalise HTML tags, numbers, URLs,
% email addresses and dollar signs to fixed placeholder tokens.
email_contents = lower(email_contents);
email_contents = regexprep(email_contents, '<[^<>]+>', ' ');
email_contents = regexprep(email_contents, '[0-9]+', 'number');
email_contents = regexprep(email_contents, ...
                           '(http|https)://[^\s]*', 'httpaddr');
email_contents = regexprep(email_contents, '[^\s]+@[^\s]+', 'emailaddr');
email_contents = regexprep(email_contents, '[$]+', 'dollar');

fprintf('\n==== Processed Email ====\n\n');

% Create output cell. Start empty and grow it per token: preallocating one
% cell per input character left that many empty entries at the front, and
% each of them was written out as a blank line.
ProcessedEmail = {};

while ~isempty(email_contents)
    % Tokenize and also get rid of any punctuation
    [str, email_contents] = ...
        strtok(email_contents, ...
               [' @$/#.-:&*+=[]?!(){},''">_<;%' char(10) char(13)]);

    % Remove any non alphanumeric characters
    str = regexprep(str, '[^a-zA-Z0-9]', '');

    % Stem the word
    % (the porterStemmer sometimes has issues, so we use a try catch block)
    try
        str = porterStemmer(strtrim(str));
    catch
        str = '';
        continue;
    end

    % Skip the word if it is too short
    if length(str) < 1
        continue;
    end

    ProcessedEmail{end + 1, 1} = str;
end

% Write processed Email
fid = fopen('PYspamSample2.txt', 'w');
if fid == -1
    error('Could not open PYspamSample2.txt for writing.');
end
for i = 1:numel(ProcessedEmail)
    fprintf(fid, '%s\n', ProcessedEmail{i});
end
% Close the file so the buffered output is flushed and the handle released.
fclose(fid);
ex6spam.py
from ex6spamModules import *
from scipy.io import loadmat
from sklearn.svm import SVC
# Turn each preprocessed sample email into a (1, vocabulary size) feature row
x_emailSample1 = EmailToFeatures('PYemailSample1.txt')
x_emailSample2 = EmailToFeatures('PYemailSample2.txt')
x_spamSample1 = EmailToFeatures('PYspamSample1.txt')
x_spamSample2 = EmailToFeatures('PYspamSample2.txt')

# Load the prepared training and test matrices
dataTrain = loadmat('spamTrain.mat')
dataTest = loadmat('spamTest.mat')
X, y = dataTrain['X'], dataTrain['y'].ravel()
Xtest, ytest = dataTest['Xtest'], dataTest['ytest'].ravel()

# Train a linear SVM and report accuracy on both splits
clf = SVC(C=0.1, kernel='linear')
clf.fit(X, y)
print("Traning Accuracy: ", clf.score(X, y))
print("Test Accuracy: ", clf.score(Xtest, ytest))

# Classify the four sample emails, in the same order as they were loaded
for sample in (x_emailSample1, x_emailSample2, x_spamSample1, x_spamSample2):
    print(clf.predict(sample))
ex6spamModules.py
import re
import numpy as np
#處理單詞表txt裏面的單詞編號
def processVocab(filename='vocab.txt'):
    """Read the vocabulary file and return its words in file order.

    Each non-blank line is expected to be ``<number>\\t<word>``; the word is
    taken from the second tab-separated field.

    Args:
        filename: path of the vocabulary file; defaults to ``'vocab.txt'``.

    Returns:
        List of words, so that list position ``i`` is the word on the
        ``i``-th non-blank line.

    Raises:
        IndexError: if a non-blank line has no tab-separated second field.
    """
    vocab = []
    with open(filename) as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines, e.g. a trailing newline at EOF.
                continue
            # Take the word from the second column instead of deleting every
            # digit from the line, which would also strip digits that belong
            # to the word itself.
            fields = line.rstrip('\n').split('\t')
            vocab.append(fields[1])
    return vocab
#處理由matlab輸出來的郵件
def processEmail(filename):
    """Read a preprocessed email file and return its tokens.

    The file is expected to hold one token per line. Surrounding whitespace
    is removed from every line, and lines that are empty afterwards are
    skipped.

    Args:
        filename: path of the token file.

    Returns:
        List of token strings in file order.
    """
    words = []
    with open(filename) as f:
        for line in f:
            # Strip all surrounding whitespace, not just the newline, so a
            # stray space or carriage return cannot create a token that
            # never matches the vocabulary.
            word = line.strip()
            if word:
                words.append(word)
    return words
#找到郵件對應的單詞表索引
def findWordIndex(words, vocabs):
    """Map each word to its position in the vocabulary.

    Words that are not in the vocabulary are dropped. If a vocabulary entry
    occurs more than once, the position of its first occurrence is used.

    Args:
        words: iterable of word strings.
        vocabs: sequence (list or NumPy array) of vocabulary words.

    Returns:
        List of integer positions, one per recognised word, in the order
        the words were given.
    """
    # Build the word -> first position table once, so each lookup is O(1)
    # rather than a linear scan of the vocabulary per word.
    first_index = {}
    for position, vocab_word in enumerate(vocabs):
        first_index.setdefault(vocab_word, position)
    return [first_index[word] for word in words if word in first_index]
#單詞表出現過的單詞置1
def emailFeatures(word_indices, vocabs):
    """Build a binary bag-of-words row vector for one email.

    Args:
        word_indices: sequence of integer vocabulary positions that occur
            in the email; repeats are allowed and the sequence may be empty.
        vocabs: vocabulary sequence (list or NumPy array); only its length
            is used.

    Returns:
        Float array of shape ``(1, len(vocabs))`` holding 1 at every
        position listed in ``word_indices`` and 0 elsewhere.
    """
    n = len(vocabs)
    x = np.zeros((1, n))
    # The explicit integer dtype keeps an empty index list valid for fancy
    # indexing; one vectorised assignment replaces the per-element loop.
    x[0, np.asarray(word_indices, dtype=int)] = 1
    return x
def EmailToFeatures(filename):
    """Convert a preprocessed email file into its vocabulary feature row.

    Reads the email's tokens with ``processEmail`` and the vocabulary with
    ``processVocab``, looks up the tokens' vocabulary positions and returns
    the result of ``emailFeatures`` for those positions.
    """
    vocabulary = np.array(processVocab())
    tokens = np.array(processEmail(filename))
    return emailFeatures(findWordIndex(tokens, vocabulary), vocabulary)