上一篇博客記錄了樸素貝葉斯,本篇博客爲其實現,代碼如下:
from __future__ import division
class Bayes:
    """Naive Bayes classifier for discrete (categorical) features.

    Probabilities are estimated by counting; a non-zero ``Lambda`` passed to
    :meth:`train` adds that constant to every count (Laplace smoothing).

    Attributes:
        X: Training examples as passed to :meth:`train`.
        y: Training labels as passed to :meth:`train`.
        x_dims: Number of features, taken from the first training example.
        unique_x: Maps feature index -> set of values seen at that index.
        unique_y: Set of distinct class labels.
        x_prob_dict: ``x_prob_dict[y][i][x]`` is the estimate of
            P(X_i = x | Y = y).
        y_prob_dict: ``y_prob_dict[y]`` is the estimate of P(Y = y).
    """

    def __init__(self):
        self.X = None
        self.y = None
        self.x_dims = 0
        self.unique_x = {}
        self.unique_y = set()  # train() stores a set here, so start with one
        self.x_prob_dict = {}  # record the prob of each unique x_i
        self.y_prob_dict = {}  # record the prob of each unique y

    # when Lambda != 0, laplace smooth is used
    def train(self, train_set=None, train_label=None, Lambda=0):
        """Estimate the class priors and per-feature conditional probabilities.

        Args:
            train_set: Non-empty sequence of examples; each example is an
                indexable sequence of feature values.
            train_label: Non-empty list of class labels, one per example.
            Lambda: Constant added to every count; 0 gives the plain
                frequency estimate.

        Raises:
            ValueError: If either input is missing/empty, or their lengths
                differ.
        """
        if not train_set or not train_label:
            raise ValueError('train_set or train_lable not exists')
        if len(train_set) != len(train_label):
            raise ValueError('The dimensions of the two inputs must be consistent')

        self.X = train_set
        self.y = train_label
        self.unique_y = set(self.y)
        num_train = len(self.X)
        self.x_dims = len(self.X[0])

        # Rebuild every table from scratch so nothing from a previous fit
        # lingers after a refit.
        self.unique_x = {}
        self.x_prob_dict = {}
        self.y_prob_dict = {}

        for i in range(self.x_dims):
            self.unique_x[i] = set(row[i] for row in self.X)

        num_classes = len(self.unique_y)
        for y in self.unique_y:
            # Training examples that belong to class y.
            rows_y = [row for row, label in zip(self.X, self.y) if label == y]

            # P(Y = c_k)
            self.y_prob_dict[y] = (
                (len(rows_y) + Lambda) / (num_train + num_classes * Lambda)
            )

            self.x_prob_dict[y] = {}
            for i in range(self.x_dims):
                # The column and the denominator do not depend on the value
                # being counted, so build them once per (class, feature).
                column = [row[i] for row in rows_y]
                denom = len(column) + Lambda * len(self.unique_x[i])
                # P(X_i = x | Y = c_k) for every value x seen at feature i
                self.x_prob_dict[y][i] = dict(
                    (x, (column.count(x) + Lambda) / denom)
                    for x in self.unique_x[i]
                )

    def test(self, test_set):
        """Classify each example in ``test_set``.

        For every example the (label, score) pairs, sorted by descending
        score, are printed; the score is P(Y=y) multiplied by every
        P(X_i = x_i | Y = y).

        Args:
            test_set: Sequence of examples with the same number of features
                as the training data (only the first example is checked).

        Returns:
            The predicted label when ``test_set`` holds exactly one example;
            otherwise a list with one predicted label per example, in input
            order.

        Raises:
            ValueError: If the first example's length differs from the
                trained feature count.
            KeyError: If an example contains a feature value that was not
                seen at that position during training.
            IndexError: If ``test_set`` is empty.
        """
        if len(test_set[0]) != self.x_dims:
            # Single pre-formatted string so this runs on Python 2 and 3.
            print("len(test_set[0]): %s self.x_dims: %s"
                  % (len(test_set[0]), self.x_dims))
            raise ValueError('The dimensions of the inputs must be consistent to the trainer ')

        test_pred = []
        for test_example in test_set:
            # Scores are kept per example; sharing one list across examples
            # would mix their scores together.
            res = []
            for y in self.unique_y:
                prob_num = self.y_prob_dict[y]  # P(Y=c_k)
                for i in range(self.x_dims):
                    # multiply by P(X^(i) = x^(i) | Y = c_k)
                    prob_num *= self.x_prob_dict[y][i][test_example[i]]
                res.append((y, prob_num))
            res.sort(key=lambda pair: pair[1], reverse=True)
            # print the prob and predict result
            print(res)
            test_pred.append(res[0][0])

        # A single example keeps returning a bare label for existing callers.
        if len(test_pred) == 1:
            return test_pred[0]
        return test_pred
使用例4.1進行測試
# Training data of Example 4.1: each sample is [first feature, second feature].
X = [
    [1, 'S'], [1, 'M'], [1, 'M'], [1, 'S'], [1, 'S'],
    [2, 'S'], [2, 'M'], [2, 'M'], [2, 'L'], [2, 'L'],
    [3, 'L'], [3, 'M'], [3, 'M'], [3, 'L'], [3, 'L'],
]
# Class label of each sample, in the same order as X.
y = [
    -1, -1, 1, 1, -1,
    -1, -1, 1, 1, 1,
    1, 1, 1, 1, -1,
]

# Fit without smoothing (Lambda defaults to 0), then classify the sample [2, 'S'].
bayes = Bayes()
bayes.train(X, y)
bayes.test([[2, 'S']])
結果:
[(-1, 0.06666666666666667), (1, 0.02222222222222222)]
-1
最上一行是排序好的(分類,概率),最下一行是預測結果。再使用例4.2測試Laplace平滑的結果,令Lambda=1:
# Refit the same classifier on the same data, this time with Lambda = 1.
bayes.train(X, y, Lambda=1)
結果: