Notes on Statistical Learning Methods, Chapter 4: A Python Implementation of Naive Bayes

The previous post covered the Naive Bayes method; this post records an implementation. The code is as follows:


from __future__ import division, print_function  # true division and print() under Python 2
class Bayes:
    def __init__(self):
        self.X = None
        self.y = None
        self.x_dims = 0
        self.unique_x = {}      # distinct values of each feature dimension
        self.unique_y = {}      # distinct class labels
        self.x_prob_dict = {}   # P(X^(i) = x | Y = c_k) for each class, feature, and value
        self.y_prob_dict = {}   # P(Y = c_k) for each class

    # when Lambda != 0, Laplace smoothing is applied
    def train(self, train_set = None, train_label = None, Lambda = 0):
        if not train_set or not train_label:
            raise ValueError('train_set or train_label does not exist')
        if len(train_set) != len(train_label):
            raise ValueError('The dimensions of the two inputs must be consistent')
        self.X = train_set
        self.y = train_label
        self.unique_y = set(self.y)    # distinct class labels c_k
        num_train = len(self.X)        # number of training examples
        self.x_dims = len(self.X[0])   # dimension of each training example
        for i in range(self.x_dims):
            xi_list = [x[i] for x in self.X]
            self.unique_x[i] = set(xi_list)   # distinct values of feature i
        for y in self.unique_y:
            # class prior P(Y = c_k), with optional Laplace smoothing
            prob = (self.y.count(y) + Lambda) / (num_train + len(self.unique_y) * Lambda)
            self.y_prob_dict[y] = prob
            self.x_prob_dict[y] = {}
            x_y = [self.X[i] for i in range(len(self.X)) if self.y[i] == y]  # subset of X where y[i] == y
            for i in range(self.x_dims):
                self.x_prob_dict[y][i] = {}
                for x in self.unique_x[i]:
                    x_list = [x_[i] for x_ in x_y]
                    # conditional probability P(X^(i) = x | Y = c_k), with optional Laplace smoothing
                    prob = (x_list.count(x) + Lambda) / (len(x_list) + Lambda * len(self.unique_x[i]))
                    self.x_prob_dict[y][i][x] = prob
                    



    def test(self, test_set):
        if len(test_set[0]) != self.x_dims:
            print("len(test_set[0]):", len(test_set[0]), "self.x_dims:", self.x_dims)
            raise ValueError('The dimensions of the inputs must be consistent with the trainer')
        for test_example in test_set:
            res = []   # (class, posterior) pairs for the current example
            for y in self.unique_y:
                prob_num = self.y_prob_dict[y]   # P(Y = c_k)
                for i in range(self.x_dims):     # i: index into the feature vector
                    # multiply by each conditional probability P(X^(i) = x^(i) | Y = c_k)
                    prob_num *= self.x_prob_dict[y][i][test_example[i]]
                res.append((y, prob_num))
            res.sort(key = lambda x: x[1], reverse = True)
            # print the sorted (class, probability) pairs
            print(res)
        return res[0][0]   # predicted class (of the last test example)
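
For reference, the quantities computed in train() are the smoothed (Bayes) estimates used in the book; λ = 0 recovers the maximum-likelihood estimates and λ = 1 gives Laplace smoothing:

$$P_\lambda(Y = c_k) = \frac{\sum_{i=1}^{N} I(y_i = c_k) + \lambda}{N + K\lambda}$$

$$P_\lambda(X^{(j)} = a_{jl} \mid Y = c_k) = \frac{\sum_{i=1}^{N} I(x_i^{(j)} = a_{jl},\ y_i = c_k) + \lambda}{\sum_{i=1}^{N} I(y_i = c_k) + S_j \lambda}$$

where N is the number of training examples, K the number of classes, and S_j the number of distinct values of feature j.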

Testing with Example 4.1 from the book:


X = [[1,'S'], [1,'M'], [1,'M'], [1,'S'], [1,'S'], [2,'S'], [2,'M'], [2,'M'], [2,'L'], [2,'L'], [3,'L'], [3,'M'], [3,'M'], [3,'L'], [3,'L']]
y = [-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1]
bayes = Bayes()
bayes.train(X,y)
bayes.test([[2,'S']])

Result:



[(-1, 0.06666666666666667), (1, 0.02222222222222222)]
-1
The first line is the sorted list of (class, probability) pairs; the last line is the predicted class.
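
As a quick sanity check (a hand computation added here, not part of the original post), the two posteriors can be reproduced directly from the counts in the training data (6 examples with y = -1, 9 with y = 1, N = 15):

from __future__ import division          # true division under Python 2
# P(Y=-1) * P(X1=2 | Y=-1) * P(X2='S' | Y=-1)
print(6/15 * 2/6 * 3/6)   # 0.0666...  -> class -1 wins
# P(Y=1) * P(X1=2 | Y=1) * P(X2='S' | Y=1)
print(9/15 * 3/9 * 1/9)   # 0.0222...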

Next, use Example 4.2 to test the result with Laplace smoothing, setting lambda = 1:


bayes.train(X, y, Lambda = 1)
bayes.test([[2,'S']])


Result:

[(-1, 0.06100217864923746), (1, 0.0326797385620915)]
-1
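
Again as a hand check (not part of the original post), the smoothed posteriors follow from the same counts with λ = 1 (K = 2 classes, S_j = 3 distinct values per feature):

from __future__ import division          # true division under Python 2
# P(Y=-1) * P(X1=2 | Y=-1) * P(X2='S' | Y=-1), each smoothed with Lambda = 1
print((6+1)/(15+2) * (2+1)/(6+3) * (3+1)/(6+3))   # 0.0610...  -> class -1 wins
# P(Y=1) * P(X1=2 | Y=1) * P(X2='S' | Y=1), each smoothed with Lambda = 1
print((9+1)/(15+2) * (3+1)/(9+3) * (1+1)/(9+3))   # 0.0326...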
