Naive Bayesian classifier

__author__ = 'HM'

# Load the whitespace-separated training table from data.txt.
# The first line is the header: every column except the last names an
# attribute, and the last column names the class.
with open('data.txt', 'r') as f:  # closes the file even if parsing fails
    first_line = f.readline().split()
    attributes = first_line[:-1]
    attr_len = len(attributes)
    classname = first_line[-1]
    # One dict per record: attribute name -> value, plus a 'class_label' key.
    data_set_raw = []
    # Every distinct class label seen in the file.
    class_label_pool = set()
    # attribute name -> set of the distinct values seen for that attribute.
    attribute_discrete_pool = {}
    # The header was line 1, so the data rows start at line 2.
    for line_number, line in enumerate(f, 2):
        raw_data = line.split()
        if not raw_data:
            # Blank lines (e.g. a trailing newline) carry no record.
            continue
        if len(raw_data) <= attr_len:
            # Without this check a row missing its class label would have its
            # last attribute silently used as the label.
            raise ValueError(
                'data.txt line %d: expected %d fields, got %d'
                % (line_number, attr_len + 1, len(raw_data)))
        class_label = raw_data[-1]
        new_record = {'class_label': class_label}
        class_label_pool.add(class_label)
        # zip stops after the attribute columns, so the label is not included.
        for attribute_name, value in zip(attributes, raw_data):
            new_record[attribute_name] = value
            attribute_discrete_pool.setdefault(attribute_name, set()).add(value)
        data_set_raw.append(new_record)


# Echo the parsed records for inspection.  The single-argument parenthesised
# form prints the same text under Python 2 and Python 3.
for d in data_set_raw:
    print(d)
def train_classifier(data, class_labels=None, attribute_names=None,
                     attribute_values=None):
    """Count the frequencies a naive Bayes classifier needs.

    Args:
        data: iterable of records; each record is a dict holding the class
            under the key ``'class_label'`` and one entry per attribute name.
        class_labels: iterable of every class label to count.  Defaults to
            the module-level ``class_label_pool``.
        attribute_names: iterable of attribute names to count.  Defaults to
            the module-level ``attributes``.
        attribute_values: dict mapping attribute name -> iterable of the
            values that attribute may take.  Defaults to the module-level
            ``attribute_discrete_pool``.

    Returns:
        A tuple ``(attr_value_count, class_value_count)`` where
        ``attr_value_count[c][a][v]`` is the number of records of class ``c``
        whose attribute ``a`` equals ``v`` and ``class_value_count[c]`` is
        the number of records of class ``c``.

    Raises:
        KeyError: if a record has a class label or attribute value that was
            not listed in ``class_labels`` / ``attribute_values``.
    """
    # Fall back to the tables built while loading the data file, so existing
    # one-argument calls keep working.
    if class_labels is None:
        class_labels = class_label_pool
    if attribute_names is None:
        attribute_names = attributes
    if attribute_values is None:
        attribute_values = attribute_discrete_pool

    # Start every (class, attribute, value) counter at zero.  No Laplacian
    # correction (+1) is applied, so combinations never seen stay at 0.
    attr_value_count = {}  # e.g. {'yes': {'credit_rating': {'fair': 10, 'excellent': 30}}}
    for c in class_labels:
        attr_value_count[c] = {}
        for a in attribute_names:
            attr_value_count[c][a] = {}
            for attr_value in attribute_values[a]:
                attr_value_count[c][a][attr_value] = 0

    # Debug output of the zero-initialised table (kept from the original).
    print(attr_value_count)

    class_value_count = {}  # e.g. {'yes': 10, 'no': 20}
    for c in class_labels:
        class_value_count[c] = 0

    for d in data:
        label = d['class_label']
        for a in attribute_names:
            attr_value_count[label][a][d[a]] += 1
        class_value_count[label] += 1

    return attr_value_count, class_value_count

def predict(data, dataset_len, attr_value_count, class_value_count):
    """Score every class for one record with naive Bayes and return the scores.

    For each class ``c`` the score is ``P(c) * product of P(x_i | c)``, with
    all probabilities taken as relative frequencies from the count tables.
    No Laplacian smoothing is applied, so a single zero count makes the
    whole score for that class zero.

    Args:
        data: dict mapping attribute name -> value for the record to classify.
        dataset_len: number of training records the counts were built from.
        attr_value_count: ``{class: {attribute: {value: count}}}``.
        class_value_count: ``{class: count}``; its keys are the classes
            that get scored.

    Returns:
        dict mapping class label -> unnormalised posterior score.  The most
        likely class is ``max(result, key=result.get)``.

    Raises:
        ZeroDivisionError: if ``dataset_len`` is 0.
        KeyError: if ``data`` names an attribute absent from
            ``attr_value_count``.
    """
    print(attr_value_count)
    print(class_value_count)
    p_c_x_table = {}
    # Iterate the classes of the supplied table rather than a module global,
    # so the function only depends on its arguments.
    for c in class_value_count:
        class_count = class_value_count[c]
        p_c = class_count / float(dataset_len)
        print('pc %s' % (p_c,))
        if class_count == 0:
            # A class with no training records has prior 0; skip the
            # likelihood terms, which would otherwise divide by zero.
            p_c_x_table[c] = 0.0
            continue
        p_x_c = 1
        for key in data:
            # A value never seen during training counts as 0 occurrences.
            value_count = attr_value_count[c][key].get(data[key], 0)
            p_x_c *= value_count / float(class_count)
            print('p_x_c %s %s %s' % (p_x_c, data[key], value_count))
        p_c_x_table[c] = p_x_c * p_c
    print(p_c_x_table)
    return p_c_x_table

# Example query: the record whose class should be estimated.
d = {
    'age': '<=30',
    'income': 'medium',
    'student': 'yes',
    'credit_rating': 'fair',
}

# Train on the loaded records, then score the query against the counts.
attr_counts, class_counts = train_classifier(data_set_raw)
predict(d, len(data_set_raw), attr_counts, class_counts)

Dataset (contents of data.txt; the first line is the header, and the last column is the class label):

age income student credit_rating buys_computer
<=30 high no fair no
<=30 high no excellent no
31…40 high no fair yes
>40 medium no fair yes
>40 low yes fair yes
>40 low yes excellent no
31…40 low yes excellent yes
<=30 medium no fair no
<=30 low yes fair yes
>40 medium yes fair yes
<=30 medium yes excellent yes
31…40 medium no excellent yes
31…40 high yes fair yes
>40 medium no excellent no

備註：表示數據的方法有點麻煩（各種字典套字典……），之後要找個方法優化。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章