《Python 機器學習算法-logistic regression》

訓練數據以及源文件(Python 2.x)見作者(趙志勇)的 GitHub:

https://github.com/zhaozhiyong19890102/Python-Machine-Learning-Algorithm


以下的文件是修改過的,適用於 python 3.x

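1. 訓練文件 train.py(注意:下方的 test.py 以 `from logistic_regression import sig` 導入 sig,因此本文件需另存為 logistic_regression.py,或將該導入改為 `from train import sig`)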

# coding:UTF-8

import numpy as np

def sig(x):
    """Evaluate the logistic (sigmoid) function ``1 / (1 + exp(-x))``.

    :param x: a number or a NumPy array/matrix; NumPy inputs are processed
        element-wise by ``np.exp``.
    :return: the sigmoid of ``x`` (floating point, same shape as ``x``).
    """
    exp_neg = np.exp(-x)
    # The float numerator keeps the division in floating point.
    return 1.0 / (1 + exp_neg)

def error_rate(h, label):
    """Compute the mean cross-entropy (log) loss of the predictions.

    Despite its name, this returns the average value of the loss function,
    not a misclassification rate.

    :param h: 2-D matrix/array whose first column holds the predicted
        probabilities, one row per sample.
    :param label: 2-D matrix/array whose first column holds the matching
        labels (0 or 1).
    :return: ``-(1/m) * sum(y*log(h) + (1-y)*log(1-h))`` as a float, where
        ``m`` is the number of rows of ``h``; 0.0 when ``h`` has no rows.
    """
    h_col = np.asarray(h, dtype=float)[:, 0]
    m = h_col.shape[0]
    if m == 0:
        return 0.0
    y_col = np.asarray(label, dtype=float)[:m, 0]

    # log() is undefined at exactly 0 or 1, so fully saturated predictions
    # contribute zero loss instead of producing inf/nan.
    valid = (h_col > 0) & (h_col < 1)
    p = h_col[valid]
    y = y_col[valid]
    total = -np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))
    return float(total / m)

def lr_train_bgd(feature, label, maxCycle, alpha):
    """Train logistic-regression weights with batch gradient descent.

    :param feature: (m, n) ``np.matrix`` of samples, one row per sample.
        It must be a matrix (not a plain ndarray) because ``*`` is used as
        the matrix product below.
    :param label: (m, 1) ``np.matrix`` of labels.
    :param maxCycle: number of gradient-descent iterations to run.
    :param alpha: learning rate (step size).
    :return: (n, 1) ``np.matrix`` of learned weights.
    """
    n = np.shape(feature)[1]
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    w = np.asmatrix(np.ones((n, 1)))  # start from all-ones weights

    for i in range(1, maxCycle + 1):  # exactly maxCycle iterations
        h = sig(feature * w)  # (m, n) * (n, 1) -> (m, 1) predictions
        err = label - h       # (m, 1) residuals

        if i % 100 == 0:
            # Progress report only; the update below runs every iteration.
            print("\t--------iter=" + str(i) +
                  ", train error rate= " + str(error_rate(h, label)))

        # Batch update: (n, m) * (m, 1) -> (n, 1), scaled by the step size.
        w = w + alpha * feature.T * err

    return w

def load_data(file_name):
    """Load tab-separated training samples from a text file.

    Every non-blank line holds the feature values followed by the label in
    the last column. A constant 1 is prepended to each sample as the bias
    input x0.

    :param file_name: path of the training data file.
    :return: ``(feature_data, label_data)`` as ``np.matrix`` objects; the
        feature matrix has one row per sample, the label matrix one
        single-element row per sample.
    :raises ValueError: if a field cannot be parsed as a float.
    """
    feature_data = []
    label_data = []
    # "with" guarantees the file is closed even if parsing fails.
    with open(file_name) as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no sample.
                continue
            fields = stripped.split("\t")
            feature_data.append([1] + [float(v) for v in fields[:-1]])
            label_data.append([float(fields[-1])])
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    return np.asmatrix(feature_data), np.asmatrix(label_data)

def save_model(file_name, w):
    """Write the weights to a text file as a single tab-separated line.

    :param file_name: path of the output file (overwritten if it exists).
    :param w: 2-D matrix/array of weights; the first column of every row
        is written.
    """
    # Format first so a bad ``w`` cannot leave a truncated file behind.
    w_array = [str(w[i, 0]) for i in range(np.shape(w)[0])]
    # "with" guarantees the file is closed even if the write fails.
    with open(file_name, "w") as f_w:
        f_w.write("\t".join(w_array))

if __name__ == "__main__":
    # Step 1: read the training samples and their labels.
    print("---------- 1.load data ------------")
    train_x, train_y = load_data("data.txt")

    # Step 2: fit the weights (1000 cycles, learning rate 0.01).
    print("---------- 2.training ------------")
    weights = lr_train_bgd(train_x, train_y, 1000, 0.01)

    # Step 3: persist the learned weights for the test script.
    print("---------- 3.save model ------------")
    save_model("weights", weights)

2. 測試文件 test.py

# coding:UTF-8

import numpy as np
from logistic_regression import sig

def load_weight(w):
    """Load model weights from a tab-separated text file.

    :param w: path of the weights file; every non-blank line becomes one
        row of the result.
    :return: ``np.matrix`` of the parsed floats.
    :raises ValueError: if a field cannot be parsed as a float.
    """
    rows = []
    # "with" guarantees the file is closed even if parsing fails.
    with open(w) as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # A blank/trailing line would otherwise fail in float('').
                continue
            rows.append([float(x) for x in stripped.split("\t")])
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    return np.asmatrix(rows)

def load_data(file_name, n):
    """Load tab-separated test samples from a text file.

    A constant 1 is prepended to each sample as the bias input x0. Lines
    that do not contain exactly ``n - 1`` fields are skipped.

    :param file_name: path of the test data file.
    :param n: number of model weights (bias included); each accepted line
        must therefore hold ``n - 1`` feature values.
    :return: ``np.matrix`` with one row of ``n`` values per accepted line.
    :raises ValueError: if a field cannot be parsed as a float.
    """
    feature_data = []
    # "with" guarantees the file is closed even if parsing fails.
    with open(file_name) as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines carry no sample (and would crash float('')
                # when n == 2).
                continue
            fields = stripped.split("\t")
            if len(fields) != n - 1:
                continue
            feature_data.append([1] + [float(x) for x in fields])
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    return np.asmatrix(feature_data)

def predict(data, w):
    """Classify every sample as 0.0 or 1.0 with the trained weights.

    :param data: sample matrix, one row per sample (bias column included).
    :param w: weight matrix stored as a row, so it is transposed before
        the product.
    :return: the sigmoid output with the first column of each row replaced
        by 0.0 (score below 0.5) or 1.0 (otherwise).
    """
    scores = sig(data * w.T)
    for row in range(np.shape(scores)[0]):
        # Threshold in place at 0.5.
        scores[row, 0] = 0.0 if scores[row, 0] < 0.5 else 1.0
    return scores

def save_result(file_name, result):
    """Write the predictions to a text file as a single tab-separated line.

    :param file_name: path of the output file (overwritten if it exists).
    :param result: 2-D matrix/array of predictions; the first column of
        every row is written.
    """
    tmp = [str(result[i, 0]) for i in range(np.shape(result)[0])]
    # "with" guarantees the file is closed even if the write fails.
    with open(file_name, "w") as f_result:
        f_result.write("\t".join(tmp))

if __name__ == "__main__":
    # Step 1: read the trained weights; their column count fixes the
    # number of values expected per sample.
    print("---------- 1.load model ------------")
    weights = load_weight("weights")
    n_weights = np.shape(weights)[1]

    # Step 2: read the samples to classify.
    print("---------- 2.load data ------------")
    test_features = load_data("test_data", n_weights)

    # Step 3: turn the samples into 0/1 predictions.
    print("---------- 3.get prediction ------------")
    predictions = predict(test_features, weights)

    # Step 4: write the predictions to disk.
    print("---------- 4.save prediction ------------")
    save_result("result2", predictions)
    











發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章