【python】實現logistic regression

數據使用的是 UCI 機器學習資料庫中的 Breast Cancer 數據集。
Breast Cancer 數據集

# -*- coding: utf-8 -*-
import numpy as np
import random

# Ordered category values for each attribute of the breast-cancer data.
# A value's position in its list is the integer code it is encoded to.
ages = ['%d-%d' % (low, low + 9) for low in range(10, 100, 10)]   # '10-19' ... '90-99'
menos = ['lt40', 'ge40', 'premeno']
tumos = ['%d-%d' % (low, low + 4) for low in range(0, 60, 5)]     # '0-4' ... '55-59'
# Three-wide bins up to '33-35'; the final bin '36-39' is four wide.
invs = ['%d-%d' % (low, low + 2) for low in range(0, 36, 3)] + ['36-39']
caps = ['yes', 'no']
degs = [str(grade) for grade in (1, 2, 3)]
bres = ['left', 'right']
quads = ['left_up', 'left_low', 'right_up', 'right_low', 'central']
irrs = ['yes', 'no']

# Read the raw data file and split every record into its comma-separated fields.
# Each whitespace-separated token in the file is treated as one record, so blank
# lines contribute nothing. The resulting rows (lists of strings) go in `ala`.
ala = []
# `with` guarantees the file is closed even if reading fails part-way.
with open(r"C:\Users\65465\Documents\data\breast-cancer.txt") as f:
    for line in f:
        for record in line.split():
            ala.append(record.split(","))

# Convert every field to a number: the class label becomes 0/1 and each
# categorical attribute becomes its position in the matching value list.
# The per-attribute totals of the assigned codes end up in suma ... sumir.
label_codes = {'no-recurrence-events': 0, 'recurrence-events': 1}
value_tables = [ages, menos, tumos, invs, caps, degs, bres, quads, irrs]
code_totals = [0] * len(value_tables)

for ele in ala:
    # Unrecognised labels are left exactly as they were read.
    ele[0] = label_codes.get(ele[0], ele[0])
    for column, table in enumerate(value_tables, start=1):
        # '?' and any value not listed in the table stay untouched.
        if ele[column] in table:
            code = table.index(ele[column])
            ele[column] = code
            code_totals[column - 1] += code

suma, summ, sumt, sumi, sumc, sumd, sumb, sumq, sumir = code_totals

# Handle missing data: replace every '?' in attribute columns 1..9 with the
# mean of the codes that were actually observed in that column.
# The mean is taken over the observed values only; dividing by the total
# number of records would count the missing rows and bias the mean low.
for col in range(1, 10):
    observed = [row[col] for row in ala if isinstance(row[col], (int, float))]
    if not observed:
        continue    # nothing to average, leave the column as it is
    col_mean = sum(observed) / len(observed)
    for row in ala:
        if row[col] == '?':
            row[col] = col_mean

# Randomly draw 95 records as the training set.
alasam = random.sample(ala, 95)

# Evaluation set: every record that was read.
# NOTE(review): this includes the 95 training records, so the accuracy measured
# on it is not a held-out estimate.
# Field 0 is the label and fields 1..9 are the nine attributes. The slice end
# of 10 keeps the last attribute (the one encoded with `irrs`); a range(1, 9)
# would silently drop it.
test_labell = [ele[0] for ele in ala]
test_attrl = [ele[1:10] for ele in ala]

data_labell = [ele[0] for ele in alasam]
data_attrl = [ele[1:10] for ele in alasam]

# Turn the Python lists into NumPy matrices; labels become column vectors.
# np.asmatrix is used because the np.mat alias was removed in NumPy 2.0.
test_attr = np.asmatrix(test_attrl)
test_label = np.asmatrix(test_labell).transpose()
data_attr = np.asmatrix(data_attrl)
data_label = np.asmatrix(data_labell).transpose()


# Initial weights: one per attribute plus one for the constant term, all 1.
w = np.ones((len(data_attrl[0])+1, 1))

# Append a column of ones to the training attributes so that the last
# component of w acts as the constant (bias) term.
a = np.ones((len(data_attrl), 1))
data_attr = np.c_[data_attr, a]

# Step size used by the gradient-descent update.
n = 0.0001

def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), element-wise.

    `z` may be a scalar or a NumPy array/matrix; the result has the same
    shape. The value is computed as exp(-logaddexp(0, -z)), which is
    algebraically the same expression but never evaluates exp() of a large
    positive number, so very negative inputs give 0.0 without an overflow.
    """
    # log(1 + exp(-z)) == logaddexp(0, -z), evaluated without overflow.
    return np.exp(-np.logaddexp(0, -z))

def test(dataset, labelset, w):
    """Return the fraction of samples in `dataset` that `w` classifies correctly.

    Parameters:
        dataset: attribute rows without the constant column, one row per
            sample (anything np.asmatrix accepts).
        labelset: indexable labels; labelset[i] is compared with the
            predicted 0/1 label of row i.
        w: weight column vector whose last entry is the constant term.

    Returns:
        Number of correct predictions divided by len(dataset).
    """
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    features = np.asmatrix(dataset)
    # Append the all-ones column that pairs with the constant term in w.
    features = np.c_[features, np.ones((len(dataset), 1))]

    # Predicted probability of the positive class for every sample.
    probabilities = sigmoid(np.dot(features, w))
    sample_count = np.shape(probabilities)[0]

    correct = 0
    for i in range(sample_count):
        # Above 0.5 is a positive prediction; 0.5 or below is negative.
        predicted = 1 if probabilities[i, 0] > 0.5 else 0
        if labelset[i] == predicted:
            correct += 1
    return correct / len(dataset)

# Train by batch gradient descent until the accuracy reported by test()
# reaches the target typed in by the user.
rightrate = 0
dest = input('Please input the value of final right rate: ')
target_rate = float(dest)             # parsed once instead of on every iteration
train_x = data_attr.astype(float)     # loop-invariant cast, hoisted out of the loop
# Safety cap: a target the model cannot reach would otherwise loop forever.
max_iterations = 100000
iterations = 0
while rightrate < target_rate and iterations < max_iterations:
    # Predictions under the current weights.
    c = sigmoid(np.dot(train_x, w))

    # Gradient step: X^T (prediction - label), scaled by the step size n.
    b = c - data_label
    change = np.dot(np.transpose(train_x), b)
    w = w - change * n

    # Re-evaluate and update the accuracy that drives the stopping test.
    rightrate = test(test_attr, test_label, w)
    iterations += 1

if rightrate < target_rate:
    print('Stopped after', iterations, 'iterations without reaching', target_rate)


# Final evaluation of the trained weights.
# The predictions are the same for the whole set, so they are computed once
# rather than once per row. np.asmatrix replaces the np.mat alias that was
# removed in NumPy 2.0.
data = np.asmatrix(test_attr)
a = np.ones((len(test_attr), 1))
data = np.c_[data, a]

# Predicted probability of the positive class for every sample.
y = sigmoid(np.dot(data, w))
b, c = np.shape(y)

right = 0
for i in range(b):
    # Same decision rule as test(): above 0.5 is positive, otherwise negative.
    predicted = 1 if y[i, 0] > 0.5 else 0
    if test_labell[i] == predicted:
        right += 1
right_rate = right / b
# Report the rate computed here, not the stale value left by the training loop.
print('Rightrate is ', right_rate)

輸出結果爲正確率。也可以改爲逐個輸出每個樣例的預測是正確還是錯誤。

參考資料:
機器學習 對數機率迴歸模型(Python實現)

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章