本文使用的數據是 UCI 機器學習資料庫中的 Breast Cancer(乳腺癌)數據集。
Breast+Cancer數據
# -coding: utf-8
import numpy as np
import random
# Value tables for the categorical attributes. A value's position in its
# table is the numeric code used for it further down the script.

# Age decades '10-19' ... '90-99'.
ages = ['%d-%d' % (low, low + 9) for low in range(10, 100, 10)]
menos = ['lt40', 'ge40', 'premeno']
# Five-wide intervals '0-4' ... '55-59'.
tumos = ['%d-%d' % (low, low + 4) for low in range(0, 60, 5)]
# Three-wide intervals '0-2' ... '33-35'; the final bucket '36-39' is wider.
invs = ['%d-%d' % (low, low + 2) for low in range(0, 36, 3)] + ['36-39']
caps = ['yes', 'no']
degs = ['1', '2', '3']
bres = ['left', 'right']
quads = ['left_up', 'left_low', 'right_up', 'right_low', 'central']
irrs = ['yes', 'no']
# Read the data file into `ala`: one list of string fields per record.
# The `with` block closes the file even if reading fails part-way.
ala = []
with open(r"C:\Users\65465\Documents\data\breast-cancer.txt") as f:
    for line in f:
        # Each whitespace-separated token on a line is one record; blank
        # lines yield no tokens and are skipped.
        for record in line.split():
            # The fields of a record are comma-separated.
            ala.append(record.split(","))
# Convert every record in `ala` to numeric codes in place and, for each
# attribute column, accumulate the sum of the codes that were assigned.

# Class label codes; an unrecognised label is left as it is.
label_codes = {'no-recurrence-events': 0, 'recurrence-events': 1}
# Value tables in column order: the k-th table belongs to record column k + 1.
value_tables = (ages, menos, tumos, invs, caps, degs, bres, quads, irrs)
code_totals = [0] * len(value_tables)

for ele in ala:
    ele[0] = label_codes.get(ele[0], ele[0])
    for column, table in enumerate(value_tables, start=1):
        raw = ele[column]
        # Values that are not in the table (for example '?') stay unchanged
        # and do not contribute to the column total.
        if raw in table:
            code = table.index(raw)
            ele[column] = code
            code_totals[column - 1] += code

# Publish the totals under the names the rest of the script reads.
suma, summ, sumt, sumi, sumc, sumd, sumb, sumq, sumir = code_totals
# Replace missing values ('?') with the mean code of their column.
# The mean is taken over the records that actually have a value: encoded
# values are ints, so only those are counted. Dividing by len(ala) would
# also count the missing records and bias the mean towards zero.
column_totals = (suma, summ, sumt, sumi, sumc, sumd, sumb, sumq, sumir)
for column, total in enumerate(column_totals, start=1):
    known = sum(1 for ele in ala if isinstance(ele[column], int))
    # With no known value the total is 0, so fall back to 0.0 instead of
    # dividing by zero.
    mean = total / known if known else 0.0
    for ele in ala:
        if ele[column] == '?':
            ele[column] = mean
# Randomly draw 95 records as the training set.
alasam = random.sample(ala, 95)

# Column 0 is the class label and columns 1..9 are the nine attributes.
# The upper bound is 10 so that column 9 (the `irrs` attribute) is included;
# range(1, 9) stopped at column 8 and silently dropped it.
feature_columns = range(1, 10)

# NOTE(review): the evaluation set is the whole data set, so it also
# contains the 95 training records.
test_labell = [ele[0] for ele in ala]
test_attrl = [[ele[i] for i in feature_columns] for ele in ala]

data_labell = [ele[0] for ele in alasam]
data_attrl = [[ele[i] for i in feature_columns] for ele in alasam]
# Convert the nested lists to matrices. np.asmatrix is used because the
# np.mat alias no longer exists in NumPy 2.0; both name the same function
# on older versions.
test_attr = np.asmatrix(test_attrl)
test_label = np.asmatrix(test_labell).transpose()
data_attr = np.asmatrix(data_attrl)
data_label = np.asmatrix(data_labell).transpose()
# Initial weights: one per attribute plus one for the constant term.
w = np.ones((len(data_attrl[0]) + 1, 1))
# Append a column of ones to the training attributes so that the last
# weight acts as the constant term.
a = np.ones((len(data_attrl), 1))
data_attr = np.c_[data_attr, a]
# Step size of the gradient-descent update.
n = 0.0001
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), applied element-wise."""
    denominator = 1 + np.exp(-z)
    return 1.0 / denominator
def test(dataset, labelset, w):
    """Return the fraction of samples in ``dataset`` that ``w`` predicts correctly.

    Args:
        dataset: Two-dimensional attribute data, one row per sample, without
            the constant column (it is appended here).
        labelset: Labels indexable by row number; ``labelset[i]`` is compared
            with the predicted 0/1 label of row ``i``.
        w: Weight column vector; its last entry multiplies the appended
            column of ones.

    Returns:
        The number of correctly predicted rows divided by ``len(dataset)``.

    Raises:
        ValueError: If ``dataset`` has no rows.
    """
    sample_count = len(dataset)
    if sample_count == 0:
        raise ValueError('dataset must contain at least one sample')
    # np.asmatrix instead of np.mat: the alias was removed in NumPy 2.0.
    data = np.c_[np.asmatrix(dataset), np.ones((sample_count, 1))]
    # Predicted probability of the positive class for every row.
    y = sigmoid(np.dot(data, w))
    rightcount = 0
    for i in range(sample_count):
        # Strictly above 0.5 is positive; 0.5 or below is negative.
        flag = 1 if y[i, 0] > 0.5 else 0
        if labelset[i] == flag:
            rightcount += 1
    return rightcount / sample_count
rightrate = 0
dest = input('Please input the value of final right rate: ')
# Parse the target once instead of on every loop test.
target_rate = float(dest)
# The accuracy returned by test() never exceeds 1, so a larger target
# (for example "90" meant as a percentage) would loop forever.
if target_rate > 1:
    raise ValueError('final right rate must not exceed 1, got %r' % dest)
# NOTE(review): a target within range can still be unreachable for this
# data and step size, in which case the loop does not terminate.
while rightrate < target_rate:
    # Predictions under the current weights.
    c = sigmoid(np.dot(data_attr.astype(float), w))
    # Gradient-descent step: residual, gradient, then weight update.
    b = c - data_label
    change = np.dot(np.transpose(data_attr), b)
    w = w - change * n
    # Re-evaluate the accuracy with the updated weights.
    rightrate = test(test_attr, test_label, w)
# Final evaluation with the trained weights. The prediction covers all rows
# at once, so it is computed a single time rather than once per sample.
# np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
data = np.c_[np.asmatrix(test_attr), np.ones((len(test_attr), 1))]
y = sigmoid(np.dot(data, w))
b, c = np.shape(y)
right = 0
for i in range(b):
    if y[i, 0] > 0.5:  # above 0.5: predicted positive
        if test_labell[i] == 1:
            right += 1
    elif y[i, 0] < 0.5:  # below 0.5: predicted negative
        if test_labell[i] == 0:
            right += 1
    # A value of exactly 0.5 is counted as neither, so never as correct.
right_rate = right / b
# Report the rate computed above rather than the value left over from the
# training loop.
print('Rightrate is ',right_rate)
程序輸出的結果是模型的預測正確率;也可以改爲逐條輸出每個樣本的預測是正確還是錯誤。
參考資料:
機器學習 對數機率迴歸模型(Python實現)