3.3 西瓜書聚集Logistic迴歸分類
用TensorFlow提供的GradientDescent進行求解/手擼GradientDescent兩種方法
西瓜數據集
Density | Sugar | Quality |
---|---|---|
0.697 | 0.46 | 1 |
0.774 | 0.376 | 1 |
0.634 | 0.264 | 1 |
0.608 | 0.318 | 1 |
0.556 | 0.215 | 1 |
0.403 | 0.237 | 1 |
0.481 | 0.149 | 1 |
0.437 | 0.211 | 1 |
0.666 | 0.091 | 0 |
0.243 | 0.267 | 0 |
0.245 | 0.057 | 0 |
0.343 | 0.099 | 0 |
0.639 | 0.161 | 0 |
0.657 | 0.198 | 0 |
0.36 | 0.37 | 0 |
0.593 | 0.042 | 0 |
0.719 | 0.103 | 0 |
使用TF,並提供一些TF的小tips
import tensorflow as tf
import xlrd
import math
import matplotlib.pyplot as plt
import numpy as np
# Load the watermelon data set from the first sheet of the workbook.
# Columns are read as density, sugar and label, in that order.
# NOTE(review): xlrd >= 2.0 only reads legacy .xls files; confirm the
# installed xlrd version can still open this .xlsx workbook.
data = xlrd.open_workbook('./wmdata.xlsx')
sheet = data.sheet_by_index(0)
Den = sheet.col_values(0)
Sug = sheet.col_values(1)
Res = sheet.col_values(2)

# train data
Train_X = np.transpose(np.array([Den, Sug]))  # n * 2
Train_Y = np.reshape(np.array(Res), (len(Res), 1))  # n * 1

# Logistic Regression
X = tf.placeholder(tf.float32, [None, 2])  # n * 2
Y = tf.placeholder(tf.float32, [None, 1])  # n * 1
w = tf.Variable(tf.zeros([2, 1]), name="weight")  # X * w + b = n * 1
b = tf.Variable(0.0, name="bias")
logits = tf.matmul(X, w) + b  # z_i = w x_i + b, n * 1

# loss = \sum_{i=1}^{m} { - y_i * z_i + ln(1 + exp(z_i)) }
# (reduce_mean times the sample count equals the sum over the full batch)
loss = (- tf.matmul(Y, logits, transpose_a=True)
        + len(Train_Y) * tf.reduce_mean(tf.log(1 + tf.exp(logits))))

# P(y = 1 | x) = 1 / (1 + exp(-z)).  The sign of z matters: without the
# minus the expression is P(y = 0 | x), which contradicts the loss above.
predict = 1 / (1 + tf.exp(-logits))

# start
alpha = 0.1  # learning rate
train_op = tf.train.GradientDescentOptimizer(alpha).minimize(loss)
init = tf.global_variables_initializer()
feed = {X: Train_X, Y: Train_Y}

# The context manager releases the session's resources when training ends.
with tf.Session() as sess:
    sess.run(init)
    for i in range(3000):
        sess.run(train_op, feed_dict=feed)
        if i % 50 == 0:
            print(sess.run(loss, feed_dict=feed))
    # Read the parameters in a separate run so they are the post-update
    # values; fetching them together with train_op has no defined order.
    w_v, b_v = sess.run([w, b])
    pred_v = sess.run(predict, feed_dict={X: Train_X})

# Scatter the two classes and draw the learned decision boundary.
pos = np.where(Train_Y > 0)[0]
neg = np.where(Train_Y == 0)[0]
plt.plot(Train_X[pos][:, 0], Train_X[pos][:, 1], "+r")
plt.plot(Train_X[neg][:, 0], Train_X[neg][:, 1], "*b")
# [x, y] * [w1; w2] + b = 0  =>  w1 x + w2 y + b = 0
#                            =>  y = -(w1 / w2) x - b / w2
plt.plot(Train_X[:, 0], -w_v[0] / w_v[1] * Train_X[:, 0] - b_v / w_v[1])
plt.xlabel("Density")
plt.ylabel("Sugar")
plt.show()
Attention:
- loss 函數好好寫,就不會出錯
- tf.matmul 做的是一般的矩陣乘法;而 TF 中的 Mat1 * Mat2 是逐元素相乘,相當於 matlab 中的 Mat1 .* Mat2
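- 設輸入序列x_i是一個n維變量。Logistic迴歸裏面的wX+b, 其實是有n+1個參數需要求解。但實際上若要確定n維空間中的分界面(超平面),只需要n個參數就夠了。so多求的一個參數的作用是什麼? 不是很理解。(註:超平面 wX+b=0 對 (w, b) 整體乘上一個非零常數後不變,所以幾何上只有 n 個自由度;多出來的那個自由度是 (w, b) 的整體尺度,它決定 sigmoid 的陡峭程度,也就是離分界面多遠時預測概率會接近 0 或 1。)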
GradientDescent
//TODO
3.4 UCI數據集
選擇Iris數據集,轉成txt,實現Logistic迴歸。該數據集有3個類別,4個屬性(自變量)。考慮使用OvR(One versus Rest)法進行分類。
首先先隨機sort一下數據集,再依次選取數據集中的1/10作爲測試集。
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import math
import random
def readTxtData(Filename):
    """Read a comma-separated Iris file into feature and label lists.

    Each usable line has four numeric fields followed by a species name.
    Species names are mapped to integer labels: "Iris-setosa" -> 0,
    "Iris-versicolor" -> 1, "Iris-virginica" -> 2.

    Lines with fewer than five fields (including blank lines) and lines
    with an unrecognised species name are skipped.

    Args:
        Filename: path of the text file to read.

    Returns:
        (Train_X, Train_Y): a list of 4-element float lists and the list of
        matching integer labels.

    Raises:
        ValueError: if a line with a recognised species has a non-numeric
            feature field.
    """
    label_ids = {
        "Iris-setosa": 0,
        "Iris-versicolor": 1,
        "Iris-virginica": 2,
    }
    Train_X = []
    Train_Y = []
    with open(Filename, "r") as f:
        for line in f:
            fields = line.strip().split(",")
            # Blank or truncated rows have no label column; skip them
            # instead of crashing on float('') or fields[4].
            if len(fields) < 5:
                continue
            label = label_ids.get(fields[4])
            if label is None:
                # Unknown species: ignore the row before parsing its features.
                continue
            Train_X.append([float(attr) for attr in fields[0:4]])  # 0~3
            Train_Y.append(label)
    return Train_X, Train_Y
def swap(a, b):
    """Return the two arguments as a tuple in reversed order."""
    swapped = (b, a)
    return swapped
def randomSort(X, Y):
    """Shuffle X and Y in place with the same permutation.

    Uses a Fisher-Yates shuffle so every ordering is equally likely, and
    X[k] stays paired with Y[k] after the shuffle. Inputs of length 0 or 1
    are returned unchanged (the previous pair-swap loop never terminated
    for a single element).

    Args:
        X: mutable sequence of samples.
        Y: mutable sequence of labels, at least as long as X.

    Returns:
        (X, Y): the same two objects, shuffled in place.
    """
    for k in range(len(X) - 1, 0, -1):
        r = random.randint(0, k)  # inclusive on both ends
        X[k], X[r] = X[r], X[k]
        Y[k], Y[r] = Y[r], Y[k]
    return X, Y
def genTrainData(Train_X, Train_Y, i):  # generate i th set of data out of 10
    """Split the data into the i-th of ten folds.

    The fold size is len(Train_Y) // 10. Samples [i * Span, (i + 1) * Span)
    form the test set; every other sample, including the last one, goes to
    the training set.

    Args:
        Train_X: sequence of samples (sliceable).
        Train_Y: sequence of labels, same length as Train_X.
        i: fold index, 0..9.

    Returns:
        (TF_Train_X, TF_Train_Y, TF_Test_X, TF_Test_Y). The test parts are
        plain slices of the inputs. The training parts are built with
        np.append without an axis, so they are flattened 1-D arrays; the
        caller must reshape TF_Train_X back to (-1, n_features).
    """
    Num = len(Train_Y)  # number of training data
    Span = Num // 10
    lo, hi = i * Span, (i + 1) * Span
    TF_Test_X = Train_X[lo:hi]
    TF_Test_Y = Train_Y[lo:hi]
    # Slice to the end of the data: a stop index of -1 would silently drop
    # the final sample from every training fold.
    TF_Train_X = np.append(Train_X[:lo], Train_X[hi:])
    TF_Train_Y = np.append(Train_Y[:lo], Train_Y[hi:])
    return TF_Train_X, TF_Train_Y, TF_Test_X, TF_Test_Y
def genTrainY(temp_Y, j):
    """Build a one-vs-rest target column for class j.

    Returns a (len(temp_Y), 1) array holding 1 where the label equals j
    and 0 everywhere else.
    """
    flags = [1 if label == j else 0 for label in temp_Y]
    column = np.array(flags)
    return column.reshape((len(flags), 1))
def runLogistic(Train_X, Train_Y, num):
    """Fit one binary logistic-regression model with TF1 gradient descent.

    Runs 500 gradient-descent steps with learning rate 0.05 and prints the
    loss every 100 steps.

    Args:
        Train_X: feature matrix fed to a [None, 4] float64 placeholder.
        Train_Y: 0/1 targets fed to a [None, 1] float64 placeholder.
        num: divisor applied to the label term of the loss; the caller in
            this file passes the number of training samples.

    Returns:
        (w_v, b_v): the weight matrix (4 x 1) and the bias value read from
        the variables after the last training step.
    """
    # Build each model in its own graph. This function is called once per
    # class per fold, and adding every model to the global default graph
    # would keep growing it for the lifetime of the process.
    with tf.Graph().as_default():
        X = tf.placeholder(tf.float64, [None, 4])  # n * 4
        Y = tf.placeholder(tf.float64, [None, 1])  # n * 1
        w = tf.Variable(tf.zeros([4, 1], dtype=tf.float64), name="weight", dtype=tf.float64)  # X * w + b = n * 1
        b = tf.Variable(0.0, name="bias", dtype=tf.float64)
        logits = tf.matmul(X, w) + b
        # The cost is written as \sum{...} / num rather than a plain sum;
        # otherwise the optimization process may not converge.
        loss = - tf.matmul(Y, logits, transpose_a=True) / num + tf.reduce_mean(tf.log(1 + tf.exp(logits)))
        alpha = 0.05  # learning rate
        train_op = tf.train.GradientDescentOptimizer(float(alpha)).minimize(loss)
        init = tf.global_variables_initializer()
        feed = {X: Train_X, Y: Train_Y}
        # The context manager closes the session, releasing its resources.
        with tf.Session() as sess:
            sess.run(init)
            for k in range(500):
                sess.run(train_op, feed_dict=feed)
                if k % 100 == 0:
                    print(k, "th loss is", sess.run(loss, feed_dict=feed))
            # Read the parameters in a separate run so they are the
            # post-update values; fetching them together with train_op has
            # no defined read order.
            w_v, b_v = sess.run([w, b])
    return w_v, b_v
def runTest(Test_X, Test_Y, W, B, classNum):
    """Evaluate one-vs-rest logistic classifiers on a test set.

    For every class k the sigmoid score 1 / (1 + exp(-(x . W[k] + B[k])))
    is computed for each test sample. A sample is assigned to the class
    with the highest score (the lowest index wins ties). The number of
    misclassified samples is printed.

    Args:
        Test_X: test samples, one row of features per sample.
        Test_Y: true integer class labels, one per sample.
        W: per-class weight vectors; W[k] may be shaped (d,) or (d, 1).
        B: per-class biases.
        classNum: number of classes (entries of W and B to use).

    Returns:
        The error rate: misclassified count divided by len(Test_Y).
    """
    # Use the Test_X argument; the earlier version read a global variable
    # here and ignored the parameter.
    features = np.asarray(Test_X, dtype=float)
    scores = []
    for k in range(classNum):
        logits = features.dot(W[k]) + B[k]
        scores.append(1 / (1 + np.exp(-logits)))
    # One row per class, one column per sample, whatever the shape of W[k].
    scores = np.array(scores).reshape(classNum, -1)
    # find the max corresponding classification
    predicted = np.argmax(scores, axis=0)
    truth = np.asarray(Test_Y).reshape(-1)
    faul = int(np.sum(predicted != truth))
    print(faul)
    return faul / len(Test_Y)
# main
# Load the Iris samples and shuffle them before splitting into folds.
Train_X, Train_Y = readTxtData("Iris.txt")
Train_X, Train_Y = randomSort(Train_X, Train_Y)

Res = []  # one error rate per fold

# Ten Fold
for fold in range(10):
    # Divide training data and testing data
    temp_X, temp_Y, TF_Test_X, TF_Test_Y = genTrainData(Train_X, Train_Y, fold)
    # genTrainData returns the training features flattened; restore the
    # (samples, 4 attributes) layout.
    TF_Train_X = np.array(temp_X).reshape(-1, 4)

    W, B = [], []
    for cls in range(3):
        # OvR: targets are 1 where the label equals cls, else 0
        TF_Train_Y = genTrainY(temp_Y, cls)
        w_v, b_v = runLogistic(TF_Train_X, TF_Train_Y, len(TF_Train_Y))
        W.append(w_v)
        B.append(b_v)

    # test
    Res.append(runTest(TF_Test_X, TF_Test_Y, W, B, 3))
程序是可以跑通的,但是我偷懶只跑了十折法中的第一折,測試集錯誤率爲0。可見分類還是蠻準的。
And, 還有一個需要注意的小地方,寫代價函數的時候最好寫成\sum / num的形式,不要直接寫成\sum和的形式,不然不好收斂。