3.3 西瓜書數據集Logistic迴歸分類

用TensorFlow提供的GradientDescent進行求解／手擼GradientDescent，共兩種方法

西瓜數據集

Density	Sugar	Quality
0.697	0.46	1
0.774	0.376	1
0.634	0.264	1
0.608	0.318	1
0.556	0.215	1
0.403	0.237	1
0.481	0.149	1
0.437	0.211	1
0.666	0.091	0
0.243	0.267	0
0.245	0.057	0
0.343	0.099	0
0.639	0.161	0
0.657	0.198	0
0.36	0.37	0
0.593	0.042	0
0.719	0.103	0

使用TF，並提供一些TF的小tips

import tensorflow as tf
import xlrd
import math
import matplotlib.pyplot as plt
import numpy as np

# Load the watermelon data set from the first worksheet:
# column 0 = density, column 1 = sugar content, column 2 = quality label (1/0).
# NOTE(review): xlrd >= 2.0 reads only .xls files; confirm the installed
# version can still open this .xlsx workbook.
data = xlrd.open_workbook('./wmdata.xlsx')
sheet = data.sheet_by_index(0)
Den = sheet.col_values(0)
Sug = sheet.col_values(1)
Res = sheet.col_values(2)

# train data
Train_X = np.transpose(np.array([Den, Sug]))  # n * 2
Train_Y = np.reshape(np.array(Res), (len(Res), 1))  # n * 1

# Logistic Regression
X = tf.placeholder(tf.float32, [None, 2]) # n * 2
Y = tf.placeholder(tf.float32, [None, 1]) # n * 1
w = tf.Variable(tf.zeros([2, 1]), name = "weight") #  X * w + b = n * 1
b = tf.Variable(0.0, name = "bias")
logits = tf.matmul(X, w) + b  # z = X * w + b, shared by the loss and the prediction
# loss = \sum_{i=1}^{m} { - y_i * z_i + ln(1 + exp(z_i)) }
# reduce_mean times the sample count gives the plain sum over all samples.
loss = - tf.matmul(Y, logits, transpose_a = True) + len(Train_Y) * tf.reduce_mean(tf.log(1 + tf.exp(logits)))
# P(y = 1 | x) = 1 / (1 + exp(-z)). The minus sign matters: with exp(+z)
# the expression would be P(y = 0 | x) instead.
predict = 1 / (1 + tf.exp(-logits))

# start
alpha = 0.1 # learning rate
train_op = tf.train.GradientDescentOptimizer(alpha).minimize(loss)

init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
feed = {X: Train_X, Y: Train_Y}
for i in range(3000):
    _, w_v, b_v = sess.run([train_op, w, b], feed_dict=feed)
    if i % 50 == 0:
        print(sess.run(loss, feed_dict=feed))

# Predicted probability of the positive class for every training sample.
sess.run(predict, feed_dict={X: Train_X})

# Scatter the two classes, then draw the learned decision boundary.
pos_rows = np.where(Train_Y > 0)[0]
neg_rows = np.where(Train_Y == 0)[0]
plt.plot(Train_X[pos_rows][:, 0], Train_X[pos_rows][:, 1], "+r")
plt.plot(Train_X[neg_rows][:, 0], Train_X[neg_rows][:, 1], "*b")
plt.plot(Train_X[:, 0],  -w_v[0]/w_v[1] * Train_X[:, 0] - b_v/w_v[1])
plt.xlabel("Density")
plt.ylabel("Sugar")
plt.show()
# Boundary: [x, y] * [w1; w2] + b = 0  =>  w1 x + w2 y + b = 0  =>  y = - (w1 / w2) x - b / w2

Attention:

loss 函數好好寫，就不會出錯
tf.matmul 使用的矩陣乘法是正常的矩陣乘法。Mat1 * Mat2是matlab中的Mat1 .* Mat2
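設輸入序列x_i是一個n維變量。Logistic迴歸裏面的wX+b, 其實是有n+1個參數需要求解。但實際上若要確定n維空間中的分界面（超平面），只需要n個參數就夠了。so多求的一個參數的作用是什麼？不是很理解。（註：分界面 wX+b=0 對 (w, b) 同乘一個非零常數不變，所以確定超平面只需要n個自由度；但Logistic迴歸輸出的是概率 1/(1+exp(-(wX+b)))，(w, b) 的整體尺度決定了概率隨著到分界面的距離變化的陡峭程度，因此n+1個參數都有意義。）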

GradientDescent

//TODO

3.4 UCI數據集

選擇Iris數據集，轉成txt，實現Logistic迴歸。該數據集有3個類別，4個屬性（自變量）。考慮使用OvR（One versus Rest）法進行分類。

首先隨機打亂（shuffle）一下數據集，再依次選取數據集中的1/10作爲測試集。

import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import math
import random

def readTxtData(Filename):
    """Read a comma-separated Iris file into feature rows and integer labels.

    Each usable line holds four numeric attributes followed by the species
    name. Species map to labels as: "Iris-setosa" -> 0,
    "Iris-versicolor" -> 1, "Iris-virginica" -> 2.

    Lines with fewer than five fields (e.g. blank lines at the end of the
    file) and lines whose species is not one of the three above are skipped.

    Args:
        Filename: path of the text file to read.

    Returns:
        (Train_X, Train_Y): a list of 4-element float lists and the list of
        matching integer labels, in file order.

    Raises:
        ValueError: if a line with a known species has a non-numeric attribute.
    """
    label_ids = {"Iris-setosa": 0, "Iris-versicolor": 1, "Iris-virginica": 2}
    Train_X = []
    Train_Y = []
    with open(Filename, "r") as f:
        for line in f:
            iris = line.strip().split(",")
            # Validate the line before converting, so blank lines and
            # unlabeled rows are ignored rather than crashing float().
            if len(iris) < 5 or iris[4] not in label_ids:
                continue
            Train_X.append([float(attr) for attr in iris[0:4]])
            Train_Y.append(label_ids[iris[4]])
    return Train_X, Train_Y

def swap(a, b):
    """Return the two arguments as a tuple in reversed order: (b, a)."""
    reversed_pair = (b, a)
    return reversed_pair

def randomSort(X, Y):
    """Shuffle X and Y in place with the same random permutation.

    Uses a Fisher-Yates shuffle, so every permutation is equally likely and
    the element at X[k] stays paired with the element at Y[k]. Inputs of
    length 0 or 1 are returned unchanged.

    Args:
        X: mutable sequence of samples.
        Y: mutable sequence of labels, the same length as X.

    Returns:
        (X, Y): the same two objects, now shuffled.
    """
    Len = len(X)
    # Walk from the last position down; each position receives a uniformly
    # chosen element from the not-yet-fixed prefix (itself included).
    for k in range(Len - 1, 0, -1):
        j = random.randint(0, k)
        X[k], X[j] = X[j], X[k]
        Y[k], Y[j] = Y[j], Y[k]
    return X, Y

def genTrainData(Train_X, Train_Y, i):# generate i th set of data out of 10
    """Split the data into the i-th of ten test folds and the remaining training data.

    The fold size is len(Train_Y) // 10. The test fold is the slice
    [i * Span, (i + 1) * Span); everything before and after it is training data.

    Args:
        Train_X: sequence of samples (sliceable).
        Train_Y: sequence of labels, aligned with Train_X.
        i: fold index.

    Returns:
        (TF_Train_X, TF_Train_Y, TF_Test_X, TF_Test_Y). The two training
        values are built with np.append without an axis and are therefore
        flattened 1-D arrays; the caller has to reshape TF_Train_X back to
        one row per sample. The two test values are plain slices of the inputs.
    """
    Num = len(Train_Y) # number of training data
    Span = Num // 10
    start, stop = i * Span, (i + 1) * Span
    TF_Test_X = Train_X[start:stop]
    TF_Test_Y = Train_Y[start:stop]
    # Slice through the end of the data (not up to -1) so that the last
    # sample is kept in the training split.
    TF_Train_X = np.append(Train_X[:start], Train_X[stop:])
    TF_Train_Y = np.append(Train_Y[:start], Train_Y[stop:])
    return TF_Train_X, TF_Train_Y, TF_Test_X, TF_Test_Y

def genTrainY(temp_Y, j):
    """Build the one-vs-rest target column for class j.

    Args:
        temp_Y: sequence of class labels.
        j: the class treated as positive.

    Returns:
        An array of shape (len(temp_Y), 1) holding 1 where the label equals
        j and 0 elsewhere.
    """
    flags = [1 if label == j else 0 for label in temp_Y]
    return np.array(flags).reshape((len(flags), 1))

def runLogistic(Train_X, Train_Y, num):
    """Fit a binary logistic-regression model on 4-feature data with gradient descent.

    Runs 500 gradient-descent steps at learning rate 0.05 and prints the loss
    every 100 steps.

    Args:
        Train_X: samples fed to an [None, 4] float64 placeholder.
        Train_Y: 0/1 targets fed to an [None, 1] float64 placeholder.
        num: divisor applied to the -y * z term of the loss; the caller
            passes the number of training samples.

    Returns:
        (w_v, b_v): the weight values (shape 4 x 1) and the bias value
        fetched by the last training step.
    """
    # Build each model in its own graph so repeated calls do not keep adding
    # nodes to the process-wide default graph.
    with tf.Graph().as_default():
        X = tf.placeholder(tf.float64, [None, 4]) # n * 4
        Y = tf.placeholder(tf.float64, [None, 1]) # n * 1
        w = tf.Variable(tf.zeros([4, 1],dtype = tf.float64), name = "weight", dtype = tf.float64) #  X * w + b = n * 1
        b = tf.Variable(0.0, name = "bias", dtype = tf.float64)
        logits = tf.matmul(X, w) + b
        # The cost is written as sum / num rather than a plain sum; with the
        # plain sum the optimization may fail to converge.
        loss = - tf.matmul(Y, logits, transpose_a = True)/num + tf.reduce_mean(tf.log(1 + tf.exp(logits)))
        alpha = 0.05 # learning rate
        train_op = tf.train.GradientDescentOptimizer(float(alpha)).minimize(loss)

        init = tf.global_variables_initializer()
        feed = {X: Train_X, Y: Train_Y}
        # The context manager closes the session and frees its resources
        # when training finishes.
        with tf.Session() as sess:
            sess.run(init)
            for k in range(500):
                _, w_v, b_v = sess.run([train_op, w, b], feed_dict=feed)
                if k % 100 == 0:
                    print(k, "th loss is", sess.run(loss, feed_dict=feed))

    return w_v, b_v

def runTest(Test_X, Test_Y, W, B, classNum):
    """Classify the test samples with one-vs-rest models and return the error rate.

    For every class c the sigmoid score 1 / (1 + exp(-(x . W[c] + B[c]))) is
    computed for each sample; a sample is assigned to the class with the
    highest score (the first such class on ties). The number of
    misclassified samples is printed.

    Args:
        Test_X: test samples, one row per sample.
        Test_Y: true class indices, one per sample.
        W: per-class weight arrays, indexed by class.
        B: per-class bias values, indexed by class.
        classNum: number of classes.

    Returns:
        The fraction of misclassified samples.

    Raises:
        ZeroDivisionError: if Test_Y is empty.
    """
    samples = np.array(Test_X)
    # One row of scores per class. reshape(-1) flattens the (n, 1) column
    # that results when a weight array has shape (d, 1).
    scores = np.array([
        (1 / (1 + np.exp(-(samples.dot(W[c]) + B[c])))).reshape(-1)
        for c in range(classNum)
    ])
    predicted = np.argmax(scores, axis=0)
    faul = int(np.sum(predicted != np.array(Test_Y).reshape(-1)))

    print(faul)
    return faul / len(Test_Y)

# main: load the Iris data, shuffle it, then run ten-fold evaluation.
Train_X, Train_Y = readTxtData("Iris.txt")
Train_X, Train_Y = randomSort(Train_X, Train_Y)

# Error rate of each fold, in fold order.
Res = []
# Ten Fold
for i in range(10):
    # Divide training data and testing data
    temp_X, temp_Y, TF_Test_X, TF_Test_Y = genTrainData(Train_X, Train_Y, i)
    # genTrainData returns the training samples flattened; restore one row
    # of 4 attributes per sample.
    flat_X = np.array(temp_X)
    TF_Train_X = flat_X.reshape((len(flat_X) // 4, 4))

    # OvR: train one binary model per class (target is 1 where temp_Y == j).
    W, B = [], []
    for j in range(3):
        TF_Train_Y = genTrainY(temp_Y, j)
        w_v, b_v = runLogistic(TF_Train_X, TF_Train_Y, len(TF_Train_Y))
        W.append(w_v)
        B.append(b_v)

    # test
    res = runTest(TF_Test_X, TF_Test_Y, W, B, 3)
    Res.append(res)

程序是可以跑通的，但是我偷懶只跑了十折法中的第一折，測試集錯誤率爲0。可見分類還是蠻準的。

And, 還有一個需要注意的小地方，寫代價函數的時候最好寫成\sum / num的形式，不要直接寫成\sum和的形式，不然不好收斂。

西瓜書習題3.3，3.4 Based on TensorFlow

3.3 西瓜書聚集Logistic迴歸分類

西瓜數據集

使用TF，並提供一些TF的小tips

Attention:

GradientDescent

3.4 UCI數據集

ROUGE與pyrouge的安裝：centos server(無sudo)

NLP+深度學習：paper(1) Bengio: A Neural Probabilistic Language Model， word2vec藍本NNLM方法

實體鏈接Entity Linking開源工具：dexter2

西瓜書習題3.3，3.4 Based on TensorFlow

遺傳算法（三）解TSP問題

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結