Notes on Hung-yi Lee's Machine Learning Course - 4.5 Logistic Regression in Python (Hands-On)

This post is my record of completing homework 2, following the provided sample code, while studying Hung-yi Lee's machine learning course.

Follow my WeChat official account 臭鹹魚 and reply with LHY to get download links for the course slides, data, and code.

import numpy as np
import matplotlib.pyplot as plt
np.random.seed(0) # make the randomly generated numbers the same on every run


## Function definitions
# Normalization
def _normalize(X, train=True, specified_column=None, X_mean=None, X_std=None):
    # This function normalizes specific columns of X.
    # The mean and standard deviation of the training data are reused when processing testing data.
    #
    # Arguments:
    #     X: data to be processed
    #     train: 'True' when processing training data, 'False' for testing data
    #     specified_column: indexes of the columns that will be normalized. If 'None', all columns
    #         will be normalized.
    #     X_mean: mean value of training data, used when train = 'False'
    #     X_std: standard deviation of training data, used when train = 'False'
    # Outputs:
    #     X: normalized data
    #     X_mean: computed mean value of training data
    #     X_std: computed standard deviation of training data
    if specified_column is None:
        specified_column = np.arange(X.shape[1])
    if train:
        X_mean = np.mean(X[:, specified_column], axis=0).reshape(1, -1)
        X_std = np.std(X[:, specified_column], axis=0).reshape(1, -1)
    X[:, specified_column] = (X[:, specified_column] - X_mean) / (X_std + 1e-8)
    return X, X_mean, X_std
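
A minimal usage sketch (the toy values below are hypothetical, only to show the call convention and how the training statistics are reused at test time):

toy_train = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])  # hypothetical toy data
toy_test = np.array([[2.0, 25.0]])
toy_train, toy_mean, toy_std = _normalize(toy_train, train=True)
# at test time, reuse the mean/std computed on the training data
toy_test, _, _ = _normalize(toy_test, train=False, X_mean=toy_mean, X_std=toy_std)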

# Train/validation split
def _train_valid_split(X, Y, valid_ratio=0.25):
    # This function splits data into training set and validation set.
    train_size = int(len(X) * (1 - valid_ratio))
    return X[:train_size], Y[:train_size], X[train_size:], Y[train_size:]

# Shuffle the data
def _shuffle(X, Y):
    # This function shuffles two equal-length arrays, X and Y, in unison.
    randomize = np.arange(len(X))
    np.random.shuffle(randomize)
    return (X[randomize], Y[randomize])

# Sigmoid function
def _sigmoid(z):
    # The sigmoid function maps a score to a probability.
    # The output is clipped away from 0 and 1 so that the logarithms in the
    # cross-entropy loss stay finite.
    return np.clip(1.0 / (1.0 + np.exp(-z)), 1e-8, 1 - 1e-8)

# Forward pass
def _f(X, w, b):
    # This is the logistic regression function, parameterized by w and b
    #
    # Arguments:
    #     X: input data, shape = [batch_size, data_dimension]
    #     w: weight vector, shape = [data_dimension, ]
    #     b: bias, scalar
    # Output:
    #     predicted probability of each row of X being positively labeled, shape = [batch_size, ]
    return _sigmoid(np.matmul(X, w) + b)
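
In other words, _f implements the logistic regression model from the lecture:

$$f_{w,b}(x) = \sigma(w \cdot x + b) = \frac{1}{1 + e^{-(w \cdot x + b)}}$$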

# Prediction
def _predict(X, w, b):
    # This function returns a 0/1 prediction for each row of X
    # by rounding the output of the logistic regression function.
    return np.round(_f(X, w, b)).astype(int)

# Accuracy
def _accuracy(Y_pred, Y_label):
    # This function calculates prediction accuracy.
    # For 0/1 predictions and labels, 1 - mean(|Y_pred - Y_label|) is the fraction of correct predictions.
    return 1 - np.mean(np.abs(Y_pred - Y_label))

# Cross-entropy loss
def _cross_entropy_loss(y_pred, Y_label):
    # This function computes the cross entropy.
    #
    # Arguments:
    #     y_pred: probabilistic predictions, float vector
    #     Y_label: ground truth labels, bool vector
    # Output:
    #     cross entropy, scalar
    return -np.dot(Y_label, np.log(y_pred)) - np.dot((1 - Y_label), np.log(1 - y_pred))
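
This is the standard binary cross-entropy, where $y^n$ is the ground-truth label of the n-th example and $f_{w,b}(x^n)$ is its predicted probability:

$$L(w, b) = -\sum_n \Big[ y^n \ln f_{w,b}(x^n) + (1 - y^n) \ln\big(1 - f_{w,b}(x^n)\big) \Big]$$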

# Gradient computation
def _gradient(X, Y_label, w, b):
    # This function computes the gradient of cross entropy loss with respect to weight w and bias b.
    Y_pred = _f(X, w, b)
    pred_error = Y_label - Y_pred
    # pred_error has shape [batch_size,]; broadcasting it against X.T ([data_dimension, batch_size])
    # and summing over the batch gives one gradient entry per weight
    w_grad = -np.sum(pred_error * X.T, axis=1)
    b_grad = -np.sum(pred_error)
    return w_grad, b_grad
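
Differentiating the loss above gives exactly what _gradient computes, with pred_error playing the role of $y^n - f_{w,b}(x^n)$:

$$\frac{\partial L}{\partial w_j} = -\sum_n \big(y^n - f_{w,b}(x^n)\big)\, x_j^n, \qquad \frac{\partial L}{\partial b} = -\sum_n \big(y^n - f_{w,b}(x^n)\big)$$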


## File paths
X_train_fpath = '../data/X_train.csv'
Y_train_fpath = '../data/Y_train.csv'
X_test_fpath = '../data/X_test.csv'
output_fpath = 'output.csv'


## Load the data
with open(X_train_fpath) as f:
    next(f) # skip the header row
    X_train = np.array([line.strip('\n').split(',')[1:] for line in f], dtype=float) # drop the ID in the first column
    # print(X_train)
with open(Y_train_fpath) as f:
    next(f) # skip the header row
    Y_train = np.array([line.strip('\n').split(',')[1] for line in f], dtype=float) # drop the ID and keep only the second column (the label)
    # print(Y_train)
with open(X_test_fpath) as f:
    next(f) # skip the header row
    X_test = np.array([line.strip('\n').split(',')[1:] for line in f], dtype=float)
    # print(X_test)


## Dataset preparation
# Normalize the training and testing sets
X_train, X_mean, X_std = _normalize(X_train, train=True)
X_test, _, _ = _normalize(X_test, train=False, specified_column=None, X_mean=X_mean, X_std=X_std)
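# An optional sanity check (my own addition, not part of the homework): after
# normalization, each training column should have mean close to 0 and std close to 1.
print('Max |column mean|: {}'.format(np.max(np.abs(np.mean(X_train, axis=0)))))
print('Max column std: {}'.format(np.max(np.std(X_train, axis=0))))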
# Split into training and validation sets
X_train, Y_train, X_valid, Y_valid = _train_valid_split(X_train, Y_train, valid_ratio=0.1)
train_size = X_train.shape[0]
valid_size = X_valid.shape[0]
test_size = X_test.shape[0]
data_dim = X_train.shape[1]
print('Size of training set: {}'.format(train_size))
print('Size of validation set: {}'.format(valid_size))
print('Size of testing set: {}'.format(test_size))
print('Dimension of data: {}'.format(data_dim))

## Training (mini-batch gradient descent)
# Initialize parameters
w = np.zeros((data_dim, ))
b = np.zeros((1, ))
# Training hyperparameters
max_iter = 10
batch_size = 8
learning_rate = 0.2
# Save the loss and accuracy of every epoch for plotting
train_loss = []
valid_loss = []
train_acc = []
valid_acc = []
step = 1
# Iterate over epochs
for epoch in range(max_iter):
    # Shuffle the training set
    X_train, Y_train = _shuffle(X_train, Y_train)
    # Mini-batch training
    for idx in range(int(np.floor(X_train.shape[0] / batch_size))):
        # Take one mini-batch
        X = X_train[idx * batch_size : idx * batch_size + batch_size]
        Y = Y_train[idx * batch_size : idx * batch_size + batch_size]
        # Compute the gradient
        w_grad, b_grad = _gradient(X, Y, w, b)
        # Gradient descent update (the learning rate decays with time)
        w = w - learning_rate / np.sqrt(step) * w_grad
        b = b - learning_rate / np.sqrt(step) * b_grad
        step = step + 1
    # Compute the loss and accuracy on the training and validation sets
    y_train_pred = _f(X_train, w, b)
    Y_train_pred = np.round(y_train_pred)
    train_acc.append(_accuracy(Y_train_pred, Y_train))
    train_loss.append(_cross_entropy_loss(y_train_pred, Y_train) / train_size)
    y_valid_pred = _f(X_valid, w, b)
    Y_valid_pred = np.round(y_valid_pred)
    valid_acc.append(_accuracy(Y_valid_pred, Y_valid))
    valid_loss.append(_cross_entropy_loss(y_valid_pred, Y_valid) / valid_size)
print('Training loss: {}'.format(train_loss[-1]))
print('Validation loss: {}'.format(valid_loss[-1]))
print('Training accuracy: {}'.format(train_acc[-1]))
print('Validation accuracy: {}'.format(valid_acc[-1]))
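
For reference, the parameter update inside the loop uses a step size that decays with the global step count $t$ (with $\eta = 0.2$ here):

$$w \leftarrow w - \frac{\eta}{\sqrt{t}}\, \nabla_w L, \qquad b \leftarrow b - \frac{\eta}{\sqrt{t}}\, \frac{\partial L}{\partial b}$$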


## Visualizing the training process
# Loss curves
plt.plot(train_loss)
plt.plot(valid_loss)
plt.title('Loss')
plt.legend(['train', 'valid'])
plt.savefig('Loss.png')
plt.show()
# Accuracy curves
plt.plot(train_acc)
plt.plot(valid_acc)
plt.title('Accuracy')
plt.legend(['train', 'valid'])
plt.savefig('Accuracy.png')
plt.show()


## Predict on the testing set
predictions = _predict(X_test, w, b)
with open(output_fpath, 'w') as f:
    f.write('id,label\n')
    for i, label in enumerate(predictions):
        f.write('{},{}\n'.format(i, label))


## Find the 10 most significant feature dimensions
# Since the features were normalized, the weight magnitudes are roughly comparable across dimensions.
index = np.argsort(np.abs(w))[::-1] # sort the indexes of w by absolute value, in descending order
with open(X_test_fpath) as f:
    features = np.array(f.readline().strip('\n').split(','))
    for i in index[:10]:
        print(features[i], w[i])


Github (github.com): @chouxianyu

Github Pages (github.io): @臭鹹魚

Zhihu (zhihu.com): @臭鹹魚

Cnblogs (cnblogs.com): @臭鹹魚

Bilibili (bilibili.com): @絕版臭鹹魚

WeChat official account: @臭鹹魚

Please credit the source when reposting. Discussion and exchange are welcome!

