線性迴歸

簡單線性迴歸——梯度下降代碼實現

[Python] 純文本查看複製代碼

import numpy as np

# scatter()/plot()/show() live in the pyplot submodule; the bare
# `matplotlib` package does not expose them.
import matplotlib.pyplot as plt

# 1. Load the data (a ready-made comma-separated dataset, data.csv).
points = np.genfromtxt("data.csv", delimiter=',')

N = len(points)

# Extract x and y: x is the first column of every row, y is the second.
x = np.array(points[:, 0])
y = np.array(points[:, 1])

# Draw a scatter plot of the raw data.
plt.scatter(x, y)

# show has to be called; a bare `plt.show` only references the function
# and displays nothing.
plt.show()

# 2. 定義損失函數

# 定義線性模式 y = mx + b

def compute_cost(points, b, m):
    """Return the mean squared error of the line y = m * x + b over points.

    Args:
        points: 2-D array-like; column 0 holds the x values and column 1
            holds the y values.
        b: Intercept of the line.
        m: Slope of the line.

    Returns:
        The average of (y - m * x - b) ** 2 over all rows, as a float.

    Raises:
        ZeroDivisionError: If points is a 2-D array with zero rows.
    """
    data = np.asarray(points)
    N = len(data)

    # Residual of every sample against the line, computed in one pass.
    residuals = data[:, 1] - m * data[:, 0] - b

    # Sum of squared errors divided by the sample count (the mean error).
    return float(np.sum(residuals ** 2)) / float(N)

# 3. Define the model hyperparameters

# These are: step size (learning rate), starting point, and iteration count

learning_rate = 0.0001

initial_b = 0

initial_m = 0

num_iteration = 10

# 4. 實現核心算法(梯度下降)

def gradient_descent(points, initial_b, initial_m, learning_rate, num_iteration):
    """Fit y = m * x + b by running a fixed number of gradient-descent steps.

    Args:
        points: Sample data passed on to compute_cost and step_grad_desc.
        initial_b: Starting intercept.
        initial_m: Starting slope.
        learning_rate: Step size handed to every update step.
        num_iteration: Number of update steps to perform.

    Returns:
        A list [b, m, cost_list]. cost_list holds the cost measured *before*
        each update, so it has num_iteration entries and does not include
        the cost of the final parameters.
    """
    intercept, slope = initial_b, initial_m

    # Cost history, one entry per iteration.
    history = []

    for _ in range(num_iteration):
        # Record the cost of the current parameters, then take one step.
        history.append(compute_cost(points, intercept, slope))
        intercept, slope = step_grad_desc(
            intercept, slope, np.array(points), learning_rate
        )

    return [intercept, slope, history]

# 每一步的梯度下降

def step_grad_desc(current_b, current_m, points, learning_rate):
    """Perform one gradient-descent update of the line y = m * x + b.

    The loss is the mean squared error, whose partial derivatives are
    (2 / N) * sum((m * x + b - y) * x) with respect to m and
    (2 / N) * sum(m * x + b - y) with respect to b.

    Args:
        current_b: Current intercept.
        current_m: Current slope.
        points: 2-D array-like; column 0 holds x and column 1 holds y.
        learning_rate: Step size applied to both gradients.

    Returns:
        A tuple (b_update, m_update) with the new intercept and slope.

    Raises:
        ZeroDivisionError: If points is a 2-D array with zero rows.
    """
    data = np.asarray(points)
    N = len(data)
    x = data[:, 0]
    y = data[:, 1]

    # Prediction error of the current line at every sample. Both gradients
    # are built from this residual; the intercept gradient must include the
    # "- y" term as well, otherwise b is driven by the raw predictions.
    residuals = current_m * x + current_b - y

    scale = 2 / float(N)
    m_grad = scale * np.sum(residuals * x)
    b_grad = scale * np.sum(residuals)

    m_update = current_m - learning_rate * m_grad
    b_update = current_b - learning_rate * b_grad

    return b_update, m_update

# 5. Test: run gradient descent to compute the fitted m and b

b, m, cost_list = gradient_descent(points, initial_b,initial_m, learning_rate, num_iteration)

print("final m is: ", m)

print("final b is: ", b)

print(cost_list)

# 6. Evaluate the loss function at the final parameters

print("final cost: ", compute_cost(points, b, m))

# 7. Plot how the loss decreases over the iterations

plt.plot(cost_list)

plt.show()

# 8. Plot the fitted line on top of the data

plt.scatter(x, y)

# Predictions of the fitted line at the original x values

y_pred = m * x + b

plt.plot(x, y_pred, c='r')

plt.show()

K 近鄰(KNN)算法(找自己最近的鄰居)

最簡單最初級的分類器，就是將全部的訓練數據所對應的類別都記錄下來
- 當測試對象的屬性和某個訓練對象的屬性完全匹配時，便可以對其進行分類
KNN 是一種基本分類方法，通過測量不同特徵值之間的距離進行分類
- 如果一個樣本在特徵空間中的 k 個最相似的樣本中的大多數屬於某一個類別，則該樣本也屬於這個類別
- 其中 k 通常是不大於 20 的整數
KNN 算法中，所選擇的鄰居都是已經正確分類的對象
KNN 算法的結果很大程度取決於K的選擇
- K 一般選擇奇數

[Python] 純文本查看複製代碼

# 添加依賴

import numpy as np

# 數值計算，數值分析庫

import pandas as pd

# 裏面有實例數據

from sklearn.datasets import load_iris

# Split the dataset (training set | test set)

from sklearn.model_selection import train_test_split

# 準確度評分（計算分類數據的預測準確度）

from sklearn.metrics import accuracy_score

if __name__ == '__main__':

    # Load and pre-process the data

    iris = load_iris()

    # print(iris)

    df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

    # Add a "class" label column

    df['class'] = iris.target

    # Replace the numeric targets 0/1/2 with the corresponding target names

    df['class'] = df['class'].map({0: iris.target_names[0],

                                   1: iris.target_names[1],

                                   2: iris.target_names[2]})

    # head() returns 5 rows by default

    print(df.head())

    # Show descriptive statistics

    print(df.describe())

    # Feature data as a 2-D array

    x = iris.data

    # Class labels, reshaped into a 2-D single-column array

    y = iris.target.reshape(-1, 1)

    print(x.shape, y.shape)

    # (150, 4) (150, 1)

    # Split into training and test sets (30% test, stratified by label)

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42, stratify=y)

    print("x_train.shape, y_train.shape")

    print(x_train.shape, y_train.shape)

    print("x_test.shape, y_test.shape")

    print(x_test.shape, y_test.shape)

    # 真正的算法實現

    # 距離函數

    def l1_dis(a, b):

        """計算2個一維向量距離,axis 表示求和之後最後形成的是1列(0表示行)"""

        return np.sum(np.abs(a - b), axis=1)

    def l2_dis(a, b):

        return np.sqrt(np.sum((a - b) ** 2, axis=1))

    # 分類器的實現

    class kNN(object):

        def __init__(self, k_neighbors=1, disc_func=l1_dis):

            """類的構造方法"""

            self.k_neighbors = k_neighbors

            self.disc_func = disc_func

        def fit(self, x, y):

            self.x_train = x

            self.y_train = y

        def predict(self, test):

            # 預測數組初始化爲0

            y_pred = np.zeros((test.shape[0], 1), dtype=self.y_train.dtype)

            for i, x_test in enumerate(test):

                # 計算矩陣距離

                distances = self.disc_func(self.x_train, x_test)

                # 按距離大小排序(取出對應索引值)

                nn_index = np.argsort(distances)

                # 取前 k 個值，計算分類頻率

                nn_pred = self.y_train[nn_index[:self.k_neighbors]].ravel()

                y_pred[i] = np.argmax(np.bincount(nn_pred))

            return y_pred

    # Test

    knn = kNN(k_neighbors=5)

    knn.fit(x_train, y_train)

    y_pred = knn.predict(x_test)

    # print("y_pred = {}".format(y_pred))

    # Print the classification accuracy as a percentage

    print("分類準確率：{}%".format(accuracy_score(y_test, y_pred) * 100))

文章來源於公眾號黑馬程序員廣州中心（itheimagz），更多資源請關注