單變量回歸
問題描述:你的數據集中,x 是某個城市的人口數量,y 是你的餐車在那個城市的盈虧數額。對這個數據集進行挖掘,幫助你進行決策。
首先導入並分析數據:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def loadData(firename):
    """Read a headerless CSV file into a DataFrame.

    The columns are labelled 'Population' and 'Profit', in that order.
    """
    column_names = ['Population', 'Profit']
    frame = pd.read_csv(firename, names=column_names, header=None)
    return frame
# Load the data set, print its first rows and summary statistics,
# then show a Population-vs-Profit scatter plot.
data = loadData('ex1data1.txt')
for summary in (data.head(), data.describe()):
    print(summary)
data.plot.scatter(x='Population', y='Profit', figsize=(12, 8))
plt.show()
輸出得到 data 的數據形式以及數據特徵:
Population Profit
0 6.1101 17.5920
1 5.5277 9.1302
2 8.5186 13.6620
3 7.0032 11.8540
4 5.8598 6.8233
Population Profit
count 97.000000 97.000000
mean 8.159800 5.839135
std 3.869884 5.510262
min 5.026900 -2.680700
25% 5.707700 1.986900
50% 6.589400 4.562300
75% 8.578100 7.046700
max 22.203000 24.147000
Process finished with exit code 0
繪製數據集的散點圖:
接着對數據進行預處理:
def initData(data):
    """Split a DataFrame into design matrix, label vector and initial theta.

    A constant 'Ones' column (the x0 = 1 bias feature) is prepended; every
    column except the last becomes a feature and the last column becomes the
    label.

    The input frame is left untouched: the 'Ones' column is added to a copy,
    so the same frame can be passed in again or reused for plotting.

    Returns:
        X: np.matrix with one row per sample and a leading column of ones.
        y: np.matrix column vector built from the last column of ``data``.
        theta: 1 x (number of columns of X) np.matrix of float zeros.
    """
    # DataFrame.insert works in place and raises if 'Ones' already exists,
    # so operate on a copy rather than on the caller's frame.
    frame = data.copy()
    frame.insert(0, 'Ones', 1)

    # Features are all columns but the last; the label is the last column.
    n_cols = frame.shape[1]
    X = np.matrix(frame.iloc[:, :n_cols - 1].values)
    y = np.matrix(frame.iloc[:, n_cols - 1:n_cols].values)

    # Initialise theta (not alpha) to zeros, one entry per column of X.
    theta = np.matrix(np.zeros((1, X.shape[1])))
    return X, y, theta
首先將 X 、y 分別轉化爲:
Ones Population
0 1 6.1101
1 1 5.5277
2 1 8.5186
3 1 7.0032
4 1 5.8598
.. ... ...
92 1 5.8707
93 1 5.3054
94 1 8.2934
95 1 13.3940
96 1 5.4369
[97 rows x 2 columns]
Profit
0 17.59200
1 9.13020
2 13.66200
3 11.85400
4 6.82330
.. ...
92 7.20290
93 1.98690
94 0.14454
95 9.05510
96 0.61705
[97 rows x 1 columns]
然後轉化爲 Numpy 矩陣,返回用於矩陣運算的 X 、y 、theta,可以查看三個矩陣的大小:
# Build the matrices and print the shapes of X, theta and y (in that order).
data = loadData('ex1data1.txt')
X, y, theta = initData(data)
print(*[matrix.shape for matrix in (X, theta, y)])
(97, 2) (1, 2) (97, 1)
Process finished with exit code 0
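根據如下公式計算 cost(其中 $m$ 爲樣本數,$h_\theta(x) = \theta^{T}x$):
$$J(\theta) = \frac{1}{2m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)^2$$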
def computeCost(X, y, theta):
    """Return the squared-error cost of linear regression for ``theta``.

    Computes sum((X @ theta.T - y) ** 2) / (2 * m) with m = len(X).

    Args:
        X: (m, n) design matrix (np.matrix or 2-D ndarray).
        y: (m, 1) labels.
        theta: (1, n) parameter row vector.
    """
    # '@' is a matrix product for both np.matrix and ndarray, whereas '*'
    # would be element-wise on plain ndarrays.
    residual = (X @ theta.T) - y
    return np.sum(np.power(residual, 2)) / (2 * len(X))
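根據如下公式執行梯度下降算法(所有 $\theta_j$ 同時更新,$\alpha$ 爲學習率):
$$\theta_j := \theta_j - \alpha\,\frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x_j^{(i)}$$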
def gradientDescent(X, y, theta, alpha, iters):
    """Run batch gradient descent for linear regression.

    Args:
        X: (m, n) design matrix.
        y: (m, 1) labels.
        theta: (1, n) initial parameters; the caller's object is not modified.
        alpha: learning rate.
        iters: number of update steps to perform.

    Returns:
        (theta, cost): the parameters after ``iters`` updates, and an array
        of length ``iters`` holding the cost after each update.
    """
    m = len(X)
    cost = np.zeros(iters)
    # Float copy: keeps the caller's theta intact and prevents an
    # integer-typed initial theta from truncating the updates.
    theta = theta.astype(float)

    for i in range(iters):
        error = (X @ theta.T) - y
        # error.T @ X holds sum_i(error_i * x_ij) for every j, so all
        # parameters are updated simultaneously from the same error vector.
        theta = theta - (alpha / m) * (error.T @ X)
        # Record the cost reached after this update.
        cost[i] = computeCost(X, y, theta)

    return theta, cost
設置 alpha 爲 0.01,iters 爲 1000,運行並繪圖:
# Learning rate and number of gradient-descent iterations.
alpha = 0.01
iters = 1000

# Train on the single-variable data set and print the fitted parameters.
data = loadData('ex1data1.txt')
X, y, theta = initData(data)
g, cost = gradientDescent(X, y, theta, alpha, iters)
print(g)

# Evaluate the fitted line g[0, 0] + g[0, 1] * x over the observed range.
x = np.linspace(data.Population.min(), data.Population.max(), 100)
f = g[0, 0] + (g[0, 1] * x)

# Draw the prediction line over the training samples.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(x, f, 'r', label='Prediction')
ax.scatter(data.Population, data.Profit, label='Training Data')
ax.legend(loc=2)
ax.set_xlabel('Population')
ax.set_ylabel('Profit')
ax.set_title('Predicted Profit vs. Population Size')
plt.show()
[[-3.24140214 1.1272942 ]]
Process finished with exit code 0
繪製 cost - iters 圖像:
# Re-train with the same settings, then plot the cost recorded per iteration.
alpha, iters = 0.01, 1000
data = loadData('ex1data1.txt')
X, y, theta = initData(data)
g, cost = gradientDescent(X, y, theta, alpha, iters)
print(g)

fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(np.arange(iters), cost, 'r')
ax.set(xlabel='Iterations', ylabel='Cost', title='Error vs. Training Epoch')
plt.show()
單變量回歸完整程序:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def loadData(firename):
    """Read a headerless CSV file into a DataFrame.

    The columns are labelled 'Population' and 'Profit', in that order.
    """
    column_names = ['Population', 'Profit']
    frame = pd.read_csv(firename, names=column_names, header=None)
    return frame
def showData(data):
    """Show a scatter plot of the 'Population' column against 'Profit'."""
    data.plot.scatter(x='Population', y='Profit', figsize=(12, 8))
    plt.show()
def initData(data):
    """Split a DataFrame into design matrix, label vector and initial theta.

    A constant 'Ones' column (the x0 = 1 bias feature) is prepended; every
    column except the last becomes a feature and the last column becomes the
    label.

    The input frame is left untouched: the 'Ones' column is added to a copy,
    so the same frame can be passed in again or reused for plotting.

    Returns:
        X: np.matrix with one row per sample and a leading column of ones.
        y: np.matrix column vector built from the last column of ``data``.
        theta: 1 x (number of columns of X) np.matrix of float zeros.
    """
    # DataFrame.insert works in place and raises if 'Ones' already exists,
    # so operate on a copy rather than on the caller's frame.
    frame = data.copy()
    frame.insert(0, 'Ones', 1)

    # Features are all columns but the last; the label is the last column.
    n_cols = frame.shape[1]
    X = np.matrix(frame.iloc[:, :n_cols - 1].values)
    y = np.matrix(frame.iloc[:, n_cols - 1:n_cols].values)

    # Initialise theta (not alpha) to zeros, one entry per column of X.
    theta = np.matrix(np.zeros((1, X.shape[1])))
    return X, y, theta
def computeCost(X, y, theta):
    """Return the squared-error cost of linear regression for ``theta``.

    Computes sum((X @ theta.T - y) ** 2) / (2 * m) with m = len(X).

    Args:
        X: (m, n) design matrix (np.matrix or 2-D ndarray).
        y: (m, 1) labels.
        theta: (1, n) parameter row vector.
    """
    # '@' is a matrix product for both np.matrix and ndarray, whereas '*'
    # would be element-wise on plain ndarrays.
    residual = (X @ theta.T) - y
    return np.sum(np.power(residual, 2)) / (2 * len(X))
def gradientDescent(X, y, theta, alpha, iters):
    """Run batch gradient descent for linear regression.

    Args:
        X: (m, n) design matrix.
        y: (m, 1) labels.
        theta: (1, n) initial parameters; the caller's object is not modified.
        alpha: learning rate.
        iters: number of update steps to perform.

    Returns:
        (theta, cost): the parameters after ``iters`` updates, and an array
        of length ``iters`` holding the cost after each update.
    """
    m = len(X)
    cost = np.zeros(iters)
    # Float copy: keeps the caller's theta intact and prevents an
    # integer-typed initial theta from truncating the updates.
    theta = theta.astype(float)

    for i in range(iters):
        error = (X @ theta.T) - y
        # error.T @ X holds sum_i(error_i * x_ij) for every j, so all
        # parameters are updated simultaneously from the same error vector.
        theta = theta - (alpha / m) * (error.T @ X)
        # Record the cost reached after this update.
        cost[i] = computeCost(X, y, theta)

    return theta, cost
def showRegression(data, g):
    """Plot the fitted line over the training samples and show the figure.

    Args:
        data: DataFrame with 'Population' and 'Profit' columns.
        g: fitted parameters; g[0, 0] is used as the intercept and g[0, 1]
            as the slope.
    """
    population = data.Population
    # Evaluate the line on 100 evenly spaced points across the observed range.
    xs = np.linspace(population.min(), population.max(), 100)
    prediction = g[0, 0] + (g[0, 1] * xs)

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.plot(xs, prediction, 'r', label='Prediction')
    ax.scatter(population, data.Profit, label='Training Data')
    ax.legend(loc=2)
    ax.set_xlabel('Population')
    ax.set_ylabel('Profit')
    ax.set_title('Predicted Profit vs. Population Size')
    plt.show()
def showCost_Iters(iters, cost):
    """Plot ``cost`` against the iteration index 0..iters-1 as a red line."""
    figure, axes = plt.subplots(figsize=(12, 8))
    iteration_index = np.arange(iters)
    axes.plot(iteration_index, cost, 'r')
    axes.set(
        xlabel='Iterations',
        ylabel='Cost',
        title='Error vs. Training Epoch',
    )
    plt.show()
# Learning rate and iteration count for gradient descent.
alpha, iters = 0.01, 1000

# Load, prepare, train, and print the fitted parameters.
data = loadData('ex1data1.txt')
X, y, theta = initData(data)
g, cost = gradientDescent(X, y, theta, alpha, iters)
print(g)
# Uncomment to display the fitted line and the cost curve.
#showRegression(data, g)
#showCost_Iters(iters, cost)
多變量回歸
問題描述:你的數據集中,x1 代表房屋面積、x2 代表臥室數量,y 代表售價。要求基於這個數據集訓練一個預測房價的模型。
對單變量回歸的完整程序稍作修改:數據列改爲 Size、Bedrooms、Price,在 initData 中加入特徵縮放(減去均值後除以標準差),並將 theta 擴展爲 3 個分量:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def loadData(firename):
    """Read a headerless CSV file into a DataFrame.

    The columns are labelled 'Size', 'Bedrooms' and 'Price', in that order.
    """
    column_names = ['Size', 'Bedrooms', 'Price']
    frame = pd.read_csv(firename, names=column_names, header=None)
    return frame
def initData(data):
    """Standardise the data and split it into X, y and an initial theta.

    Every column (features and label alike) is rescaled to
    (value - column mean) / column standard deviation. A constant 'Ones'
    column (the x0 = 1 bias feature) is then prepended; every column except
    the last becomes a feature and the last column becomes the label.

    Returns:
        X: np.matrix with one row per sample and a leading column of ones.
        y: np.matrix column vector built from the last (scaled) column.
        theta: 1 x (number of columns of X) np.matrix of float zeros.
    """
    # Feature scaling; this builds a new frame, so the caller's data is
    # not affected by the insert below.
    scaled = (data - data.mean()) / data.std()
    scaled.insert(0, 'Ones', 1)

    # Features are all columns but the last; the label is the last column.
    n_cols = scaled.shape[1]
    X = np.matrix(scaled.iloc[:, :n_cols - 1].values)
    y = np.matrix(scaled.iloc[:, n_cols - 1:n_cols].values)

    # Initialise theta (not alpha) to zeros, sized from X instead of being
    # hard-coded to three entries.
    theta = np.matrix(np.zeros((1, X.shape[1])))
    return X, y, theta
def computeCost(X, y, theta):
    """Return the squared-error cost of linear regression for ``theta``.

    Computes sum((X @ theta.T - y) ** 2) / (2 * m) with m = len(X).

    Args:
        X: (m, n) design matrix (np.matrix or 2-D ndarray).
        y: (m, 1) labels.
        theta: (1, n) parameter row vector.
    """
    # '@' is a matrix product for both np.matrix and ndarray, whereas '*'
    # would be element-wise on plain ndarrays.
    residual = (X @ theta.T) - y
    return np.sum(np.power(residual, 2)) / (2 * len(X))
def gradientDescent(X, y, theta, alpha, iters):
    """Run batch gradient descent for linear regression.

    Args:
        X: (m, n) design matrix.
        y: (m, 1) labels.
        theta: (1, n) initial parameters; the caller's object is not modified.
        alpha: learning rate.
        iters: number of update steps to perform.

    Returns:
        (theta, cost): the parameters after ``iters`` updates, and an array
        of length ``iters`` holding the cost after each update.
    """
    m = len(X)
    cost = np.zeros(iters)
    # Float copy: keeps the caller's theta intact and prevents an
    # integer-typed initial theta from truncating the updates.
    theta = theta.astype(float)

    for i in range(iters):
        error = (X @ theta.T) - y
        # error.T @ X holds sum_i(error_i * x_ij) for every j, so all
        # parameters are updated simultaneously from the same error vector.
        theta = theta - (alpha / m) * (error.T @ X)
        # Record the cost reached after this update.
        cost[i] = computeCost(X, y, theta)

    return theta, cost
def showCost_Iters(iters, cost):
    """Plot ``cost`` against the iteration index 0..iters-1 as a red line."""
    figure, axes = plt.subplots(figsize=(12, 8))
    iteration_index = np.arange(iters)
    axes.plot(iteration_index, cost, 'r')
    axes.set(
        xlabel='Iterations',
        ylabel='Cost',
        title='Error vs. Training Epoch',
    )
    plt.show()
# Learning rate and iteration count for gradient descent.
alpha, iters = 0.01, 1000

# Load the housing data, train, print the parameters and plot the cost curve.
data = loadData('ex1data2.txt')
X, y, theta = initData(data)
g, cost = gradientDescent(X, y, theta, alpha, iters)
print(g)
showCost_Iters(iters, cost)
[[-1.10868761e-16 8.78503652e-01 -4.69166570e-02]]
Process finished with exit code 0
使用 Sklearn 庫
以單變量回歸爲例
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
def loadData(firename):
    """Read a headerless CSV file into a DataFrame.

    The columns are labelled 'Population' and 'Profit', in that order.
    """
    column_names = ['Population', 'Profit']
    frame = pd.read_csv(firename, names=column_names, header=None)
    return frame
def initData(data):
    """Split a DataFrame into the design matrix X and the label vector y.

    A constant 'Ones' column (the x0 = 1 bias feature) is prepended; every
    column except the last becomes a feature and the last column becomes the
    label.

    The input frame is left untouched: the 'Ones' column is added to a copy,
    so the same frame can be passed in again or reused for plotting.

    Returns:
        X: np.matrix with one row per sample and a leading column of ones.
        y: np.matrix column vector built from the last column of ``data``.
    """
    # DataFrame.insert works in place and raises if 'Ones' already exists,
    # so operate on a copy rather than on the caller's frame.
    frame = data.copy()
    frame.insert(0, 'Ones', 1)

    # Features are all columns but the last; the label is the last column.
    n_cols = frame.shape[1]
    X = np.matrix(frame.iloc[:, :n_cols - 1].values)
    y = np.matrix(frame.iloc[:, n_cols - 1:n_cols].values)
    return X, y
def showRegression(X, model):
    """Plot the model's predictions for X over the training samples.

    Args:
        X: design matrix whose column 1 holds the population feature
            (column 0 is the constant ones column).
        model: fitted estimator exposing ``predict``.

    Note:
        The scatter points are read from the module-level ``data`` frame,
        not from an argument.
    """
    # scikit-learn rejects np.matrix input, so hand it a plain ndarray.
    features = np.asarray(X)
    xs = features[:, 1]
    prediction = model.predict(features).flatten()

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.plot(xs, prediction, 'r', label='Prediction')
    ax.scatter(data.Population, data.Profit, label='Training Data')
    ax.legend(loc=2)
    ax.set_xlabel('Population')
    ax.set_ylabel('Profit')
    ax.set_title('Predicted Profit vs. Population Size')
    plt.show()
# Load the data and build the design matrix and label vector.
data = loadData('ex1data1.txt')
X, y = initData(data)

# Fit an ordinary least-squares model. scikit-learn rejects np.matrix input,
# so the matrices are converted to plain ndarrays for fitting.
model = linear_model.LinearRegression()
model.fit(np.asarray(X), np.asarray(y))
showRegression(X, model)
正規方程法
以單變量回歸爲例
import numpy as np
import pandas as pd
def loadData(firename):
    """Read a headerless CSV file into a DataFrame.

    The columns are labelled 'Population' and 'Profit', in that order.
    """
    column_names = ['Population', 'Profit']
    frame = pd.read_csv(firename, names=column_names, header=None)
    return frame
def initData(data):
    """Split a DataFrame into the design matrix X and the label vector y.

    A constant 'Ones' column (the x0 = 1 bias feature) is prepended; every
    column except the last becomes a feature and the last column becomes the
    label.

    The input frame is left untouched: the 'Ones' column is added to a copy,
    so the same frame can be passed in again.

    Returns:
        X: np.matrix with one row per sample and a leading column of ones.
        y: np.matrix column vector built from the last column of ``data``.
    """
    # DataFrame.insert works in place and raises if 'Ones' already exists,
    # so operate on a copy rather than on the caller's frame.
    frame = data.copy()
    frame.insert(0, 'Ones', 1)

    # Features are all columns but the last; the label is the last column.
    n_cols = frame.shape[1]
    X = np.matrix(frame.iloc[:, :n_cols - 1].values)
    y = np.matrix(frame.iloc[:, n_cols - 1:n_cols].values)
    return X, y
def normalEqn(X, y):
    """Solve linear regression in closed form with the normal equation.

    Computes theta = pinv(X^T X) X^T y.

    Args:
        X: (m, n) design matrix.
        y: (m, 1) labels.

    Returns:
        theta as an (n, 1) column vector.
    """
    # The pseudo-inverse equals the ordinary inverse when X^T X is
    # invertible, and still yields a least-squares solution when it is
    # singular (e.g. duplicated features), where np.linalg.inv would raise.
    gram = X.T @ X
    theta = np.linalg.pinv(gram) @ X.T @ y
    return theta
# Load the data, build X and y, then solve for theta in closed form.
data = loadData('ex1data1.txt')
X, y = initData(data)

theta = normalEqn(X, y)
print(theta)
[[-3.89578088]
[ 1.19303364]]
Process finished with exit code 0