# Reference: "Machine Learning in Action" (機器學習實戰)
'''
@version: 0.0.1
@Author: tqrs
@dev: python3 vscode
@Date: 2019-11-06 20:00:17
@LastEditTime: 2019-11-07 19:33:18
@FilePath: \\機器學習實戰\\08-線性迴歸\\regression.py
@Description: 線性迴歸 (linear regression)
'''
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
def loadDataSet(fileName):
    """Load a tab-separated numeric data file.

    Each non-blank line is one sample. The number of feature columns is
    taken from the first non-blank line (all of its columns but the last);
    the last column of every line is the target value.

    Arguments:
        fileName -- path of the text file to read
    Returns:
        dataMat -- list of feature rows, each a list of floats
        labelMat -- list of target values (floats), aligned with dataMat
    Raises:
        ValueError -- if a field cannot be parsed as a float
        IndexError -- if a line has fewer columns than the feature count
    """
    dataMat = []
    labelMat = []
    numFeat = None
    # One managed handle replaces the two open() calls that were never closed.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank (e.g. trailing) line would otherwise crash float('').
                continue
            fields = stripped.split('\t')
            if numFeat is None:
                numFeat = len(fields) - 1
            if len(fields) < numFeat:
                raise IndexError(
                    "expected at least %d columns, got %d" %
                    (numFeat, len(fields)))
            dataMat.append([float(value) for value in fields[:numFeat]])
            labelMat.append(float(fields[-1]))
    return dataMat, labelMat
def plotDataSet():
    """Show a scatter plot of the ex0 sample data.

    Loads the data file through loadDataSet and plots the feature stored
    in column 1 of each row against the corresponding target value.
    """
    dataMat, labelMat = loadDataSet(r'.\08-線性迴歸\ex0.txt')
    # Column 1 is the plotted feature; labels are aligned with the rows.
    xcord = [row[1] for row in dataMat]
    ycord = list(labelMat[:len(dataMat)])
    ax = plt.figure().add_subplot(111)
    ax.scatter(xcord, ycord, s=20, c='blue', alpha=0.5)
    plt.title('DataSet')
    plt.xlabel('X')
    plt.show()
def standRegres(xArr, yArr):
    """Fit ordinary least squares with the normal equations.

    Computes ws = (X^T X)^-1 X^T y.

    Arguments:
        xArr -- samples as rows (array-like, m x n)
        yArr -- the m target values
    Returns:
        ws -- n x 1 np.matrix of regression coefficients, or None (after
              printing a message) when det(X^T X) is exactly zero
    """
    # np.mat was removed in NumPy 2.0; np.asmatrix is the same function
    # and exists in every NumPy release.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    xTx = xMat.T * xMat
    if np.linalg.det(xTx) == 0.0:
        print("這個矩陣不可逆")
        return None
    return xTx.I * (xMat.T * yMat)
def plotRegression():
    """Plot the ex0 data together with its least-squares regression line.

    Loads the data with loadDataSet, fits it with standRegres, then draws
    the fitted line (red) over a scatter of the samples (blue). Nothing is
    drawn when standRegres reports a singular system.
    """
    xArr, yArr = loadDataSet(r'.\08-線性迴歸\ex0.txt')
    ws = standRegres(xArr, yArr)
    if ws is None:
        # standRegres has already printed why no fit is available.
        return
    # np.mat was removed in NumPy 2.0; np.asmatrix is the same function.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr)
    # Reorder whole rows by the plotted feature. Sorting every column
    # independently only keeps rows intact when the other columns are
    # constant.
    order = xMat[:, 1].A1.argsort()
    xSorted = xMat[order]
    yHat = xSorted * ws
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(xSorted[:, 1].A1, yHat.A1, c='red')
    ax.scatter(xMat[:, 1].A1, yMat.A1, s=20, c='blue', alpha=.5)
    plt.title('DataSet')
    plt.xlabel('X')
    plt.show()
def lwlr(testPoint, xArr, yArr, k=1.0):
    """Predict one point with locally weighted linear regression.

    Every training sample gets a Gaussian-kernel weight
    exp(-|x_j - testPoint|^2 / (2 k^2)), and a weighted least-squares fit
    is solved for this single test point, so the whole training set is
    processed on every call.

    Arguments:
        testPoint -- the sample to predict (length-n sequence or 1 x n row)
        xArr -- training samples as rows (m x n)
        yArr -- the m training targets
    Keyword Arguments:
        k {float} -- bandwidth of the Gaussian kernel (default: {1.0})
    Returns:
        1 x 1 np.matrix holding the prediction testPoint * ws, or None
        (after printing a message) when the weighted X^T W X has a
        determinant of exactly zero
    """
    # np.mat was removed in NumPy 2.0; np.asmatrix is the same function.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    testMat = np.asmatrix(testPoint)
    # Squared distance of every training row to the test point, computed
    # in one vectorised pass instead of a per-row Python loop.
    diff = np.asarray(xMat - testMat)
    sqDist = np.sum(diff * diff, axis=1)
    # Keep the diagonal of W as an m x 1 column; scaling rows by it equals
    # multiplying by the dense m x m diagonal matrix without building it.
    weights = np.asmatrix(np.exp(sqDist / (-2.0 * k**2))).T
    xTx = xMat.T * np.multiply(xMat, weights)
    if np.linalg.det(xTx) == 0.0:
        print("這個矩陣不可逆")
        return None
    ws = xTx.I * (xMat.T * np.multiply(yMat, weights))
    return testMat * ws
def lwlrTest(testArr, xArr, yArr, k=1.0):
    """Run lwlr() for every row of a test set.

    Arguments:
        testArr -- test samples as rows
        xArr -- training samples as rows
        yArr -- training targets
    Keyword Arguments:
        k {float} -- bandwidth of the Gaussian kernel (default: {1.0})
    Returns:
        yHat -- 1-D float array with one prediction per test row; an entry
                is NaN when lwlr() could not produce a prediction
    """
    m = np.shape(testArr)[0]
    yHat = np.zeros(m)
    for i in range(m):
        pred = lwlr(testArr[i], xArr, yArr, k)
        # lwlr returns a 1 x 1 matrix (or None for a singular system);
        # extract the scalar explicitly instead of relying on NumPy's
        # deprecated implicit array-to-scalar conversion.
        yHat[i] = np.nan if pred is None else np.asarray(pred).item()
    return yHat
def plotlwlrRegression():
    """Plot locally weighted regression curves for three kernel widths.

    Draws three stacked panels (k = 1.0, 0.01, 0.003). Each shows the ex0
    samples as a blue scatter and the lwlrTest predictions, ordered by the
    plotted feature, as a red line.
    """
    font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14)
    xArr, yArr = loadDataSet(r'.\08-線性迴歸\ex0.txt')
    # np.mat was removed in NumPy 2.0; np.asmatrix is the same function.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr)
    # Row order that sorts the samples by the plotted feature (column 1).
    order = xMat[:, 1].A1.argsort()
    xSorted = xMat[order][:, 1].A1
    xPoints = xMat[:, 1].A1
    yPoints = yMat.A1
    panels = (
        (1.0, u'局部加權迴歸曲線,k=1.0'),
        (0.01, u'局部加權迴歸曲線,k=0.01'),
        (0.003, u'局部加權迴歸曲線,k=0.003'),
    )
    fig, axs = plt.subplots(nrows=3,
                            ncols=1,
                            sharex=False,
                            sharey=False,
                            figsize=(10, 8))
    for ax, (k, title) in zip(axs, panels):
        yHat = lwlrTest(xArr, xArr, yArr, k)
        ax.plot(xSorted, yHat[order], c='red')
        ax.scatter(xPoints, yPoints, s=20, c='blue', alpha=.5)
        # The keyword must be lower-case: current Matplotlib no longer
        # normalises mixed-case property names such as "FontProperties".
        title_text = ax.set_title(title, fontproperties=font)
        plt.setp(title_text, size=8, weight='bold', color='red')
    plt.xlabel('X')
    plt.show()
def lwlrTestPlot(xArr, yArr, k=1.0):
    """Evaluate lwlr() on a column-sorted copy of the training samples.

    Arguments:
        xArr -- training samples as rows
        yArr -- training targets
    Keyword Arguments:
        k {float} -- bandwidth of the Gaussian kernel (default: {1.0})
    Returns:
        yHat -- array shaped like yArr with one prediction per sorted row
                (NaN where lwlr() could not produce a prediction)
        xCopy -- np.matrix copy of xArr with every column sorted ascending
    """
    yHat = np.zeros(np.shape(yArr))
    # np.matrix() always copies, so the in-place sort below cannot reorder
    # the caller's data (np.mat/np.asmatrix would share memory with an
    # ndarray input, and np.mat no longer exists in NumPy 2.0).
    xCopy = np.matrix(xArr)
    # NOTE(review): sort(0) orders each column independently, so rows stay
    # intact only when the other columns are constant - confirm intent.
    xCopy.sort(0)
    for i in range(np.shape(xArr)[0]):
        pred = lwlr(xCopy[i], xArr, yArr, k)
        # Extract the scalar from the 1 x 1 result explicitly.
        yHat[i] = np.nan if pred is None else np.asarray(pred).item()
    return yHat, xCopy
def rssError(yArr, yHatArr):
    """Return the residual sum of squares between targets and predictions.

    Arguments:
        yArr -- true values (array supporting element-wise arithmetic)
        yHatArr -- predicted values, broadcast-compatible with yArr
    Returns:
        the sum over all elements of (yArr - yHatArr) squared
    """
    residuals = yArr - yHatArr
    squared = residuals**2
    return squared.sum()
def ridgeRegres(xMat, yMat, lam=0.2):
    """Solve ridge regression: ws = (X^T X + lam * I)^-1 X^T y.

    Arguments:
        xMat -- data matrix (m x n np.matrix)
        yMat -- target column (m x 1 np.matrix)
    Keyword Arguments:
        lam {float} -- shrinkage coefficient added on the diagonal
                       (default: {0.2})
    Returns:
        ws -- n x 1 matrix of regression coefficients, or None (after
              printing a message) when the regularised matrix has a
              determinant of exactly zero
    """
    numFeatures = np.shape(xMat)[1]
    penalty = np.eye(numFeatures) * lam
    denom = xMat.T * xMat + penalty
    if np.linalg.det(denom) != 0.0:
        return denom.I * (xMat.T * yMat)
    print("不可逆矩陣")
    return None
def ridgeTest(xArr, yArr):
    """Fit ridge regression over a sweep of 30 lambda values.

    The targets are centred, and every feature is centred and divided by
    its variance, before fitting. ridgeRegres is then run for
    lambda = exp(i - 10) with i = 0..29.

    Arguments:
        xArr -- samples as rows (m x n)
        yArr -- the m target values
    Returns:
        wMat -- 30 x n array; row i holds the coefficients for
                lambda = exp(i - 10)
    """
    # np.mat was removed in NumPy 2.0; np.asmatrix is the same function.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    yMean = np.mean(yMat, 0)
    yMat = yMat - yMean
    xMeans = np.mean(xMat, 0)
    xVar = np.var(xMat, 0)
    # Scaling uses the variance (not the standard deviation), as written.
    xMat = (xMat - xMeans) / xVar
    numTestPts = 30
    wMat = np.zeros((numTestPts, np.shape(xMat)[1]))
    for i in range(numTestPts):
        ws = ridgeRegres(xMat, yMat, np.exp(i - 10))
        wMat[i, :] = ws.T
    return wMat
def plotwMat():
    """Plot the ridge coefficients returned by ridgeTest for abalone.txt.

    Each line in the figure is one feature's coefficient across the rows
    of the matrix returned by ridgeTest.
    """
    font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14)
    abX, abY = loadDataSet(r'.\08-線性迴歸\abalone.txt')
    redgeWeights = ridgeTest(abX, abY)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(redgeWeights)
    # The keyword must be lower-case: current Matplotlib no longer
    # normalises mixed-case property names such as "FontProperties".
    ax_title_text = ax.set_title(u'log(lambada)與迴歸係數的關係',
                                 fontproperties=font)
    ax_xlabel_text = ax.set_xlabel(u'log(lambada)', fontproperties=font)
    ax_ylabel_text = ax.set_ylabel(u'迴歸係數', fontproperties=font)
    plt.setp(ax_title_text, size=20, weight='bold', color='red')
    plt.setp(ax_xlabel_text, size=10, weight='bold', color='black')
    plt.setp(ax_ylabel_text, size=10, weight='bold', color='black')
    plt.show()
def regularize(xMat, yMat):
    """Standardise features and centre targets without touching the inputs.

    Arguments:
        xMat -- feature data (samples as rows); must provide .copy()
        yMat -- target data; must provide .copy()
    Returns:
        inxMat -- features with the column mean removed, divided by the
                  column variance
        inyMat -- targets with the column mean removed
    """
    features = xMat.copy()
    targets = yMat.copy()
    targets = targets - np.mean(targets, 0)
    featMeans = np.mean(features, 0)
    featVars = np.var(features, 0)
    # Scaling uses the variance (not the standard deviation), as written.
    return (features - featMeans) / featVars, targets
def stageWise(xArr, yArr, eps=0.01, numIt=100):
    """Forward stagewise linear regression.

    The data is standardised with regularize(). Then, on every iteration:
        start with the lowest error at +infinity
        for every feature, and for a step down and a step up:
            change that one coefficient by eps to get a candidate W
            compute the squared error of the candidate
            keep the candidate if its error is the lowest so far
        adopt the best candidate as the new W

    Arguments:
        xArr -- samples as rows (m x n)
        yArr -- the m target values
    Keyword Arguments:
        eps {float} -- step applied to one coefficient per iteration
                       (default: {0.01})
        numIt {int} -- number of iterations (default: {100})
    Returns:
        returnMat -- numIt x n array; row i is the coefficient vector
                     after iteration i
    """
    # np.mat was removed in NumPy 2.0; np.asmatrix is the same function.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    # regularize() takes both matrices and returns both; it also centres
    # y, so no separate centring step is needed here.
    xMat, yMat = regularize(xMat, yMat)
    n = np.shape(xMat)[1]
    returnMat = np.zeros((numIt, n))
    ws = np.zeros((n, 1))
    wsMax = ws.copy()
    for i in range(numIt):
        # Progress trace: coefficients at the start of this iteration.
        print(ws.T)
        lowestError = np.inf
        for j in range(n):
            for sign in (-1, 1):
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                yTest = xMat * wsTest
                rssE = rssError(yMat.A, yTest.A)
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i, :] = ws.T
    return returnMat
from bs4 import BeautifulSoup
from time import sleep
import json
import urllib.request
def scrapePage(retX, retY, inFile, yr, numPce, origPrc):
    """Parse one saved HTML results page and append its sold listings.

    Walks the <table> elements whose ``r`` attribute is "1", "2", ...
    until one is missing. For each listing that was sold for more than
    half of the original price, a feature row and the selling price are
    appended to the output lists and the row is printed.

    Arguments:
        retX -- list extended in place with [yr, numPce, newFlag, origPrc]
        retY -- list extended in place with the selling price
        inFile -- path of the HTML file (read as UTF-8)
        yr -- release year of the set
        numPce -- number of pieces in the set
        origPrc -- original retail price
    """
    with open(inFile, encoding='utf-8') as f:
        html = f.read()
    # Name the parser explicitly: otherwise Beautiful Soup picks whichever
    # parser is installed (results can differ per machine) and warns.
    soup = BeautifulSoup(html, 'html.parser')
    i = 1
    currentRow = soup.find_all('table', r="%d" % i)
    while currentRow:
        title = currentRow[0].find_all('a')[1].text
        lwrTitle = title.lower()
        # A listing counts as new when the title mentions "new" or "nisb".
        newFlag = 1.0 if ('new' in lwrTitle or 'nisb' in lwrTitle) else 0.0
        # The fourth cell contains <span> elements only for sold items.
        soldUnicde = currentRow[0].find_all('td')[3].find_all('span')
        if not soldUnicde:
            print("商品 #%d 沒有出售" % i)
        else:
            soldPrice = currentRow[0].find_all('td')[4]
            priceStr = soldPrice.text
            priceStr = priceStr.replace('$', '')
            priceStr = priceStr.replace(',', '')
            # A cell with more than one child also carries shipping text.
            if len(soldPrice) > 1:
                priceStr = priceStr.replace('Free shipping', '')
            sellingPrice = float(priceStr)
            # Drop listings sold for half the original price or less.
            if sellingPrice > origPrc * 0.5:
                print("%d\t%d\t%d\t%f\t%f" %
                      (yr, numPce, newFlag, origPrc, sellingPrice))
                retX.append([yr, numPce, newFlag, origPrc])
                retY.append(sellingPrice)
        i += 1
        currentRow = soup.find_all('table', r="%d" % i)
def setDataCollect(retX, retY):
    """Fill retX / retY from the six saved LEGO set pages.

    Calls scrapePage once per page, in a fixed order, with that set's
    release year, piece count and original price.

    Arguments:
        retX -- list that receives the feature rows
        retY -- list that receives the selling prices
    """
    # (HTML file, release year, piece count, original price)
    legoSets = (
        (r'08-線性迴歸\setHtml\lego8288.html', 2006, 800, 49.99),
        (r'08-線性迴歸\setHtml\lego10030.html', 2002, 3096, 269.99),
        (r'08-線性迴歸\setHtml\lego10179.html', 2007, 5195, 499.99),
        (r'08-線性迴歸\setHtml\lego10181.html', 2007, 3428, 199.99),
        (r'08-線性迴歸\setHtml\lego10189.html', 2008, 5922, 299.99),
        (r'08-線性迴歸\setHtml\lego10196.html', 2009, 3263, 249.99),
    )
    for htmlFile, year, pieces, price in legoSets:
        scrapePage(retX, retY, htmlFile, year, pieces, price)
def crossValidation(xArr, yArr, numVal=10):
    """Cross-validate ridge regression and print the best model.

    For each of numVal rounds the samples are shuffled, the first 90% are
    used to fit ridgeTest (30 lambda values) and the rest to measure the
    squared error of each lambda. The lambda with the lowest mean error
    is selected, its coefficients are converted back to the original
    feature scale, and the resulting formula is printed.

    Arguments:
        xArr -- samples as rows (the printed formula expects 4 features)
        yArr -- target values
    Keyword Arguments:
        numVal {int} -- number of cross-validation rounds (default: {10})
    Returns:
        None -- the fitted formula is printed, not returned
    """
    m = len(yArr)
    indexList = list(range(m))
    numLambdas = 30  # number of lambda values ridgeTest evaluates
    errorMat = np.zeros((numVal, numLambdas))
    # Count of indices j with j < m * 0.9, i.e. the training-set size.
    numTrain = int(np.ceil(m * 0.9))
    for i in range(numVal):
        np.random.shuffle(indexList)
        trainIdx = indexList[:numTrain]
        testIdx = indexList[numTrain:]
        trainX = [xArr[j] for j in trainIdx]
        trainY = [yArr[j] for j in trainIdx]
        testX = [xArr[j] for j in testIdx]
        testY = [yArr[j] for j in testIdx]
        wMat = ridgeTest(trainX, trainY)
        # Standardise the test set with the training statistics once per
        # round; none of this depends on the lambda index.
        # np.mat was removed in NumPy 2.0; np.asmatrix is the same function.
        matTrainX = np.asmatrix(trainX)
        meanTrain = np.mean(matTrainX, 0)
        varTrain = np.var(matTrainX, 0)
        matTestX = (np.asmatrix(testX) - meanTrain) / varTrain
        trainYMean = np.mean(trainY)
        testYArr = np.array(testY)
        for k in range(numLambdas):
            yEst = matTestX * np.asmatrix(wMat[k, :]).T + trainYMean
            errorMat[i, k] = rssError(yEst.T.A, testYArr)
    meanErrors = np.mean(errorMat, 0)
    # wMat here is the coefficient matrix of the last round only.
    bestWeights = wMat[np.argmin(meanErrors)]
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    meanX = np.mean(xMat, 0)
    varX = np.var(xMat, 0)
    # Undo the divide-by-variance scaling applied inside ridgeTest.
    unReg = bestWeights / varX
    print('%f%+f*年份%+f*部件數量%+f*是否爲全新%+f*原價' %
          ((-1 * np.sum(np.multiply(meanX, unReg)) + np.mean(yMat)),
           unReg[0, 0], unReg[0, 1], unReg[0, 2], unReg[0, 3]))
def useStandRegres():
    """Fit plain linear regression to the scraped LEGO data and print it.

    Collects the data with setDataCollect, prepends a column of ones for
    the intercept, fits with standRegres and prints the formula. Nothing
    is printed when standRegres reports a singular system.
    """
    lgX = []
    lgY = []
    setDataCollect(lgX, lgY)
    data_num, features_num = np.shape(lgX)
    # np.mat was removed in NumPy 2.0; np.asmatrix is the same function.
    lgX1 = np.asmatrix(np.ones((data_num, features_num + 1)))
    # Column 0 stays 1.0 (intercept); the features fill the rest.
    lgX1[:, 1:] = np.asmatrix(lgX)
    ws = standRegres(lgX1, lgY)
    if ws is None:
        # standRegres has already printed why no fit is available.
        return
    # Index scalars explicitly; formatting 1 x 1 matrices with %f relies
    # on NumPy's deprecated implicit array-to-scalar conversion.
    print('%f%+f*年份%+f*部件數量%+f*是否爲全新%+f*原價' %
          (ws[0, 0], ws[1, 0], ws[2, 0], ws[3, 0], ws[4, 0]))
def usesklearn():
    """Fit scikit-learn's Ridge (alpha=0.5) to the scraped LEGO data.

    Collects the data with setDataCollect, fits the model and prints the
    intercept followed by the four feature coefficients.
    """
    from sklearn import linear_model
    lgX, lgY = [], []
    setDataCollect(lgX, lgY)
    reg = linear_model.Ridge(alpha=.5)
    reg.fit(lgX, lgY)
    coefs = reg.coef_
    terms = (reg.intercept_, coefs[0], coefs[1], coefs[2], coefs[3])
    print('%f%+f*年份%+f*部件數量%+f*是否爲全新%+f*原價' % terms)
# Script entry point: run the scikit-learn ridge demo when executed directly.
if __name__ == "__main__":
    usesklearn()