#coding=utf-8
#数据导入函数
from numpy import *
def loadDataSet(fileName):
    """Load a tab-separated numeric text file into feature rows and labels.

    The number of feature columns is taken from the first line of the file
    (its tab-separated field count minus one). For every non-blank line,
    that many leading fields are converted to float and stored as one row,
    and the last field of the line is stored as the label.

    Args:
        fileName: path of the text file to read.

    Returns:
        A tuple ``(dataMat, labelMat)``: a list of feature rows (lists of
        floats) and a list of float labels of the same length. Both are
        empty for an empty file.

    Raises:
        ValueError: if a field cannot be converted to float.
        IndexError: if a line has fewer fields than the first line.
    """
    # Read inside a context manager so the handle is always closed; the
    # file used to be opened twice and neither handle was ever closed.
    with open(fileName) as fr:
        lines = fr.readlines()
    if not lines:
        return [], []

    numFeat = len(lines[0].split('\t')) - 1
    dataMat = []
    labelMat = []
    for line in lines:
        stripped = line.strip()
        if not stripped:
            # Blank lines (e.g. trailing newlines) carry no sample.
            continue
        curLine = stripped.split('\t')
        dataMat.append([float(curLine[i]) for i in range(numFeat)])
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat
# Standard (ordinary least-squares) linear regression
def standRegress(xArr, yArr):
    """Fit ordinary least squares with the normal equations.

    Computes ``ws = (X^T X)^-1 X^T y``.

    Args:
        xArr: 2-D array-like of samples, one row per sample.
        yArr: 1-D array-like of target values, one per sample.

    Returns:
        The coefficient column vector as an ``n x 1`` numpy matrix, or
        ``None`` (after printing a message) when the determinant of
        ``X^T X`` is exactly zero.
    """
    # asmatrix is what the old `mat` alias pointed to; `mat` itself is no
    # longer exported by NumPy 2.x.
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    xTx = xMat.T * xMat
    if linalg.det(xTx) == 0.0:
        print("this matrix is singular, cannot do inverse")
        return None
    return xTx.I * (xMat.T * yMat)
# Locally weighted linear regression: predict y at a single query point
def lwlr(testPoint, xArr, yArr, k=1.0):
    """Predict the target at ``testPoint`` with locally weighted regression.

    Each training sample ``x_j`` gets the Gaussian kernel weight
    ``exp(-|testPoint - x_j|^2 / (2 * k^2))``; a weighted least-squares
    fit ``ws = (X^T W X)^-1 X^T W y`` is then evaluated at ``testPoint``.

    Args:
        testPoint: 1-D array-like query point with the same number of
            features as the rows of ``xArr``.
        xArr: 2-D array-like of training samples, one row per sample.
        yArr: 1-D array-like of training targets.
        k: kernel bandwidth; smaller values weight nearby samples more.

    Returns:
        ``testPoint * ws`` (a 1x1 numpy matrix when ``testPoint`` is a
        1-D sequence), or ``None`` (after printing a message) when the
        determinant of ``X^T W X`` is exactly zero.
    """
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    m = shape(xMat)[0]
    weights = asmatrix(eye(m))  # diagonal weight matrix, filled below
    for j in range(m):
        diffMat = testPoint - xMat[j, :]
        # Squared distance as a plain scalar. The division by -2k^2 must
        # happen INSIDE exp(); dividing the exponential instead yields
        # negative weights that grow with distance.
        sqDist = (diffMat * diffMat.T)[0, 0]
        weights[j, j] = exp(sqDist / (-2.0 * k ** 2))
    xTx = xMat.T * (weights * xMat)
    if linalg.det(xTx) == 0.0:
        print("this matrix is singular, cannot do inverse")
        return None
    ws = xTx.I * (xMat.T * (weights * yMat))
    return testPoint * ws
# Test helper: call lwlr once for every query point
def lwlrTest(testArr, xArr, yArr, k=1.0):
    """Run ``lwlr`` on every row of ``testArr`` and collect the predictions.

    Args:
        testArr: 2-D array-like of query points, one row per point.
        xArr: 2-D array-like of training samples.
        yArr: 1-D array-like of training targets.
        k: kernel bandwidth forwarded to ``lwlr``.

    Returns:
        1-D float array with one prediction per query point. An entry is
        NaN when ``lwlr`` returned ``None`` for that point.
    """
    m = shape(testArr)[0]
    yHat = zeros(m)
    for i in range(m):
        pred = lwlr(testArr[i], xArr, yArr, k)
        if pred is None:
            # lwlr signals a singular system with None; a float array
            # cannot hold None, so record the failure as NaN.
            yHat[i] = float('nan')
        else:
            # Pull the single value out explicitly instead of relying on
            # implicit conversion of a 1x1 matrix to a scalar.
            yHat[i] = asarray(pred).item()
    return yHat
def rssError(yArr, yHatArr):
    """Return the residual sum of squares between targets and predictions.

    Args:
        yArr: array-like of observed values.
        yHatArr: array-like of predicted values, broadcastable against
            ``yArr``.

    Returns:
        The sum of the element-wise squared differences.
    """
    # Coerce to plain ndarrays so lists are accepted and so that `** 2`
    # is always element-wise (on np.matrix it would be a matrix power).
    residual = asarray(yArr) - asarray(yHatArr)
    return (residual ** 2).sum()
# Ridge regression: compute the regression coefficients
def ridgeRegress(xMat, yMat, lam=0.2):
    """Solve ridge regression ``ws = (X^T X + lam * I)^-1 X^T y``.

    Args:
        xMat: 2-D matrix or array-like of samples, one row per sample.
        yMat: 2-D column (``m x 1``) matrix or array-like of targets.
        lam: ridge penalty added to the diagonal of ``X^T X``.

    Returns:
        The coefficient column vector as an ``n x 1`` numpy matrix, or
        ``None`` (after printing a message) when the determinant of the
        penalised matrix is exactly zero.
    """
    # Coerce so `*` is a matrix product even when plain ndarrays are
    # passed; asmatrix leaves existing matrices untouched.
    xMat = asmatrix(xMat)
    yMat = asmatrix(yMat)
    denom = xMat.T * xMat + eye(shape(xMat)[1]) * lam
    if linalg.det(denom) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return None
    return denom.I * (xMat.T * yMat)
# Evaluate ridge regression over a range of lambda values
def ridgeTest(xArr, yArr):
    """Compute ridge coefficients for 30 exponentially spaced penalties.

    The targets are centred on their mean; every feature column is
    centred on its mean and divided by its variance. ``ridgeRegress`` is
    then called with ``lam = exp(i - 10)`` for ``i = 0 .. 29``.

    Args:
        xArr: 2-D array-like of samples, one row per sample.
        yArr: 1-D array-like of target values.

    Returns:
        A ``30 x n`` float array; row ``i`` holds the coefficients
        obtained with ``lam = exp(i - 10)``.
    """
    # asmatrix replaces the `mat` alias, which NumPy 2.x no longer exports.
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    yMat = yMat - mean(yMat, 0)                    # centre targets
    xMat = (xMat - mean(xMat, 0)) / var(xMat, 0)   # column-wise scaling
    numTestPts = 30
    wMat = zeros((numTestPts, shape(xMat)[1]))
    for i in range(numTestPts):
        ws = ridgeRegress(xMat, yMat, exp(i - 10))
        wMat[i, :] = ws.T
    return wMat
# Data standardisation helper
def regularize(xMat):
    """Return a column-wise rescaled copy of ``xMat``.

    Each column has its mean subtracted and is then divided by its
    variance. The input is not modified.
    """
    centered = xMat.copy() - mean(xMat, 0)
    return centered / var(xMat, 0)
# Forward stagewise linear regression
def stageWise(xArr, yArr, eps=0.01, numIt=100):
    """Greedy forward stagewise regression.

    The targets are centred and the features are rescaled with
    ``regularize``. Starting from all-zero weights, every iteration tries
    changing each single weight by ``+eps`` and ``-eps`` and keeps the one
    change that gives the lowest residual sum of squares. The current
    weights are printed at the start of every iteration.

    Args:
        xArr: 2-D array-like of samples, one row per sample.
        yArr: 1-D array-like of target values.
        eps: step size applied to one weight per iteration.
        numIt: number of iterations.

    Returns:
        A ``numIt x n`` float array; row ``i`` holds the weights after
        iteration ``i``.
    """
    # asmatrix replaces the `mat` alias, which NumPy 2.x no longer exports.
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    yMat = yMat - mean(yMat, 0)
    xMat = regularize(xMat)
    n = shape(xMat)[1]
    returnMat = zeros((numIt, n))
    ws = zeros((n, 1))
    wsMax = ws.copy()
    for i in range(numIt):
        print(ws.T)
        lowestError = inf  # best squared error seen in this iteration
        for j in range(n):
            for sign in (-1, 1):  # try increasing and decreasing weight j
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                yTest = xMat * wsTest
                rssE = rssError(yMat.A, yTest.A)
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i, :] = ws.T
    return returnMat
#购物信息获取函数
from time import sleep
import json
import urllib2
from time import sleep
import json
import urllib2
def searchForSet(retX, retY, setNum, yr, numPce, origPrc):
    """Query the shopping search endpoint for one LEGO set and collect prices.

    Sleeps 10 seconds, fetches the JSON search result for ``setNum`` and,
    for every inventory entry whose price is above half of ``origPrc``,
    prints a tab-separated record and appends
    ``[yr, numPce, newFlag, origPrc]`` to ``retX`` and the selling price to
    ``retY``. ``newFlag`` is 1 when the product condition is ``'new'``,
    otherwise 0.

    Args:
        retX: list extended in place with feature rows.
        retY: list extended in place with selling prices.
        setNum: LEGO set number used in the search query.
        yr: release year stored in each feature row.
        numPce: piece count stored in each feature row.
        origPrc: original price; also sets the price threshold.

    Returns:
        None. Results are delivered through ``retX`` and ``retY``.
    """
    sleep(10)
    # NOTE(security): the API key is hard-coded in source; consider loading
    # it from configuration or the environment instead.
    myAPIstr = 'AIzaSyD2cR2KFyx12hXu6PFU-wrWot3NXvko8vY'
    searchURL = 'https://www.googleapis.com/shopping/search/v1/public/products?key=%s&country=US&q=lego+%d&alt=json' % (
        myAPIstr, setNum)
    pg = urllib2.urlopen(searchURL)
    try:
        retDict = json.loads(pg.read())
    finally:
        # Always release the HTTP response, even if parsing fails.
        pg.close()
    # A response without an 'items' key is treated as "no results".
    for i, currItem in enumerate(retDict.get('items', [])):
        try:
            newFlag = 1 if currItem['product']['condition'] == 'new' else 0
            for item in currItem['product']['inventories']:
                sellingPrice = item['price']
                if sellingPrice > origPrc * 0.5:
                    print("%d\t%d\t%d\t%f\t%f" % (yr, numPce, newFlag, origPrc, sellingPrice))
                    retX.append([yr, numPce, newFlag, origPrc])
                    retY.append(sellingPrice)
        except (KeyError, TypeError, ValueError):
            # Best effort: skip malformed items, but no longer swallow
            # KeyboardInterrupt/SystemExit the way a bare except did.
            print('problem with item %d' % i)
def setDataCollect(retX, retY):
    """Collect price data for a fixed list of LEGO sets.

    Calls ``searchForSet`` once per set, in order; results accumulate in
    ``retX`` and ``retY``.
    """
    # (set number, release year, piece count, original price)
    legoSets = (
        (8288, 2006, 800, 49.99),
        (10030, 2002, 3096, 269.99),
        (10179, 2007, 5195, 499.99),
        (10181, 2007, 3428, 199.99),
        (10189, 2008, 5922, 299.99),
        (10196, 2009, 3263, 249.99),
    )
    for setNum, yr, numPce, origPrc in legoSets:
        searchForSet(retX, retY, setNum, yr, numPce, origPrc)
def crossValidation(xArr, yArr, numVal=10):
    """Cross-validate ridge regression and print the best model found.

    Runs ``numVal`` random splits. In each split the first 90% of the
    shuffled indices form the training set and the rest the test set;
    ``ridgeTest`` produces one weight vector per penalty value, and the
    residual sum of squares of each on the test set is recorded. The
    penalty with the lowest error averaged over all splits is selected,
    and its weights are rescaled to the original feature units and
    printed together with the constant term.

    Args:
        xArr: 2-D list of samples, one row per sample.
        yArr: list of target values, one per sample.
        numVal: number of random train/test splits.

    Returns:
        None. The selected model is printed.
    """
    numLam = 30  # number of weight vectors (rows) returned by ridgeTest
    m = len(yArr)
    # Must be a real list: shuffle works in place and a Python 3 range
    # object cannot be shuffled.
    indexList = list(range(m))
    errorMat = zeros((numVal, numLam))
    for i in range(numVal):
        random.shuffle(indexList)
        # First 90% of the shuffled indices train, the remainder tests.
        trainIdx = [idx for pos, idx in enumerate(indexList) if pos < m * 0.9]
        testIdx = [idx for pos, idx in enumerate(indexList) if pos >= m * 0.9]
        trainX = [xArr[idx] for idx in trainIdx]
        trainY = [yArr[idx] for idx in trainIdx]
        testX = [xArr[idx] for idx in testIdx]
        testY = [yArr[idx] for idx in testIdx]
        wMat = ridgeTest(trainX, trainY)  # one weight vector per penalty

        # Scale the test set with the TRAINING mean/variance. This does
        # not depend on the penalty, so it is done once per split.
        matTrainX = asmatrix(trainX)
        meanTrain = mean(matTrainX, 0)
        varTrain = var(matTrainX, 0)
        matTestX = (asmatrix(testX) - meanTrain) / varTrain
        meanTrainY = mean(trainY)
        testYArr = array(testY)
        for k in range(numLam):
            yEst = matTestX * asmatrix(wMat[k, :]).T + meanTrainY
            errorMat[i, k] = rssError(yEst.T.A, testYArr)

    meanErrors = mean(errorMat, 0)  # average error of each penalty value
    minMean = float(meanErrors.min())
    # NOTE: wMat here is the weight matrix of the last split only.
    bestWeights = wMat[nonzero(meanErrors == minMean)]
    # Undo the scaling: with Xreg = (x - meanX) / var(x) the model in the
    # original units is x * w / var(x) - meanX * w / var(x) + meanY.
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    meanX = mean(xMat, 0)
    varX = var(xMat, 0)
    unReg = bestWeights / varX
    constTerm = -1 * multiply(meanX, unReg).sum() + mean(yMat)
    # Single-argument print calls produce the same text as the former
    # Python 2 `print a, b` statements and also run on Python 3.
    print("the best model from Ridge Regression is:\n" + str(unReg))
    print(" ".join(["with constant term: ", str(constTerm)]))
# 总结
# - 与分类一样,回归也是预测目标值的过程。回归与分类的不同点在于,前者预测连续型变量,后者预测离散型变量。
# - 当数据的样本数比特征数还少时,矩阵xTx的逆不能直接计算。即便当样本数比特征数多时,xTx的逆仍然可能无法直接计算,这是因为特征有可能高度相关。这时可以考虑使用岭回归,因为当xTx的逆不能计算时,它仍能保证求得回归系数。
# - 岭回归是缩减法的一种,相当于对回归系数的大小施加限制。另一种很好的缩减法是lasso。Lasso难以求解,但可以使用计算简便的逐步线性回归求得近似结果。
# - 缩减法还可以看做是对一个模型增加偏差的同时减少方差。偏差方差折中是一个重要的概念,可以帮助我们理解现有模型并做出改进,从而得到更好的模型。