一、說明
我是在jupyter完成的,然後導出成markdown格式,ipynb文件導出爲markdown的命令如下:
jupyter nbconvert --to markdown your_notebook.ipynb
源代碼和數據文件,點擊這裏獲取
二、數據項說明
Name Data Type Meas. Description
---- --------- ----- -----------
Sex nominal M, F, and I (infant)
Length continuous mm Longest shell measurement
Diameter continuous mm perpendicular to length
Height continuous mm with meat in shell
Whole weight continuous grams whole abalone
Shucked weight continuous grams weight of meat
Viscera weight continuous grams gut weight (after bleeding)
Shell weight continuous grams after being dried
Rings integer +1.5 gives the age in years
現在有9個數據字段,前面8個是特徵值,最後一個Rings爲預測目標,具體請查閱文件內容
三、實戰部分
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the abalone records from the 'data' sheet of the Excel workbook.
dataframe01 = pd.read_excel('abalone.xlsx', sheet_name='data')
# Preview the first ten rows.
dataframe01.head(10)
| | Sex | Length | Diameter | Height | Whole weight | Shucked weight | Viscera weight | Shell weight | Rings |
---|---|---|---|---|---|---|---|---|---|
0 | M | 0.455 | 0.365 | 0.095 | 0.5140 | 0.2245 | 0.1010 | 0.150 | 15 |
1 | M | 0.350 | 0.265 | 0.090 | 0.2255 | 0.0995 | 0.0485 | 0.070 | 7 |
2 | F | 0.530 | 0.420 | 0.135 | 0.6770 | 0.2565 | 0.1415 | 0.210 | 9 |
3 | M | 0.440 | 0.365 | 0.125 | 0.5160 | 0.2155 | 0.1140 | 0.155 | 10 |
4 | I | 0.330 | 0.255 | 0.080 | 0.2050 | 0.0895 | 0.0395 | 0.055 | 7 |
5 | I | 0.425 | 0.300 | 0.095 | 0.3515 | 0.1410 | 0.0775 | 0.120 | 8 |
6 | F | 0.530 | 0.415 | 0.150 | 0.7775 | 0.2370 | 0.1415 | 0.330 | 20 |
7 | F | 0.545 | 0.425 | 0.125 | 0.7680 | 0.2940 | 0.1495 | 0.260 | 16 |
8 | M | 0.475 | 0.370 | 0.125 | 0.5095 | 0.2165 | 0.1125 | 0.165 | 9 |
9 | F | 0.550 | 0.440 | 0.150 | 0.8945 | 0.3145 | 0.1510 | 0.320 | 19 |
# 查看數據容量
dataframe01.shape
(4177, 9)
dataframe01.columns # 特徵名字
Index(['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight',
'Viscera weight', 'Shell weight', 'Rings'],
dtype='object')
# Clean the data: encode the Sex labels as integers
# (I -> 0, F -> 1, M -> 2) so the column can be used as a numeric feature.
dataframe02 = dataframe01.copy()
# Assign the whole column in one step. Chained indexing such as
# `df.Sex[mask] = value` writes through an intermediate object, triggers
# SettingWithCopyWarning and has no effect under pandas copy-on-write.
# map() also yields an integer column instead of an object column.
# NOTE(review): any label other than I/F/M would become NaN here.
dataframe02['Sex'] = dataframe01['Sex'].map({'I': 0, 'F': 1, 'M': 2})
dataframe02.head(10)
| | Sex | Length | Diameter | Height | Whole weight | Shucked weight | Viscera weight | Shell weight | Rings |
---|---|---|---|---|---|---|---|---|---|
0 | 2 | 0.455 | 0.365 | 0.095 | 0.5140 | 0.2245 | 0.1010 | 0.150 | 15 |
1 | 2 | 0.350 | 0.265 | 0.090 | 0.2255 | 0.0995 | 0.0485 | 0.070 | 7 |
2 | 1 | 0.530 | 0.420 | 0.135 | 0.6770 | 0.2565 | 0.1415 | 0.210 | 9 |
3 | 2 | 0.440 | 0.365 | 0.125 | 0.5160 | 0.2155 | 0.1140 | 0.155 | 10 |
4 | 0 | 0.330 | 0.255 | 0.080 | 0.2050 | 0.0895 | 0.0395 | 0.055 | 7 |
5 | 0 | 0.425 | 0.300 | 0.095 | 0.3515 | 0.1410 | 0.0775 | 0.120 | 8 |
6 | 1 | 0.530 | 0.415 | 0.150 | 0.7775 | 0.2370 | 0.1415 | 0.330 | 20 |
7 | 1 | 0.545 | 0.425 | 0.125 | 0.7680 | 0.2940 | 0.1495 | 0.260 | 16 |
8 | 2 | 0.475 | 0.370 | 0.125 | 0.5095 | 0.2165 | 0.1125 | 0.165 | 9 |
9 | 1 | 0.550 | 0.440 | 0.150 | 0.8945 | 0.3145 | 0.1510 | 0.320 | 19 |
# 導入線性迴歸的庫
from sklearn.linear_model import LinearRegression as LR
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
# Collect the column labels of the raw frame as a plain Python list.
data_index = dataframe01.columns.tolist()
data_index
['Sex',
'Length',
'Diameter',
'Height',
'Whole weight',
'Shucked weight',
'Viscera weight',
'Shell weight',
'Rings']
# Split the column labels: everything except the last label names a feature
# of X, and the last label names the target Y.
X_index, Y_index = data_index[:-1], data_index[-1]
X_index, Y_index
(['Sex',
'Length',
'Diameter',
'Height',
'Whole weight',
'Shucked weight',
'Viscera weight',
'Shell weight'],
'Rings')
# Feature matrix: all rows, restricted to the feature columns.
X = dataframe02.loc[:, X_index]
X.head()
| | Sex | Length | Diameter | Height | Whole weight | Shucked weight | Viscera weight | Shell weight |
---|---|---|---|---|---|---|---|---|
0 | 2 | 0.455 | 0.365 | 0.095 | 0.5140 | 0.2245 | 0.1010 | 0.150 |
1 | 2 | 0.350 | 0.265 | 0.090 | 0.2255 | 0.0995 | 0.0485 | 0.070 |
2 | 1 | 0.530 | 0.420 | 0.135 | 0.6770 | 0.2565 | 0.1415 | 0.210 |
3 | 2 | 0.440 | 0.365 | 0.125 | 0.5160 | 0.2155 | 0.1140 | 0.155 |
4 | 0 | 0.330 | 0.255 | 0.080 | 0.2050 | 0.0895 | 0.0395 | 0.055 |
# Target vector: all rows of the target column.
Y = dataframe02.loc[:, Y_index]
Y.head()
0 15
1 7
2 9
3 10
4 7
Name: Rings, dtype: int64
# Split into training and test sets: 20% of the rows are held out for
# testing, and random_state is fixed so the split is reproducible.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,Y,test_size=0.2,random_state=420)
Xtrain.head()
| | Sex | Length | Diameter | Height | Whole weight | Shucked weight | Viscera weight | Shell weight |
---|---|---|---|---|---|---|---|---|
2763 | 0 | 0.550 | 0.425 | 0.135 | 0.6560 | 0.2570 | 0.1700 | 0.203 |
439 | 2 | 0.500 | 0.415 | 0.165 | 0.6885 | 0.2490 | 0.1380 | 0.250 |
1735 | 2 | 0.670 | 0.520 | 0.165 | 1.3900 | 0.7110 | 0.2865 | 0.300 |
751 | 2 | 0.485 | 0.355 | 0.120 | 0.5470 | 0.2150 | 0.1615 | 0.140 |
1626 | 1 | 0.570 | 0.450 | 0.135 | 0.7805 | 0.3345 | 0.1850 | 0.210 |
Ytrain.head()
2763 10
439 13
1735 11
751 10
1626 8
Name: Rings, dtype: int64
# The split keeps the original row labels; give every split a fresh
# 0..n-1 index so rows can be addressed by position.
for split in (Xtrain, Xtest, Ytrain, Ytest):
    split.index = range(len(split))
Xtrain.head() # 查看X訓練集頭部
| | Sex | Length | Diameter | Height | Whole weight | Shucked weight | Viscera weight | Shell weight |
---|---|---|---|---|---|---|---|---|
0 | 0 | 0.550 | 0.425 | 0.135 | 0.6560 | 0.2570 | 0.1700 | 0.203 |
1 | 2 | 0.500 | 0.415 | 0.165 | 0.6885 | 0.2490 | 0.1380 | 0.250 |
2 | 2 | 0.670 | 0.520 | 0.165 | 1.3900 | 0.7110 | 0.2865 | 0.300 |
3 | 2 | 0.485 | 0.355 | 0.120 | 0.5470 | 0.2150 | 0.1615 | 0.140 |
4 | 1 | 0.570 | 0.450 | 0.135 | 0.7805 | 0.3345 | 0.1850 | 0.210 |
Ytrain.head()
0 10
1 13
2 11
3 10
4 8
Name: Rings, dtype: int64
# Build the model: fit a linear regression on the training split.
# NOTE: no standardisation (fit on train, transform train and test) is
# applied in this cell; the features are used as they are.
reg = LR()
reg.fit(Xtrain, Ytrain)
# Predict the target for the held-out test split.
yhat = reg.predict(Xtest)
yhat.min()
4.22923686878166
yhat.max()
22.656846035572762
reg.coef_ # w,係數向量
array([ 0.40527178, -0.88791132, 13.01662939, 10.39250886,
9.64127293, -20.87747601, -10.50683081, 7.70632772])
Xtrain.columns
Index(['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight',
'Viscera weight', 'Shell weight'],
dtype='object')
[*zip(Xtrain.columns,reg.coef_)]
[('Sex', 0.4052717783379893),
('Length', -0.8879113179582045),
('Diameter', 13.016629389061475),
('Height', 10.39250886428478),
('Whole weight', 9.64127293101552),
('Shucked weight', -20.87747600529615),
('Viscera weight', -10.506830809919672),
('Shell weight', 7.706327719866024)]
# 特徵說明
Name Data Type Meas. Description
Sex nominal M, F, and I (infant)
Length continuous mm Longest shell measurement
Diameter continuous mm perpendicular to length
Height continuous mm with meat in shell
Whole weight continuous grams whole abalone
Shucked weight continuous grams weight of meat
Viscera weight continuous grams gut weight (after bleeding)
Shell weight continuous grams after being dried
Rings integer +1.5 gives the age in years
# 截距
reg.intercept_
2.7888240054011835
# Hand-written ordinary least squares
def my_least_squares(x_array, y_array):
    """Fit an ordinary least-squares linear model with an intercept term.

    :param x_array: array-like holding the m*n feature matrix (a flat
        sequence is treated as a single feature column)
    :param y_array: array-like holding the m target values (flat or m*1)
    :return: ``(coef, intercept)`` where ``coef`` is a list with the n
        regression coefficients and ``intercept`` is a float
    """
    # Force a float dtype: input containing an object-typed column would
    # otherwise produce an object array that the solver cannot handle.
    features = np.asarray(x_array, dtype=float)
    if features.ndim == 1:
        features = features.reshape(-1, 1)
    targets = np.asarray(y_array, dtype=float).reshape(-1)

    # Prepend a column of ones so the first fitted weight is the intercept:
    # the m*n matrix becomes an m*(n+1) design matrix.
    ones = np.ones((features.shape[0], 1))
    design = np.hstack((ones, features))

    # Solve the least-squares problem directly instead of forming
    # inv(X^T X) X^T y; this also copes with a singular X^T X.
    weights, *_ = np.linalg.lstsq(design, targets, rcond=None)

    # The ones column sits at index 0, so weights[0] is the intercept and
    # the remaining entries are the per-feature coefficients.
    intercept = float(weights[0])
    coef = [float(w) for w in weights[1:]]
    return coef, intercept
# Work in progress: try the hand-written solver on the training split.
my_least_squares(Xtrain,list(Ytrain))
# Gradient descent attempt
def costFunc(X, Y, theta):
    """Return the halved mean squared error of a linear model.

    Computes ``sum((X @ theta.T - Y) ** 2) / (2 * m)`` where ``m`` is the
    number of rows of ``X``.

    :param X: m*n feature matrix (np.matrix, ndarray or nested list)
    :param Y: the m target values, flat or as an m*1 column
    :param theta: the n model parameters, flat or as a 1*n row
    :return: the scalar cost
    """
    # Convert to plain 2-D arrays with explicit shapes so the product below
    # is a true matrix product for any array-like input, not only np.matrix.
    features = np.asarray(X, dtype=float)
    targets = np.asarray(Y, dtype=float).reshape(-1, 1)
    params = np.asarray(theta, dtype=float).reshape(1, -1)

    residual = features @ params.T - targets
    return np.sum(np.power(residual, 2)) / (2 * len(features))
def gradientDescent(X, Y, theta, alpha, iters):
    """Run batch gradient descent for a linear model.

    Every iteration updates all parameters simultaneously from the
    residuals of the current ``theta`` and records the resulting cost.

    :param X: m*n feature matrix
    :param Y: the m target values, flat or as an m*1 column
    :param theta: initial parameters, flat or as a 1*n row; not modified
    :param alpha: learning rate
    :param iters: number of iterations to run
    :return: ``(theta, cost)`` where ``theta`` is the final 1*n np.matrix
        and ``cost`` is an ndarray holding the cost after each iteration
    """
    # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
    X = np.asmatrix(X)
    Y = np.asmatrix(Y).reshape(-1, 1)
    # Work on a float copy so the caller's theta is never changed in place.
    theta = np.asmatrix(np.array(theta, dtype=float))

    sample_count = len(X)
    cost = np.zeros(iters)
    for i in range(iters):
        error = X * theta.T - Y
        # error.T * X is the 1*n row of sum_i(error_i * x_ij), i.e. the
        # per-parameter sums the update needs, computed in one product.
        theta = theta - alpha * (error.T * X) / sample_count
        cost[i] = costFunc(X, Y, theta)
    return theta, cost