下面是訓練集和測試集的部分圖像
SVM模型代碼(進行了調參):
#!/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
import pandas as pd
from sklearn import svm
import matplotlib.colors
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.metrics import accuracy_score
import os
from sklearn.model_selection import GridSearchCV
from time import time
def show_accuracy(a, b, tip):
    """Print the element-wise agreement rate of two label arrays as a percentage."""
    matches = a.ravel() == b.ravel()
    rate = np.mean(matches) * 100
    print(tip + '正確率:%.2f%%' % rate)
def save_image(im, i, output_path='./HandWritten'):
    """Rescale an 8x8 optdigits image, invert it, and save it as a 100x100 PNG.

    Args:
        im: float array of pixel counts in [0, 16] (optdigits format);
            NOTE: the array is modified in place by the rescaling.
        i: index used to build the file name ('<i>.png').
        output_path: directory to write into; created if missing.
    """
    im *= (256 / 17)                 # stretch 0..16 counts to roughly 0..255
    im = 255 - im                    # invert: dark strokes on white background
    a = im.astype(np.uint8)
    # makedirs(exist_ok=True) is race-safe and creates parent dirs,
    # unlike the original exists()+mkdir pair
    os.makedirs(output_path, exist_ok=True)
    # os.path.join instead of a hard-coded '\\' separator, which produced a
    # broken file name on non-Windows systems
    Image.fromarray(a).resize(size=(100, 100)).save(
        os.path.join(output_path, '%d.png' % i))
if __name__ == "__main__":
    # ---- load training data -------------------------------------------------
    print('Load Training File Start...')
    data = pd.read_csv('optdigits.tra', header=None)
    x, y = data[list(range(64))], data[64]           # 64 pixel columns + label
    x, y = x.values, y.values                        # DataFrame -> numpy arrays
    images = x.reshape(-1, 8, 8)                     # each row is one 8x8 digit image
    print('images.shape = ', images.shape)
    # np.int was removed in NumPy 1.24; the builtin int is the correct spelling
    y = y.ravel().astype(int)

    # ---- load test data -----------------------------------------------------
    print('Load Test Data Start...')
    # np.float was removed in NumPy 1.24; use the builtin float
    data = np.loadtxt('optdigits.tes', dtype=float, delimiter=',')
    x_test, y_test = np.split(data, (-1,), axis=1)   # last column is the label
    print(y_test.shape)
    images_test = x_test.reshape(-1, 8, 8)
    y_test = y_test.ravel().astype(int)
    print('Load Data OK...')

    # x, x_test, y, y_test = train_test_split(x, y, test_size=0.4, random_state=1)
    # images = x.reshape(-1, 8, 8)
    # images_test = x_test.reshape(-1, 8, 8)

    # ---- preview a few training / test digits (SimHei so CJK titles render) -
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(15, 9), facecolor='w')
    for index, image in enumerate(images[:16]):
        plt.subplot(4, 8, index + 1)
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        plt.title('訓練圖片: %i' % y[index])
    for index, image in enumerate(images_test[:16]):
        plt.subplot(4, 8, index + 17)
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        save_image(image.copy(), index)              # copy: save_image mutates its input
        plt.title('測試圖片: %i' % y_test[index])
    plt.tight_layout()
    plt.show()

    # ---- grid-search an RBF-kernel SVM --------------------------------------
    params = {'C': np.logspace(0, 3, 7), 'gamma': np.logspace(-5, 0, 11)}
    model = GridSearchCV(svm.SVC(kernel='rbf'), param_grid=params, cv=3)
    # model = svm.SVC(C=1, kernel='rbf', gamma=0.001)
    print('Start Learning...')
    t0 = time()
    model.fit(x, y)
    t1 = time()
    t = t1 - t0
    print('訓練+CV耗時:%d分鐘%.3f秒' % (int(t / 60), t - 60 * int(t / 60)))
    print('最優參數:\t', model.best_params_)
    print('Learning is OK...')
    print('訓練集準確率:', accuracy_score(y, model.predict(x)))
    y_hat = model.predict(x_test)
    # reuse y_hat instead of predicting the test set a second time
    print('測試集準確率:', accuracy_score(y_test, y_hat))
    print(y_hat)
    print(y_test)

    # ---- visualize up to 12 misclassified test digits ------------------------
    err_images = images_test[y_test != y_hat]
    err_y_hat = y_hat[y_test != y_hat]
    err_y = y_test[y_test != y_hat]
    print(err_y_hat)
    print(err_y)
    plt.figure(figsize=(10, 8), facecolor='w')
    for index, image in enumerate(err_images):
        if index >= 12:
            break
        plt.subplot(3, 4, index + 1)
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        plt.title('錯分爲:%i,真實值:%i' % (err_y_hat[index], err_y[index]))
    plt.tight_layout()
    plt.show()
結果:
訓練耗時:4分鐘40.544秒
最優參數: {'C': 10.0, 'gamma': 0.001}
訓練集準確率: 1.0
測試集準確率: 0.9827490261547023
下面是識別錯誤例子(人都看不出來是啥數字。。。。):
XGBOOST模型(進行了調參):
import pandas as pd
import xgboost as xgb
import numpy as np
from time import time
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
def show_accuracy(a, b, tip):
    """Report what fraction of entries in a and b agree, as a percentage."""
    hit_rate = np.mean(a.ravel() == b.ravel())
    print(tip + '正確率:%.2f%%' % (hit_rate * 100))
if __name__ == '__main__':
    # ---- load training data -------------------------------------------------
    print('Load Training File Start...')
    data = pd.read_csv('optdigits.tra', header=None)
    x, y = data[list(range(64))], data[64]            # 64 pixel columns + label
    x, y = x.values, y.values                         # DataFrame -> numpy arrays
    images = x.reshape(-1, 8, 8)                      # each row is one 8x8 digit image
    print('images.shape = ', images.shape)
    # np.int was removed in NumPy 1.24; the builtin int is the correct spelling
    y = y.ravel().astype(int)

    # ---- load test data -----------------------------------------------------
    print('Load Test Data Start...')
    # np.float was removed in NumPy 1.24; use the builtin float
    data = np.loadtxt('optdigits.tes', dtype=float, delimiter=',')
    x_test, y_test = np.split(data, (-1,), axis=1)    # last column is the label
    print(y_test.shape)
    images_test = x_test.reshape(-1, 8, 8)
    y_test = y_test.ravel().astype(int)
    print('Load Data OK...')

    # ---- grid-search an XGBoost classifier ----------------------------------
    t0 = time()
    # base parameters for the booster
    params = {'objective': 'multi:softmax',  # multi-class classification
              'num_class': 10,               # number of classes
              'eta': 0.1,                    # learning rate
              'verbosity': 0                 # 'silent' was removed in XGBoost >= 1.0
              }
    # grid over learning rate and number of boosting rounds
    cv_params = {'eta': [0.1, 0.01],
                 'n_estimators': np.linspace(100, 600, 20, dtype=int)}
    gbm = xgb.XGBClassifier(**params)
    opt_clf = GridSearchCV(estimator=gbm, param_grid=cv_params, cv=3)
    opt_clf.fit(x, y)
    t1 = time()
    t = t1 - t0
    print('訓練模型耗時:%d分鐘%.3f秒' % (int(t / 60), t - 60 * int(t / 60)))
    print('最優參數:\t', opt_clf.best_params_)
    print('訓練集準確率: ', accuracy_score(y, opt_clf.predict(x)))
    print('測試集準確率: ', accuracy_score(y_test, opt_clf.predict(x_test)))
結果:
訓練模型耗時:29分鐘59.371秒
最優參數: {'eta': 0.1, 'n_estimators': 284}
訓練集準確率: 1.0
測試集準確率: 0.9671675013912076
總結:
從最後的運行結果可以看出SVM比xgboost的效果好些,並且svm運行時間也快於xgboost。
xgboost耗時較多主要是調參造成的:若不調參,模型很快就能訓練出來;但由於使用了GridSearchCV()對n_estimators等參數做網格搜索,運行時間大大增加。因此參數cv的值最好設小一些,不然運行時間會太長——此次實驗中即使將cv設爲3,也需要跑半個小時才出結果,而且最終效果還不如SVM。