竞赛地址 https://www.kaggle.com/c/digit-recognizer
苦于学了cs231n和pytorch之后没有东西拿来练手,就去kaggle上找了一个入门竞赛,MNIST手写数字识别,这个比赛把MNIST数据集拆分成了42000的trainset和28000的testset。
然后自己跟着cs231n的模式一步步写网络,pandas+numpy预处理数据,重写check_acc/train_part函数,重新定义新的网络结构,训练模型,一直到输出测试集的结果,到处都是坑啊!!!耗时一天多一点,终于完成。。。最后score0.99042,也懒得再改了,等看论文看到新技术再改吧。参数也调了好久,各种batch_size/lr/lrdecay/网络层数不停地试,终于在加深了网络和调整了batch_size之后让score上升了2%(果真深度是有好处的_)
最坑的TM测试输出的时候因为28000张数据一起输入,显存炸了。。。显存炸完,心态也快炸了。。。3G真的伤不起,早知道当时买6G了。。都是泪。。。
然后下面是代码,使用Jupyter notebook写的,前面的网络训练部分拿来可以直接跑,后面的数据测试和结果输出需要改(因为显存炸了只好把测试集分成两半测试一个释放一个然后再测另外一个,之后再合并到一起输出。。。所以输出部分改的有点多,不能直接跑)
然后看到这个比赛的leaderboard前95个score都是1,惊呆了,后来看了评论才发现有一些高分(也许有大佬用42000个训练集数据就做出来100%的acc也不是不可能)是直接用MNIST所有的数据集来训练的,这样岂不是overfit也没有问题???反正testset是MNIST的子集,剩下的等以后学到啥再来改吧。
其实通过这个收获也挺大的,至少自己手写了一遍!!然后什么normalization/standardization/batchnorm都自己训练体验了一遍,就比光看论文看博客来的实在一点!
#%%
import pandas as pd
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
import numpy as np
import matplotlib.pyplot as plt
import datetime
#%%
### Load the training set
# Read the Kaggle training CSV with pandas.
data = pd.read_csv('./digit-recognizer/train.csv')
print(data.head(5))  # show the first five rows (a bare expression displays nothing mid-cell)
# Convert the DataFrame to a NumPy array, one row per labelled image.
np_data = np.array(data)
# The first column holds the label; the remaining columns are the pixels.
digit_data = np_data[:, 1:]
digit_label = np_data[:, 0]
print(digit_label.shape)
# Reshape every flat pixel row into a 28x28 image. The first axis indexes
# the samples; -1 lets NumPy infer the sample count from the file instead
# of hard-coding it.
digit_data = digit_data.reshape((-1, 28, 28))
example = digit_data[5, :, :]  # preview one sample (index 5)
plt.imshow(example)
#%%
# Split into training and validation sets: the first 38400 samples are used
# for training and the remainder (3600 when the file holds 42000 rows) for
# validation.
train_data, val_data = digit_data[:38400], digit_data[38400:]
train_exam, val_exam = train_data[0], val_data[0]
print('train and val data: ', train_data.shape, ' ', val_data.shape)  # sanity-check the shapes
train_label, val_label = digit_label[:38400], digit_label[38400:]
print('train and val label: ', train_label.shape, ' ', val_label.shape)
#%%
plt.imshow(train_data[0, :, :])
# Convert the training and validation arrays (images, then labels) into
# PyTorch tensors.
train_data, val_data = torch.tensor(train_data), torch.tensor(val_data)
train_label, val_label = torch.tensor(train_label), torch.tensor(val_label)
#%%
# Raw pixel values lie in [0, 255]; normalization rescales them to [0, 1]
# by dividing every pixel by 255.
def data_normalization(tensor):
    """Return ``tensor`` cast to float32 and divided by 255."""
    as_float = tensor.float()
    return as_float / 255
def data_stdandization(tensor):
    """Standardize ``tensor`` to zero mean and unit standard deviation.

    The mean and standard deviation are computed over all elements of the
    tensor (not per sample or per pixel).

    Args:
        tensor: tensor of any numeric dtype; it is cast to float32 first.

    Returns:
        A float32 tensor of the same shape. When the standard deviation is
        zero or undefined (constant or single-element input) only the mean
        is subtracted, so the result is finite instead of NaN/inf.
    """
    # Convert once instead of three times; each conversion copies the data.
    values = tensor.float()
    centered = values - values.mean()
    std = values.std()
    if not torch.isfinite(std) or std == 0:
        return centered
    return centered / std
#%%
# Scale both splits to [0, 1], then standardize each split to zero mean and
# unit standard deviation.
# NOTE(review): the validation split is standardized with its own statistics
# rather than the training split's - confirm this is intended.
train_data = data_stdandization(data_normalization(train_data))
val_data = data_stdandization(data_normalization(val_data))
#%%
def read_a_batch(batch_size, data=None, labels=None):
    """Draw a random mini-batch of images and their labels.

    Samples are drawn without replacement within one batch; separate calls
    are independent of each other.

    Args:
        batch_size: number of samples to draw.
        data: image tensor indexed by sample along the first axis. Defaults
            to the module-level ``train_data``.
        labels: label tensor aligned with ``data``. Defaults to the
            module-level ``train_label``.

    Returns:
        Tuple ``(images, labels)`` containing ``batch_size`` samples.

    Raises:
        ValueError: if ``batch_size`` exceeds the number of samples.
    """
    if data is None:
        data = train_data
    if labels is None:
        labels = train_label
    # Use the real dataset size instead of a hard-coded 38400.
    index = random.sample(range(data.shape[0]), batch_size)
    return (data[index, :, :], labels[index])
#%%
USE_GPU = True
dtype = torch.float32  # float type that inputs are cast to before the forward pass
# Use CUDA only when it is both requested and available; otherwise use the CPU.
device = torch.device('cuda' if USE_GPU and torch.cuda.is_available() else 'cpu')
# How often (in iterations) the training loss is printed.
print_every = 250
print('using device:', device)
#%%
def check_val_accuracy(model, val):
    """Evaluate ``model`` on a validation split and return accuracy in percent.

    Uses the module-level ``device`` and ``dtype`` settings. The model is
    switched to evaluation mode and left in that mode, so a caller that
    keeps training afterwards must call ``model.train()`` again.

    Args:
        model: network mapping a batch of images to per-class scores.
        val: ``(images, labels)`` pair of tensors; a channel axis is inserted
            at dim 1 of the images before the forward pass.

    Returns:
        Percentage (0-100) of samples whose highest-scoring class equals the
        label; 0.0 when the split is empty.
    """
    print('Checking accuracy on validation set')
    model.eval()  # set model to evaluation mode
    with torch.no_grad():
        x, y = val
        x = torch.unsqueeze(x, 1)
        x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
        y = y.to(device=device, dtype=torch.long)
        scores = model(x)
        preds = torch.argmax(scores, dim=1)
        num_correct = (preds == y).sum().item()
        # Divide by the actual number of samples, not a hard-coded 3600.
        num_samples = y.size(0)
        acc = num_correct / num_samples * 100 if num_samples else 0.0
    print('Got acc %.2f %%' % acc)
    return acc
#%%
def train_part(model, optimizer, scheduler, batch_size=256, epochs=1):
    """Train ``model`` on random mini-batches and plot validation accuracy/loss.

    Relies on the module-level ``train_data``, ``val_data``, ``val_label``,
    ``device``, ``dtype`` and ``print_every``.

    Args:
        model: network to train (already moved to ``device``).
        optimizer: optimizer built over ``model.parameters()``.
        scheduler: learning-rate scheduler, stepped once per epoch.
        batch_size: number of samples per mini-batch.
        epochs: number of passes; each pass runs
            ``len(train_data) // batch_size`` iterations.

    Returns:
        None. Progress is printed, and two figures (validation accuracy and
        training loss at every logging point) are shown at the end.
    """
    history = []
    loss_all = []
    # Derive the iteration count from the real training-set size instead of
    # a hard-coded 38400.
    iters_per_epoch = train_data.shape[0] // batch_size
    for e in range(epochs):
        for i in range(iters_per_epoch):
            # check_val_accuracy() leaves the model in eval mode. Without
            # switching back, Dropout and BatchNorm would stay in eval mode
            # for every iteration after the first validation check.
            model.train()
            x, y = read_a_batch(batch_size)
            x = torch.unsqueeze(x, 1)  # add the channel axis
            x = x.to(device=device, dtype=dtype)
            y = y.to(device=device, dtype=torch.long)

            scores = model(x)
            loss = F.cross_entropy(scores, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if i % print_every == 0:
                loss_value = loss.item()
                print('Epoch %d, Iteration %d, loss = %.4f' % (e+1, i, loss_value))
                val = (val_data, val_label)
                acc = check_val_accuracy(model, val)
                history.append(acc)
                loss_all.append(loss_value)
                print()
        # Learning-rate decay: the scheduler is stepped once per epoch.
        scheduler.step()
        if (e+1) % 5 == 0:
            print('learning rate decreased...\n')

    plt.figure(figsize=(15,6))
    plt.xlabel('iteration')
    plt.ylabel('acc')
    plt.plot(list(range(len(history))), history, 'go-')
    plt.show()

    plt.figure(figsize=(15,6))
    plt.xlabel('iteration')
    plt.ylabel('loss')
    plt.plot(list(range(len(loss_all))), loss_all, 'ro-')
    plt.show()
#%%
# Reset the placeholders; the real objects are created further below.
model = optimizer = None
class MyDigitNet(nn.Module):
    """Small CNN classifier for single-channel 28x28 digit images.

    ``forward`` takes a ``(N, 1, 28, 28)`` batch and returns ``(N, num_classes)``
    raw class scores.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        # Convolutional feature extractor; comments give channels x height x width.
        conv_layers = [
            nn.Conv2d(1, 32, kernel_size=3, stride=2, padding=1),  # 1x28x28 -> 32x14x14
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(32),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),  # 32x14x14 -> 64x14x14
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),  # 64x14x14 -> 64x7x7
            nn.Conv2d(64, 128, kernel_size=3),  # 64x7x7 -> 128x5x5
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(128),
            nn.Conv2d(128, 128, kernel_size=2),  # 128x5x5 -> 128x4x4
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),  # 128x4x4 -> 128x2x2
        ]
        self.features = nn.Sequential(*conv_layers)

        # Fully connected head operating on the flattened 128*2*2 features.
        fc_layers = [
            nn.Linear(128 * 2 * 2, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(256, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(256, num_classes),
        ]
        self.classifier = nn.Sequential(*fc_layers)

    def forward(self, x):
        feature_maps = self.features(x)
        # Flatten everything except the batch axis before the linear layers.
        flat = feature_maps.view(feature_maps.size(0), 128 * 2 * 2)
        return self.classifier(flat)
# Instantiate the network, move it to the selected device and print its
# structure.
model = MyDigitNet(10).to(device)
print(model)
#%%
# Optimizer: SGD with Nesterov momentum and weight decay.
optimizer = optim.SGD(
    model.parameters(),
    lr=0.01,
    momentum=0.9,
    weight_decay=0.0005,
    nesterov=True,
)
# Scale the learning rate by 0.3 after every 5 scheduler steps (epochs).
scheduler = lr_scheduler.StepLR(optimizer, gamma=0.3, step_size=5)
#%%
# Train the network and report the elapsed wall-clock time.
start = datetime.datetime.now()
train_part(model, optimizer, scheduler, batch_size=64, epochs=15)
end = datetime.datetime.now()
print(end - start)
#%%
# Save the trained model parameters (the state_dict) to disk.
torch.save(model.state_dict(), './digit-recognizer/CNN.pth')
#%%
# Load the test images
test = pd.read_csv('./digit-recognizer/test.csv')
print(test.head(5))  # show the first five rows (a bare expression displays nothing mid-cell)
# Convert the DataFrame to a NumPy array; the test file has no label column,
# so every column is a pixel value.
np_test = np.array(test)
# Reshape every flat pixel row into a 28x28 image. The first axis indexes
# the samples; -1 lets NumPy infer the sample count (28000 for this
# competition) instead of hard-coding it.
np_test = np_test.reshape((-1, 28, 28))
example = np_test[1, :, :]  # preview one sample (index 1)
plt.imshow(example)
#%%
# Convert the test images to a torch tensor and apply the same
# preprocessing as the training/validation splits (scale to [0, 1], then
# standardize). Feeding raw 0-255 pixels to a network trained on
# standardized inputs would not match what it saw during training.
test_data = torch.tensor(np_test)
test_data = data_stdandization(data_normalization(test_data))
# Insert the channel axis: (N, 28, 28) -> (N, 1, 28, 28).
test_data = torch.unsqueeze(test_data, dim=1)
# The tensor is deliberately left on the CPU here; per the notes at the top
# of this file, moving the whole test set to the GPU at once ran out of
# memory.
print(test_data.shape)
print(test_data.dtype)
print(test_data.device)
#%%
# Load the model
# Optional: when running inference in a fresh session, rebuild the network
# and restore the saved weights by uncommenting the two lines below.
#model = MyDigitNet(10)
#model.load_state_dict(torch.load('./digit-recognizer/CNN.pth'))
# Query how much GPU memory is currently occupied by tensors (a diagnostic;
# presumably meant to be viewed as the notebook cell's output).
torch.cuda.memory_allocated()
#%%
# Run the network over the test set in mini-batches. Pushing all test images
# through the GPU at once ran out of memory, and the earlier manual two-half
# split left ``preds1`` undefined, so the predictions are now produced
# batch by batch and concatenated.
def predict_in_batches(model, data, batch_size=1000):
    """Return the predicted class index for every sample in ``data``.

    Args:
        model: trained network mapping an image batch to per-class scores.
        data: tensor of images, indexed by sample along the first axis, in
            the layout the model expects.
        batch_size: number of samples moved to ``device`` per forward pass;
            lower it if memory is still insufficient.

    Returns:
        1-D NumPy array of predicted labels, in the same order as ``data``.
    """
    model.eval()
    batch_preds = []
    with torch.no_grad():
        for batch in torch.split(data, batch_size):
            # Only the current batch lives on the device.
            batch = batch.to(device=device, dtype=dtype)
            scores = model(batch)
            batch_preds.append(torch.argmax(scores, dim=1).cpu())
    return torch.cat(batch_preds).numpy()


res = predict_in_batches(model, test_data)
print(res.shape)
print(res)
#%%
# Write the predictions to the submission CSV.
# Image ids are 1-based and sized from the predictions instead of a
# hard-coded 28000.
imageID = list(range(1, len(res) + 1))
result = {'ImageId': imageID,
          'Label': res}
dt = pd.DataFrame(result)
print(dt.head(5))  # preview the first rows (a bare expression displays nothing mid-cell)
# index=False keeps the file to the two columns ImageId and Label; without
# it pandas also writes the DataFrame index as an extra unnamed first column.
dt.to_csv('./digit-recognizer/submission2nd.csv', index=False)