第一次參加比賽,記錄一下,我是直接使用遷移學習進行分類 採用vgg16
排名不高僅供參考
import pandas as pd
import torch
import numpy as np
from torch.autograd import Variable
import torchvision
from torchvision import transforms, models
import matplotlib.pyplot as plt
import torch.nn.functional as F
import os
from sklearn import metrics
import sys
system = sys.platform #判斷系統的,兩個電腦上 路徑不一樣
if system == 'win32':
os.chdir('input')
mode = 'train' # train用來訓練, test生成csv提交結果
# mode = 'test'
print('mode = ' + mode)
#這一塊是pytorch自帶的的載入文件夾圖片
transformer = transforms.Compose([
transforms.Resize((224, 224)),
# transforms.CenterCrop(200),
# transforms.RandomVerticalFlip(),
# transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])])
train_data = {x: torchvision.datasets.ImageFolder(x, transform=transformer)
for x in ['train', 'val']}
print(train_data['train'].class_to_idx)
train_loader = {}
train_loader['train'] = torch.utils.data.DataLoader(train_data['train'],
batch_size=10,
shuffle=True)
train_loader['val'] = torch.utils.data.DataLoader(train_data['val'],
batch_size=10,
shuffle=True)
print('train num is ' + str(len(train_data['train'])))
print('val num is ' + str(len(train_data['val'])))
if os.listdir('models'): #恢復模型
print('restrore the model')
model = torch.load('my_model.pkl')
else:
print('use vgg16 model')
# model = torch.load('vgg16.pkl') #因爲網絡不好, 我都是提前下下來保存再載入
# model = torch.load('vgg_11_bn.pkl')
# models.vgg16_bn(pretrained=True, batch_norm)
model.classifier = torch.nn.Sequential(
torch.nn.Linear(7*7*512, 2), #vgg提取特徵不變 分類層改一下
if torch.cuda.is_available(): #cpu gpu轉換
model = model.cuda()
print(model)
loss_func = torch.nn.CrossEntropyLoss()
lr = 1e-5
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
## 建立這些列表基本都是用來畫圖的
epochs = 30
plot_loss = []
best_auc = 0
auc_list = []
auc_list2 = []
train_acc_list = []
test_acc_list = []
# plt.ion()
def valling(dir_name, model):
"""
得到網絡輸出 用來metrics
0 1標籤(用來算正確率)
概率(算auc)
label
"""
model.eval()
print('valling in ' + str(dir_name))
y_pre_all = np.array(())
test_y_all = np.array(())
all_pro = np.array(())
for tep_idx, [test_x, test_y] in enumerate(train_loader[dir_name]):
if tep_idx <= 10:
test_x, test_y = next(iter(train_loader[dir_name]))
if torch.cuda.is_available():
test_x, test_y = (test_x.cuda()), (test_y.cuda())
y_out_test = model(test_x)
all_pro = np.append(all_pro, F.softmax(y_out_test, 0).cpu().data.numpy()[:, 1])
# print(y_out_test)
y_pre_test = torch.argmax(y_out_test, 1)
y_pre_test = y_pre_test.cpu().data.numpy()
test_y = test_y.cpu().data.numpy()
# print(y_pre_all.shape)
# print(y_pre_test.shape)
y_pre_all = np.append(y_pre_all, y_pre_test)
test_y_all = np.append(test_y_all, test_y)
# print(y_pre_all.shape)
return y_pre_all, test_y_all, all_pro
def my_metrics(pre, label, pro):
'''
計算auc acc
'''
# print('label shape is ' + str(label.shape))
# print('pro shape is ' + str(pro.shape))
auc = metrics.roc_auc_score(label, pro)
bool_arr_test = (pre == label)
test_acc = np.sum(bool_arr_test) / pre.size
return auc, test_acc
def plot_list(list1, list2, dir_, title):
'''
畫圖 train 和test的acc auc
'''
abs_dir = os.path.abspath(dir_)
if not os.path.exists(os.path.dirname(abs_dir)):
os.mkdir(os.path.dirname(abs_dir))
print('creat dir{}'.format(abs_dir))
plt.figure()
plt.plot(list1, label='train')
plt.plot(list2, label='test')
plt.title(title)
plt.legend(loc='best')
plt.savefig(dir_)
plt.close()
if mode =='train':
best_acc = 0
plot_epoch_loss = []
# print(model)
for epoch in range(epochs):
model.train()
print('training')
batch = 0
epoch_loss = 0
correct = 0
# print(train_loader['train'])
for data in train_loader['train']:
batch += 1
x, y = data
if torch.cuda.is_available():
x, y = x.cuda(), y.cuda()
x, y = Variable(x), Variable(y)
y_out = model(x)
optimizer.zero_grad()
loss = loss_func(y_out, y)
epoch_loss += loss
# print(loss.data)
# print(loss.data[0])
loss.backward()
optimizer.step()
a_loss = loss.cpu().data.numpy()
plot_loss.append(a_loss)
plt.cla()
plt.plot(plot_loss)
print(a_loss)
plt.text(0, 0.5, 'loss = %.3f' % a_loss, {'color': 'red', 'size': 15})
plt.savefig('loss2.png')
plt.close()
plt.pause(0.5)
y_pre_all, test_y_all, all_pro = valling('val', model)
train_y_pre_all, train_test_y_all, train_all_pro = valling('train', model)
auc, test_acc = my_metrics(y_pre_all, test_y_all, all_pro)
train_auc, train_test_acc = my_metrics(train_y_pre_all, train_test_y_all, train_all_pro)
train_acc_list.append(train_test_acc)
test_acc_list.append(test_acc)
saved_figs_dir = 'vgg11_full_32'
plot_list(train_acc_list, test_acc_list, os.path.join('saved_figs', saved_figs_dir, 'acc.png'), 'acc_curve')
auc_list.append(auc)
auc_list2.append(train_auc)
plot_list(auc_list2, auc_list, os.path.join('saved_figs', saved_figs_dir, 'auc.png'), 'auc_curve')
best_acc = max(best_acc, test_acc) #保存最好的結果
best_auc = max(best_auc, auc)
print('test_acc = ' + str(test_acc * 100)[:4] + '%')
print('train_acc = ' + str(train_test_acc * 100)[:4] + '%')
epoch_loss = epoch_loss.cpu().data.numpy()
print('This ' + str(epoch) + 'th epoch', 'epoch average loss = ' + str(epoch_loss/(batch)))
plot_epoch_loss.append(epoch_loss / (batch))
plt.figure()
plt.plot(plot_epoch_loss)
plt.title('epoch_loss')
plt.savefig(os.path.join('saved_figs', saved_figs_dir, 'epoch_loss.png'))
# plt.savefig('saved_figs/2/epoch_loss.png')
print('lr = {}'.format(lr))
if best_acc <= test_acc: #存正確率最高的模型
# if best_auc <= auc:#存auc最高的
print('score is better store model')
torch.save(model, 'models/my_model.pkl')
else:
print("not good don't save")
print('-' * 40)
else:
#用來生成提交結果
test_data = torchvision.datasets.ImageFolder('test', transform=transformer)
test_data_loader = torch.utils.data.DataLoader(
test_data,
batch_size=10,
shuffle=False)
ret_df = pd.DataFrame(columns=['filename', 'probability'])
filenames = []
for i in test_data.imgs:
filename = os.path.basename(i[0])
filenames.append(filename)
# print(filenames)
ret_df['filename'] = filenames
for i, [x, y] in enumerate(test_data_loader):
if torch.cuda.is_available():
x = x.cuda()
x = Variable(x)
pre_out = model(x)
pro = F.softmax(pre_out).cpu().data.numpy()[:, 1]
pro = np.clip(pro, 0.000001, 0.999999)
print('The ' + str(i*10) + ' th ' + 'row')
try:
ret_df.iloc[10*i: 10*i+10, 1] = pro
except Exception:
ret_df.loc[10*i:, 'probability'] = pro
ret_df = ret_df.round(6)
print((ret_df['probability'] <= 0).sum())
print((ret_df['probability'] >= 1).sum())
ret_df.to_csv('outputs/submission.csv', index=False, encoding='utf-8')
新人學習中