Paddle Deep Learning Basics: Model Loading and Resuming Training

Preface

In the previous posts we optimized the model from several angles and saw how to save it to disk. But in day-to-day training work, unexpected events can interrupt the training process, whether deliberately or not. If a model takes days to train, restarting from the initial state after an interruption is unacceptable. Don't worry: that is exactly what this post is about.

Saving the Model

If you have read the previous posts, you already know how to save a model. One point worth emphasizing here: we can save not only the model's parameters but also the optimizer's parameters. For instance, when the test code uses an optimizer with a dynamic learning rate, the learning rate differs depending on how many steps have been trained, so that information also needs to be stored.

model_save_path = "model/mnist-model/dygraph-mnist"
fluid.save_dygraph(model.state_dict(), model_save_path)      # save the model parameters
fluid.save_dygraph(optimizer.state_dict(), model_save_path)  # save the optimizer parameters
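As a minimal sketch of the dynamic-learning-rate case mentioned above (assuming Paddle 1.x's fluid.dygraph.PolynomialDecay scheduler; the complete code below actually uses a fixed rate of 0.001): the scheduler's current step is carried in the optimizer state, which is exactly why optimizer.state_dict() has to be saved as well.

# Hypothetical example: learning rate decays from 0.01 towards 0.001 over 4500 steps.
# The scheduler's step counter travels with optimizer.state_dict().
lr = fluid.dygraph.PolynomialDecay(0.01, decay_steps=4500, end_learning_rate=0.001)
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=lr, parameter_list=model.parameters())
fluid.save_dygraph(optimizer.state_dict(), model_save_path)  # includes the schedule position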

Loading the Model

params_dict, opt_dict = fluid.load_dygraph(model_save_path)
model = MNIST()
model.load_dict(params_dict)  # restore the model parameters
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001, parameter_list=model.parameters())
optimizer.set_dict(opt_dict)  # restore the optimizer state
  • params_dict : the model's parameters

  • opt_dict : the optimizer's parameters (a defensive variant is sketched below)
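If part of a checkpoint is missing on disk, fluid.load_dygraph is assumed here to return None for that part; under that assumption, a defensive variant of the loading code looks like this:

params_dict, opt_dict = fluid.load_dygraph(model_save_path)
model = MNIST()
if params_dict is not None:
    model.load_dict(params_dict)   # restore the network weights
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001, parameter_list=model.parameters())
if opt_dict is not None:
    optimizer.set_dict(opt_dict)   # restore Adam moments and step counters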

Complete Code

import paddle
import numpy as np
import matplotlib.pyplot as plt
import gzip
import json
import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import Linear
from paddle.fluid.dygraph.nn import Conv2D,Pool2D
from tb_paddle import SummaryWriter
from PIL import Image
import os
'''
This project mainly optimizes the model at the network-architecture level.
1. Classic fully connected neural network (kept commented out in the class below)
'''

# Decompress the dataset
mnistdata = gzip.open('data/mnist.json.gz')
# The data is stored as JSON, so load it with the json module (pandas would work too)
data = json.load(mnistdata)

# Split into training, validation, and test sets
train_data,val_data,test_data = data
# Image dimensions
IMG_ROWS=28
IMG_COLS=28
## Shuffle the data and generate batches
def data_loader(dataname='train',batch_size=20):
    # Select the requested split
    if(dataname=='train'):
        img = train_data[0]
        label = train_data[1]
    elif(dataname=='test'):
        img = test_data[0]
        label = test_data[1]
    elif(dataname=='val'):
        img = val_data[0]
        label = val_data[1]
    else:
        raise Exception("dataname can only be one of ['train','test','val']")
    # Validate the data
    assert len(img)==len(label),'the length of img must be the same as the length of label'
    datasize = len(img)
    # Shuffle the sample indices
    indices = [i for i in range(datasize)]
    np.random.shuffle(indices)
    # Define a generator that yields one batch at a time
    def data_generator():
        listdata=[]
        listlabel=[]
        for i in indices:
            # Reshape the image to [1, H, W] float32 and the label to [1] int64
            imgdata = np.reshape(img[i],[1,IMG_ROWS,IMG_COLS]).astype('float32')
            labeldata = np.reshape(label[i],[1]).astype('int64')
            listdata.append(imgdata)
            listlabel.append(labeldata)
            if(len(listdata)%batch_size==0):
                yield np.array(listdata),np.array(listlabel)
                listlabel=[]
                listdata=[]
        # Yield the final partial batch, if any
        if(len(listdata)>0):
            yield np.array(listdata),np.array(listlabel)

    return data_generator
# Define the model class
class MNIST(fluid.dygraph.Layer):
    def __init__(self):
        super(MNIST, self).__init__()
        # self.linear1 = Linear(input_dim=28*28,output_dim=10,act=None)
        # self.linear2 = Linear(input_dim=10,output_dim=10,act='sigmoid')
        # self.linear3 = Linear(input_dim=10,output_dim=1,act='sigmoid')
        self.conv1 = Conv2D(num_channels=1, num_filters=20, filter_size=5, stride=1, padding=2, act='relu')
        self.pool1 = Pool2D(pool_size=2, pool_stride=2, pool_type='max')
        self.conv2 = Conv2D(num_channels=20, num_filters=20, filter_size=5, stride=1, padding=2, act='relu')
        self.pool2 = Pool2D(pool_size=2, pool_stride=2, pool_type='max')
        self.linear = Linear(input_dim=980, output_dim=10, act='softmax')
    def forward(self, inputs,label,check_shape=False,check_content=False):
        conv1 = self.conv1(inputs)
        pool1 = self.pool1(conv1)
        conv2 = self.conv2(pool1)
        pool2 = self.pool2(conv2)
        pool21 = fluid.layers.reshape(pool2, [pool2.shape[0], -1])
        outputs = self.linear(pool21)
        # hidden1 = self.linear1(inputs)
        # hidden2 = self.linear2(hidden1)
        # outputs = self.linear3(hidden2)
        if(check_shape):
            print("\n------------ print each layer's configured hyperparameter shapes -------------")
            print("conv1-- kernel_size:{}, padding:{}, stride:{}".format(self.conv1.weight.shape, self.conv1._padding, self.conv1._stride))
            print("conv2-- kernel_size:{}, padding:{}, stride:{}".format(self.conv2.weight.shape, self.conv2._padding, self.conv2._stride))
            print("pool1-- pool_type:{}, pool_size:{}, pool_stride:{}".format(self.pool1._pool_type, self.pool1._pool_size, self.pool1._pool_stride))
            print("pool2-- pool_type:{}, pool_size:{}, pool_stride:{}".format(self.pool2._pool_type, self.pool2._pool_size, self.pool2._pool_stride))
            print("linear-- weight_size:{}, bias_size:{}, activation:{}".format(self.linear.weight.shape, self.linear.bias.shape, self.linear._act))

            print("\n------------打印各個層的形狀 -------------")
            print("inputs_shape: {}".format(inputs.shape))
            print("outputs1_shape: {}".format(conv1.shape))
            print("outputs2_shape: {}".format(pool1.shape))
            print("outputs3_shape: {}".format(conv2.shape))
            print("outputs4_shape: {}".format(pool2.shape))
            print("outputs5_shape: {}".format(outputs.shape))

        if check_content:
            # Print the convolution kernels; there are many weights, so only part is printed
            print("\n########## print convolution layer's kernel ###############")
            print("conv1 params -- kernel weights:", self.conv1.weight[0][0])
            print("conv2 params -- kernel weights:", self.conv2.weight[0][0])

            # Pick a random channel from each conv layer's output
            idx1 = np.random.randint(0, conv1.shape[1])
            idx2 = np.random.randint(0, conv2.shape[1])
            # Print the conv-pool results, only for the first image in the batch
            print("\nThe {}th channel of conv1 layer: ".format(idx1), conv1[0][idx1])
            print("The {}th channel of conv2 layer: ".format(idx2), conv2[0][idx2])
            print("The output of last layer:", outputs[0], '\n')
        if label is not None:
            acc = fluid.layers.accuracy(input=outputs,label=label)
            return outputs,acc
        else:
            return outputs
# Training
with fluid.dygraph.guard():
    model = MNIST()
    model.train()
    train_loader = data_loader()
    optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001,parameter_list=model.parameters())
    place = fluid.CPUPlace()
    traindata_loader = fluid.io.DataLoader.from_generator(capacity=5, return_list=True)
    traindata_loader.set_batch_generator(train_loader, places=place)
    EPOCH_NUM = 3
    # Add logging
    data_writer = SummaryWriter(logdir="log/data")
    model_save_path="model/mnist-model/dygraph-mnist"
    for epoch_id in range(EPOCH_NUM):
        for batch_id,data in enumerate(traindata_loader()):
            image_data, label_data = data
            image = fluid.dygraph.to_variable(image_data)
            label = fluid.dygraph.to_variable(label_data)
            # Flip check_shape/check_content to True here to inspect the network at this batch
            if batch_id==1000:
                predict,acc = model(image,label,check_shape=False,check_content=False)
            else:
                predict,acc = model(image,label)
            # loss = fluid.layers.square_error_cost(predict,label)
            loss = fluid.layers.cross_entropy(predict,label)
            avg_loss = fluid.layers.mean(loss)
            if batch_id !=0 and batch_id %100 ==0:
                data_writer.add_scalar("train/loss",avg_loss.numpy(),batch_id)
                data_writer.add_scalar("train/accuracy",acc.numpy(),batch_id)
                print("epoch:{},batch:{},loss is:{},acc is :{}".format(epoch_id,batch_id,avg_loss.numpy(),acc.numpy()))
            avg_loss.backward()
            optimizer.minimize(avg_loss)
            model.clear_gradients()
        print("保存模型")
        fluid.save_dygraph(model.state_dict(), model_save_path+""+str(epoch_id))
        fluid.save_dygraph(optimizer.state_dict(),model_save_path+""+str(epoch_id))
# Resume training from the saved checkpoint
print("resuming training")
with fluid.dygraph.guard():
    model = MNIST()
    model_save_path="model/mnist-model/dygraph-mnist"
    params_dict, opt_dict = fluid.load_dygraph(model_save_path+"0")
    model.load_dict(params_dict)
    train_loader = data_loader()
    optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001,parameter_list=model.parameters())
    optimizer.set_dict(opt_dict)
    place = fluid.CPUPlace()
    traindata_loader = fluid.io.DataLoader.from_generator(capacity=5, return_list=True)
    traindata_loader.set_batch_generator(train_loader, places=place)
    EPOCH_NUM = 3
    # The epoch-0 checkpoint was restored, so continue from epoch 1
    for epoch_id in range(1,EPOCH_NUM):
        for batch_id,data in enumerate(traindata_loader()):
            image_data, label_data = data
            image = fluid.dygraph.to_variable(image_data)
            label = fluid.dygraph.to_variable(label_data)
            predict,acc = model(image,label)
            # loss = fluid.layers.square_error_cost(predict,label)
            loss = fluid.layers.cross_entropy(predict,label)
            avg_loss = fluid.layers.mean(loss)
            if batch_id !=0 and batch_id %100 ==0:
                print("epoch:{},batch:{},loss is:{},acc is :{}".format(epoch_id,batch_id,avg_loss.numpy(),acc.numpy()))
            avg_loss.backward()
            optimizer.minimize(avg_loss)
            model.clear_gradients()
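The save and resume logic above can be folded into two small helpers. This is only a convenience sketch built from the same fluid calls used in the complete code; the file-extension comment assumes the usual .pdparams/.pdopt suffixes that save_dygraph produces:

def save_checkpoint(model, optimizer, path):
    # Both files share the path prefix (parameters -> .pdparams, optimizer -> .pdopt)
    fluid.save_dygraph(model.state_dict(), path)
    fluid.save_dygraph(optimizer.state_dict(), path)

def load_checkpoint(model, optimizer, path):
    params_dict, opt_dict = fluid.load_dygraph(path)
    model.load_dict(params_dict)
    optimizer.set_dict(opt_dict)

# Usage inside a dygraph guard:
#   save_checkpoint(model, optimizer, model_save_path+str(epoch_id))  # after each epoch
#   load_checkpoint(model, optimizer, model_save_path+"0")            # to resume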

Summary

With this post, the whole basics series comes to a close. The material is from the free courses offered by Baidu AI Studio; after working through all of them and practicing, I really did gain a lot. Many thanks to the people who produced these courses, and I likewise hope this series of basics posts can be of some help to you.
