Notes on Classic Reinforcement Learning Algorithms (10): Training a Policy Agent with Particle Swarm Optimization


This post uses particle swarm optimization to train a small Actor network with only 226 parameters, which fully solves the CartPole task.

Particle Swarm Optimization Implementation

For the swarm-intelligence component we use the simplest form of particle swarm optimization (PSO). The standard update rule is given below, followed by the Python implementation.
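
Concretely, evolve() applies the standard PSO update, where w is the inertia weight, p_i is particle i's personal best position, g is the global best position, and r1, r2 are uniform random samples in [0, 1]:

    v_i ← w * v_i + c1 * r1 * (p_i - x_i) + c2 * r2 * (g - x_i)
    x_i ← x_i + v_i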

import numpy as np

class PSO(object):
    def __init__(self, population_size, max_steps, dim=2, x_bound=[-10,10]):
        self.w = 0.6                            # inertia weight
        self.c1 = self.c2 = 2
        self.population_size = population_size  # number of particles
        self.dim = dim                          # dimensionality of the search space
        self.max_steps = max_steps              # number of iterations
        self.x_bound = x_bound                  # bounds of the solution space
        self.x = np.random.uniform(self.x_bound[0], self.x_bound[1],  # can also be written as: np.random.uniform([-1,10,20],[1,15,25],(3,3))
                                   (self.population_size, self.dim))  # initialize particle positions
        self.v = np.random.rand(self.population_size, self.dim)       # initialize particle velocities
        fitness = self.calculate_fitness(self.x)
        self.p = self.x                         # each particle's best position
        self.pg = self.x[np.argmin(fitness)]    # global best position
        self.individual_best_fitness = fitness  # each particle's best fitness
        self.global_best_fitness = np.min(fitness)  # global best fitness

    def calculate_fitness(self, x, value_net=None, device=None):
        # toy objective: squared distance to the all-ones point, minimized at x = (1, ..., 1)
        return np.sum(pow((x-1),2), axis=1)
        # return value_net(torch.tensor(x).to(device))
        
    def evolve(self):
        # fig = plt.figure()  # plotting is slow
        for step in range(self.max_steps):
            r1 = np.random.rand(self.population_size, self.dim)
            r2 = np.random.rand(self.population_size, self.dim)
            # update velocities and positions
            self.v = self.w*self.v+self.c1*r1*(self.p-self.x)+self.c2*r2*(self.pg-self.x)
            self.x = self.v + self.x
            # plt.clf()
            # plt.scatter(self.x[:, 0], self.x[:, 1], s=30, color='k')
            # plt.xlim(self.x_bound[0], self.x_bound[1])
            # plt.ylim(self.x_bound[0], self.x_bound[1])
            # plt.pause(0.0001)
            fitness = self.calculate_fitness(self.x)
            # particles whose personal best needs updating
            update_id = np.greater(self.individual_best_fitness, fitness)
            self.p[update_id] = self.x[update_id]
            self.individual_best_fitness[update_id] = fitness[update_id]
            # a smaller fitness appeared in the new generation, so update the global best fitness and position
            if np.min(fitness) < self.global_best_fitness:
                self.pg = self.x[np.argmin(fitness)]
                self.global_best_fitness = np.min(fitness)
        print('best fitness: %.5f, mean fitness: %.5f' % (self.global_best_fitness, np.mean(fitness)))

    def evolve_step(self):
        r1 = np.random.rand(self.population_size, self.dim)
        r2 = np.random.rand(self.population_size, self.dim)
        # update velocities and positions
        self.v = self.w*self.v+self.c1*r1*(self.p-self.x)+self.c2*r2*(self.pg-self.x)
        self.x = self.v + self.x
        fitness = self.calculate_fitness(self.x)
        # particles whose personal best needs updating
        update_id = np.greater(self.individual_best_fitness, fitness)
        self.p[update_id] = self.x[update_id]
        self.individual_best_fitness[update_id] = fitness[update_id]
        # a smaller fitness appeared in the new generation, so update the global best fitness and position
        if np.min(fitness) < self.global_best_fitness:
            self.pg = self.x[np.argmin(fitness)]
            self.global_best_fitness = np.min(fitness)
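
As a quick sanity check on the toy quadratic objective above (before plugging in the RL fitness), a minimal usage sketch might look like this; the swarm should converge toward the all-ones vector:

pso = PSO(population_size=50, max_steps=200, dim=2)
pso.evolve()
print('global best position:', pso.pg)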

Actor Implementation

The Actor network is the simplest possible MLP, with 226 parameters in total.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical

class Actor(nn.Module):
    def __init__(self, s_dim, hidden_size=32):
        super(Actor, self).__init__()
        self.fc = nn.Linear(s_dim, hidden_size)  # with s_dim=4: 4*32 + 32 = 160 parameters
        self.out = nn.Linear(hidden_size, 2)     # 32*2 + 2 = 66 parameters

    def forward(self, st):
        st = F.relu(self.fc(st))
        probs = F.softmax(self.out(st), dim=1)   # probabilities over the two discrete actions
        dist = Categorical(probs=probs)
        action = dist.sample()
        a_probs = dist.probs.gather(-1, action.view(-1, 1))
        return action, dist, a_probs
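
As a quick check of the 226-parameter figure, the count can be verified directly (a small sketch, assuming s_dim=4 as in CartPole):

actor = Actor(4)
print(sum(p.numel() for p in actor.parameters()))  # (4*32 + 32) + (32*2 + 2) = 226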

Converting the Network Parameters

PSO expects the parameters being optimized to be a single flat vector, while the Actor's parameters are torch.Tensor objects, so we need conversion functions.

def get_vec_param(model):
    '''
    Return the model's parameters flattened into a single vector.
    '''
    keys = list(model.state_dict().keys())
    param = []
    for key in keys:
        param.extend(model.state_dict()[key].view(-1).numpy())
    return np.array(param)

def vec_param_reverse(model, params):
    '''
    Restore the flat parameter vector into a list of arrays shaped like the model's tensors.
    '''
    keys = list(model.state_dict().keys())
    lens = [0]
    tmp = []
    for key in keys:
        lens.append(lens[-1] + len(model.state_dict()[key].view(-1)))
    for i in range(len(lens) - 1):
        tmp.append(params[lens[i]:lens[i+1]].reshape(model.state_dict()[keys[i]].shape))
    return tmp

def param_update(model, params):
    '''
    Update the model's parameters with params (a list of arrays/tensors).
    '''
    i = 0
    for param in model.parameters():
        param.data.copy_(torch.tensor(params[i]))
        i += 1
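
A quick round-trip check of the three helpers (a sketch, assuming the Actor defined above):

actor = Actor(4)
vec = get_vec_param(actor)                 # flat numpy vector of length 226
tensors = vec_param_reverse(actor, vec)    # list of arrays shaped like the state_dict entries
param_update(actor, tensors)               # write them back into the model
assert np.allclose(get_vec_param(actor), vec)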

Modifying PSO for Reinforcement Learning

The main change is the self.calculate_fitness() function that computes each particle's fitness. For optimizing a continuous function, the fitness can be written directly in numpy; for reinforcement learning, the most straightforward fitness is the episode total reward. The body of calculate_fitness() therefore calls a rollout() function that has the agent play full episodes against the environment and returns the total reward.

There is room for further optimization here. The rollouts for the swarm are run with a for loop, i.e. rollout() is executed serially. As the swarm's average playing ability improves, the total rollout time keeps growing, eventually to an unacceptable degree.

There are two remedies: cap the maximum number of frames per episode, and parallelize the rollouts, e.g. with the multiprocessing module (see the sketch below).
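
A minimal sketch of the second remedy, parallel fitness evaluation with multiprocessing (not the implementation used in this post): evaluate_particle and calculate_fitness_parallel are hypothetical names, each worker builds its own CartPole-v1 environment and Actor, and the helpers vec_param_reverse/param_update from above are assumed to be importable by the worker processes.

import multiprocessing as mp
import numpy as np
import gym
import torch

def evaluate_particle(param_vec, num_episode=2, max_frames=2000):
    # Build a private env and Actor in each worker process.
    env = gym.make('CartPole-v1')
    actor = Actor(env.observation_space.shape[0])
    param_update(actor, vec_param_reverse(actor, np.array(param_vec)))
    total_r = []
    for _ in range(num_episode):
        st = env.reset()
        done, reward = False, []
        while not done and len(reward) <= max_frames:
            at, _, _ = actor(torch.FloatTensor(st).view(1, -1))
            st, rt, done, _ = env.step(at.item())
            reward.append(rt)
        total_r.append(np.sum(reward))
    return -np.mean(total_r)  # PSO minimizes, so return the negative average reward

def calculate_fitness_parallel(x, workers=4):
    # x has shape (population_size, dim); evaluate each particle in a separate process.
    with mp.Pool(workers) as pool:
        return np.array(pool.map(evaluate_particle, list(x)))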

class PSO(object):
    def __init__(self, population_size, max_steps, dim=226, x_bound=[-10,10]):
        self.w = 0.6                            # inertia weight
        self.c1 = self.c2 = 1
        self.population_size = population_size  # number of particles
        self.dim = dim                          # dimensionality of the search space
        self.max_steps = max_steps              # number of iterations
        self.x_bound = x_bound                  # bounds of the solution space
        self.x = np.random.uniform(self.x_bound[0], self.x_bound[1],  # can also be written as: np.random.uniform([-1,10,20],[1,15,25],(3,3))
                                   (self.population_size, self.dim))  # initialize particle positions
        self.v = np.random.rand(self.population_size, self.dim)       # initialize particle velocities
        fitness = self.calculate_fitness(self.x, actor)
        self.p = self.x                         # each particle's best position
        self.pg = self.x[np.argmin(fitness)]    # global best position
        self.individual_best_fitness = fitness  # each particle's best fitness
        self.global_best_fitness = np.min(fitness)  # global best fitness
        
        
        # self.actor = Actor(self.env.observation_space.shape[0])
        
    def calculate_fitness(self, x ,actor):
        fitness = []
        for i in range(self.population_size):
            params = vec_param_reverse(actor,x[i])
            param_update(actor,params)
            reward = self.rollout(env,actor)
            fitness.append(-reward)  # PSO minimizes, so use the negative total reward
        return np.array(fitness)
        # return np.sum(pow((x-1),2), axis=1)
        # return value_net(torch.tensor(x).to(device))
    
    def rollout(self,env,actor,num_episode=2):
        total_r = []
        for episode in range(num_episode):
            st = env.reset()
            done = False
            reward = []
            while not done and len(reward)<=2000:  # cap episode length at 2000 frames
                at,dist,a_probs = actor(torch.FloatTensor(st).view(1,-1))
                st_,rt,done,_ = env.step(at.item())
                st = st_
                reward.append(rt)
            total_r.append(np.sum(reward))
        return np.mean(total_r)
    
    def evolve(self):
        # fig = plt.figure()  # plotting is slow
        for step in range(self.max_steps):
            r1 = np.random.rand(self.population_size, self.dim)
            r2 = np.random.rand(self.population_size, self.dim)
            # update velocities and positions
            self.v = self.w*self.v+self.c1*r1*(self.p-self.x)+self.c2*r2*(self.pg-self.x)
            self.x = self.v + self.x
            # plt.clf()
            # plt.scatter(self.x[:, 0], self.x[:, 1], s=30, color='k')
            # plt.xlim(self.x_bound[0], self.x_bound[1])
            # plt.ylim(self.x_bound[0], self.x_bound[1])
            # plt.pause(0.0001)
            fitness = self.calculate_fitness(self.x,actor)
            # particles whose personal best needs updating
            update_id = np.greater(self.individual_best_fitness, fitness)
            self.p[update_id] = self.x[update_id]
            self.individual_best_fitness[update_id] = fitness[update_id]
            # a smaller fitness appeared in the new generation, so update the global best fitness and position
            if np.min(fitness) < self.global_best_fitness:
                self.pg = self.x[np.argmin(fitness)]
                self.global_best_fitness = np.min(fitness)
        print('best fitness: %.5f, mean fitness: %.5f' % (self.global_best_fitness, np.mean(fitness)))

    def evolve_step(self):
        r1 = np.random.rand(self.population_size, self.dim)
        r2 = np.random.rand(self.population_size, self.dim)
        # update velocities and positions
        self.v = self.w*self.v+self.c1*r1*(self.p-self.x)+self.c2*r2*(self.pg-self.x)
        self.x = self.v + self.x
        fitness = self.calculate_fitness(self.x,actor)
        # particles whose personal best needs updating
        update_id = np.greater(self.individual_best_fitness, fitness)
        self.p[update_id] = self.x[update_id]
        self.individual_best_fitness[update_id] = fitness[update_id]
        # a smaller fitness appeared in the new generation, so update the global best fitness and position
        if np.min(fitness) < self.global_best_fitness:
            self.pg = self.x[np.argmin(fitness)]
            self.global_best_fitness = np.min(fitness)
        print('best fitness: %.5f, mean fitness: %.5f' % (self.global_best_fitness, np.mean(fitness)))
        

Training

import time

def test_PSO():
    pso = PSO(100, 50)
    t1 = time.time()
    for i in range(pso.max_steps):
        pso.evolve_step()
    t2 = time.time()
    print('best fitness: %.5f' % (pso.global_best_fitness))
    print((t2-t1)*1000,pso.pg)
    # plt.show()
test_PSO()

Training curve: 50 iterations, with episode length capped at 2000 frames. The global best solution seems to depend mainly on the parameter initialization rather than on the training itself; the swarm's average performance, however, does depend on the evolution process, and c1 and c2 are among the key hyperparameters affecting it.
[Figure: PSO training curve]

Testing the Trained Agent

param = [ -6.87896184  , 5.31828134 ,  7.82629271 , 11.49515893 ,  1.0271663,
   7.80245951 ,  0.86431302  , 6.65732013 , -7.05834392 , -2.77873133,
   0.2737117  ,-11.41249693  ,-4.17671666 , 20.82454274 , 10.21621312,
   6.25155418 ,  6.84601883  ,-4.87639211  , 5.48519695,  -1.1041995,
  -1.27808745 ,  4.50248951  , 0.90334966 , -0.57488323 , -7.23400948,
   1.6727567  , -4.32737899  , 8.7040376  ,  1.26843809 , -3.85637248,
   5.37430959 ,  3.09935354  ,-2.00124693 ,  0.48069837 ,  6.6492788,
   4.11648383 , -1.59672105  ,-9.26551789  , 5.63647067 , -4.79751455,
  -1.28522267 ,  0.2746853   , 7.12620095  , 9.91123101  , 3.31125785,
  -0.98761396 ,  2.68209021  , 6.72879901 ,  2.17928962  , 0.43484967,
  -0.17051361 ,  7.18574405  ,-9.16824552 ,  6.85580742  , 0.39356039,
  -1.91960579 , -5.28012928  ,-6.64585406 , -2.88104786 , -0.93664817,
   4.62139195 ,  5.74788331  ,19.06988875 , 10.51456934 ,  7.28021944,
  -5.25869966 , 13.50789934   ,3.16167996 ,  2.1250155  ,  2.96896245,
  -5.15416997 ,  5.75334791   ,3.66229713 , 10.12652344 , -2.57011141,
 -20.84619964 ,  4.10380666  , 0.95247508 , -1.42566661,  -0.47485613,
   3.96775504 ,  4.72049171 ,  6.18003048 ,  9.31993689 , -2.15950689,
   3.97015033 , -7.20319404 , -2.86942767 , -1.21129762 , -9.40233665,
  -8.28052152 , -3.82436772 ,-11.51163202 , -6.73873155 ,-14.00143546,
   2.42079231 , 10.47531365  ,-9.64212868 ,  5.96629151, -10.68158861,
   0.46462626 ,  3.36130207  ,-9.50256154 ,  7.01769215,  -3.01207241,
  -7.82379394 , 11.10809385  , 5.47243914 ,  7.6781193 ,  -1.87116829,
   5.90623753 ,  5.61624662  ,-5.93666653 , -0.37315959 , 10.06546935,
  -8.92483729 ,  4.01005289  , 6.27772565 , -6.33809662 , -3.60532543,
  17.33469986 ,  1.76190327  ,-2.94241235 ,  1.61809649,  -5.57822847,
 -14.38582429 ,  2.03545268  ,-8.58136404 ,  5.07029378,   1.00060675,
  14.48148578 ,  7.69070164  , 2.18766084 ,  5.80161216 , -6.95092303,
  -2.524429   ,  6.21825582  , 8.88038541 , -1.4686213 ,   3.60089002,
   8.17486288 , -0.89304728  ,-0.70482453 ,  8.03158443 ,  8.69518794,
  -0.97965991 ,  3.51722298  , 6.46690211 ,  8.70787085 , -1.83222077,
   8.08140153 , -2.80639208  ,-0.72808167 , -3.83116559 ,  6.31652989,
  -5.77773393 ,  2.96048317  ,16.02955236 , -3.59239226, -11.55725956,
  -5.55743526 ,  3.73823912  ,-1.62886117 ,  1.80684061 , -4.94731609,
   6.39458554 , 10.81374966  , 6.53170828  ,-9.02887258 ,  4.61511603,
   4.45396772 ,  6.5356133   ,-9.79741849 ,  1.92043783 ,  0.94549931,
  -6.93996886 , -2.96215584  , 5.77684539 ,  2.5490333  ,  6.21542846,
  -1.23416605 , -4.91964363  , 0.80911777 , -6.10358595 ,  0.35812193,
  -5.35343233 , -0.96369554  ,-7.37074233 ,  0.6976975 ,   4.99837632,
   6.31696881 ,  3.4126433   ,-2.76266351 , 10.23040262 , -1.80379426,
   4.65767489  ,-4.49810261 ,  0.95758551 ,  5.79889708 ,  1.49071184,
  -7.61360839 ,  2.28477062,  -8.06971786 , -8.10108671 ,  5.73477901,
   2.52718578 , -0.33666324 , 17.12996485, -11.49077889 , -0.21451774,
  -5.13595118 , -6.2628335 ,   3.28553963 , -5.05751852 , -5.91102513,
   3.36703182 , -4.35517555 ,  4.82459182 , -7.18761084  , 4.47127266,
  17.14806542 ,  4.49706897  , 7.66597588 , 15.41274939 , 18.78789285,
  -7.43182869]
import gym

actor = Actor(4)
env = gym.make('CartPole-v1')
env = env.unwrapped
params = vec_param_reverse(actor,np.array(param))
param_update(actor,params)

for episode in range(10):
    st = env.reset()
    env.render()
    done = False
    reward = []
    while not done and len(reward)<=2000:
        at,dist,a_probs = actor(torch.FloatTensor(st).view(1,-1))
        st_,rt,done,_ = env.step(at.item())
        env.render()
        st = st_
        reward.append(rt)
    print(np.sum(reward))