Preface:

In the previous post we covered the principle of policy evaluation, together with a worked example.
The core goal of reinforcement learning is to find the optimal policy. This post focuses on two topics:

policy improvement
policy iteration and value iteration

Finally, taking the watering environment E below as a running example, a Python implementation is given.
Contents:
1: Policy improvement
2: Policy iteration and value iteration
3: Policy iteration in Python
1 Policy improvement
The ideal policy is one that maximizes the cumulative reward, and the value function associated with that optimal policy is called the optimal value function.
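In the book's notation, the optimal policy and the optimal value function are

$$\pi^{*} = \arg\max_{\pi} \sum_{x \in X} V^{\pi}(x), \qquad \forall x \in X:\; V^{*}(x) = V^{\pi^{*}}(x)$$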
State value function (Bellman equation), summing over actions:

$$V_{T}^{\pi}(x) = \sum_{a \in A} \pi(x,a) \sum_{x' \in X} P_{x \to x'}^{a} \left( \frac{1}{T} R_{x \to x'}^{a} + \frac{T-1}{T} V_{T-1}^{\pi}(x') \right) \tag{16.9}$$

$$V_{\gamma}^{\pi}(x) = \sum_{a \in A} \pi(x,a) \sum_{x' \in X} P_{x \to x'}^{a} \left( R_{x \to x'}^{a} + \gamma V_{\gamma}^{\pi}(x') \right) \tag{16.9}$$
State-action value function:

$$Q_{T}^{\pi}(x,a) = \sum_{x' \in X} P_{x \to x'}^{a} \left( \frac{1}{T} R_{x \to x'}^{a} + \frac{T-1}{T} V_{T-1}^{\pi}(x') \right) \tag{16.10}$$

$$Q_{\gamma}^{\pi}(x,a) = \sum_{x' \in X} P_{x \to x'}^{a} \left( R_{x \to x'}^{a} + \gamma V_{\gamma}^{\pi}(x') \right) \tag{16.10}$$
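As a tiny worked example with hypothetical numbers (not environment E): suppose that from state $x$ action $a$ leads to $x'_1$ with probability 0.5 and reward 1, and to $x'_2$ with probability 0.5 and reward $-1$; with $\gamma = 0.9$, $V_{\gamma}^{\pi}(x'_1) = 2$ and $V_{\gamma}^{\pi}(x'_2) = 0$, equation (16.10) gives

$$Q_{\gamma}^{\pi}(x,a) = 0.5\,(1 + 0.9 \cdot 2) + 0.5\,(-1 + 0.9 \cdot 0) = 1.4 - 0.5 = 0.9$$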
Because the optimal value function already attains the maximal cumulative reward, the Bellman equation above can be modified: replace the sum over actions with a maximum over actions. This gives the optimal Bellman equation:

$$V_{T}^{*}(x) = \max_{a \in A} \sum_{x' \in X} P_{x \to x'}^{a} \left( \frac{1}{T} R_{x \to x'}^{a} + \frac{T-1}{T} V_{T-1}^{*}(x') \right) \tag{16.13}$$

$$V_{\gamma}^{*}(x) = \max_{a \in A} \sum_{x' \in X} P_{x \to x'}^{a} \left( R_{x \to x'}^{a} + \gamma V_{\gamma}^{*}(x') \right) \tag{16.13}$$
Furthermore,

$$V^{*}(x) = \max_{a \in A} Q^{\pi^{*}}(x,a) \tag{16.14}$$

Substituting (16.14) into (16.10) gives the optimal state-action value function:

$$Q_{T}^{*}(x,a) = \sum_{x' \in X} P_{x \to x'}^{a} \left( \frac{1}{T} R_{x \to x'}^{a} + \frac{T-1}{T} \max_{a' \in A} Q_{T-1}^{*}(x',a') \right)$$

$$Q_{\gamma}^{*}(x,a) = \sum_{x' \in X} P_{x \to x'}^{a} \left( R_{x \to x'}^{a} + \gamma \max_{a' \in A} Q_{\gamma}^{*}(x',a') \right)$$
The optimal Bellman equation reveals how a non-optimal policy can be improved: change the action selected by the policy to the currently optimal action. Such a change can only make the policy better. Denote the changed policy by $\pi'$; the condition for switching the action in state $x$ is

$$Q^{\pi}(x, \pi'(x)) \ge V^{\pi}(x)$$

Substituting this into (16.10) yields the recursive inequality

$$V^{\pi}(x) \le Q^{\pi}(x, \pi'(x)) = \sum_{x' \in X} P_{x \to x'}^{\pi'(x)} \left( R_{x \to x'}^{\pi'(x)} + \gamma V^{\pi}(x') \right) \le \sum_{x' \in X} P_{x \to x'}^{\pi'(x)} \left( R_{x \to x'}^{\pi'(x)} + \gamma Q^{\pi}(x', \pi'(x')) \right) \le \cdots = V^{\pi'}(x) \tag{16.16}$$

so the value of the changed policy is no smaller than that of the original policy in every state.
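In code, "switch to the currently best action" is simply an argmax over the state-action values. A minimal sketch, assuming the Q-table is stored as a dict of dicts (the names q and greedy_policy are illustrative and not part of the original program):

# Greedy policy-improvement step.
# Assumption: q maps state -> {action: Q(state, action)}.
def greedy_policy(q):
    # For every state, pick the action with the largest Q value.
    return {state: max(actions, key=actions.get) for state, actions in q.items()}

# Hypothetical two-state Q-table:
q = {"shortWater": {"water": 0.3, "noWater": -40.0},
     "health":     {"water": 0.8, "noWater":   0.1}}
print(greedy_policy(q))   # -> {'shortWater': 'water', 'health': 'water'}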
2 Policy iteration and value iteration
Alternating policy evaluation and policy improvement until the policy no longer changes gives the policy iteration algorithm. As can be seen, policy iteration has to re-evaluate the policy after every improvement step, and is therefore rather time-consuming.

By inequality (16.16), improving the policy and improving the value function are consistent, so the improvement can be applied to the value function directly. From equation (16.13) we obtain the update rule

$$V(x) \leftarrow \max_{a \in A} \sum_{x' \in X} P_{x \to x'}^{a} \left( R_{x \to x'}^{a} + \gamma V(x') \right)$$

which yields the value iteration algorithm; a sketch is given below.
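A minimal value iteration sketch under the γ-discounted formulation. The transition table uses the same [action, probability, next_state, reward] row format as the Env class in the next section; the function value_iteration and the toy two-state example are illustrative and not part of the original code.

def value_iteration(q_table, actions, gamma=0.9, eps=1e-6):
    # q_table maps state -> list of [action, probability, next_state, reward] rows.
    V = {state: 0.0 for state in q_table}           # start from all-zero state values

    def q_value(state, a):
        # Q(state, a) = sum over transitions of p * (r + gamma * V(next_state))
        return sum(p * (r + gamma * V[s2])
                   for act, p, s2, r in q_table[state] if act == a)

    while True:
        delta = 0.0
        for state in q_table:
            # optimal Bellman update: V(x) <- max_a Q(x, a)
            best = max(q_value(state, a) for a in actions)
            delta = max(delta, abs(best - V[state]))
            V[state] = best
        if delta < eps:                             # stop once the values have converged
            break

    # read the greedy policy off the converged values
    policy = {state: max(actions, key=lambda a: q_value(state, a)) for state in q_table}
    return V, policy


# hypothetical two-state example (not environment E):
q_table = {"s1": [["a", 1.0, "s2", 1.0], ["b", 1.0, "s1", 0.0]],
           "s2": [["a", 1.0, "s2", 0.5], ["b", 1.0, "s1", 0.0]]}
print(value_iteration(q_table, actions=["a", "b"]))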
3 Policy iteration: Python implementation
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 1 19:34:00 2023

@author: cxf
"""
import copy
from enum import Enum


class State(Enum):
    # state space X
    shortWater = 1   # short of water
    health = 2       # healthy
    overflow = 3     # over-watered
    apoptosis = 4    # withered


class Action(Enum):
    # action space A
    water = 1        # water the plant
    noWater = 2      # do not water


class Env():
    def __init__(self):
        # state space
        self.X = [State.shortWater, State.health, State.overflow, State.apoptosis]
        # action space
        self.A = [Action.water, Action.noWater]
        # Known model: from state x, taking action a, the plant moves to state x'
        # with probability p and yields reward r. Each row is [action, p, x', r].
        self.Q = {}
        self.Q[State.shortWater] = [[Action.water,   0.5, State.shortWater,   -1],
                                    [Action.water,   0.5, State.health,        1],
                                    [Action.noWater, 0.4, State.shortWater,   -1],
                                    [Action.noWater, 0.6, State.overflow,   -100]]
        self.Q[State.health]     = [[Action.water,   0.6, State.health,        1],
                                    [Action.water,   0.4, State.overflow,     -1],
                                    [Action.noWater, 0.6, State.shortWater,   -1],
                                    [Action.noWater, 0.4, State.health,        1]]
        self.Q[State.overflow]   = [[Action.water,   0.6, State.overflow,     -1],
                                    [Action.water,   0.4, State.apoptosis,  -100],
                                    [Action.noWater, 0.6, State.health,        1],
                                    [Action.noWater, 0.4, State.overflow,     -1]]
        self.Q[State.apoptosis]  = [[Action.water,   1.0, State.apoptosis,  -100],
                                    [Action.noWater, 1.0, State.apoptosis,  -100]]

    def GetX(self):
        # return the state space
        return self.X

    def GetAction(self):
        # return the action space
        return self.A

    def GetQTabel(self):
        # return the transition/reward table
        return self.Q


class LearningAgent():
    def __init__(self):
        env = Env()
        self.X = env.GetX()
        self.A = env.GetAction()
        self.QTabel = env.GetQTabel()
        self.curV = {}   # cumulative reward at step t
        self.V = {}      # cumulative reward at step t-1
        for x in self.X:
            self.V[x] = 0
            self.curV[x] = 0

    def initStrategy(self):
        # initial policy: always water
        strategy = {}
        strategy[State.shortWater] = Action.water
        strategy[State.health] = Action.water
        strategy[State.overflow] = Action.water
        strategy[State.apoptosis] = Action.water
        self.strategy = strategy

    def evaluation(self, T):
        # policy evaluation: T-step cumulative reward of the current policy
        for t in range(1, T + 1):
            for state in self.X:
                reward = 0.0
                action = self.strategy[state]
                for row in self.QTabel[state]:
                    if action == row[0]:
                        # taking action a in state x reaches newstate with prob p and reward r
                        newstate, p_a_ss, r_a_ss = row[2], row[1], row[-1]
                        reward += p_a_ss * ((1.0 / t) * r_a_ss + (1.0 - 1.0 / t) * self.V[newstate])
                self.curV[state] = reward
            # copy (not alias) so the next sweep reads only the values of the previous step
            self.V = copy.deepcopy(self.curV)

    def improve(self, T):
        # policy improvement: in every state pick the action with the largest Q(state, action)
        strategy = {}
        for state in self.X:
            max_reward = -float('inf')
            for action in self.A:
                reward = 0.0
                for row in self.QTabel[state]:
                    if action == row[0]:
                        newstate, p_a_ss, r_a_ss = row[2], row[1], row[-1]
                        reward += p_a_ss * ((1.0 / T) * r_a_ss + (1.0 - 1.0 / T) * self.V[newstate])
                if reward > max_reward:
                    max_reward = reward
                    strategy[state] = action
        return strategy

    def compare(self, dict1, dict2):
        # two policies are equal if they choose the same action in every state
        for key in dict1:
            if dict1[key] != dict2.get(key):
                return False
        return True

    def learn(self, T):
        # policy iteration: evaluate, improve, repeat until the policy is stable
        self.initStrategy()
        n = 0
        while True:
            self.evaluation(T)           # policy evaluation
            n = n + 1
            print("\n iteration %d" % n,
                  State.shortWater.name, "\t reward: %4.2f" % self.V[State.shortWater],
                  State.health.name, "\t reward: %4.2f" % self.V[State.health],
                  State.overflow.name, "\t reward: %4.2f" % self.V[State.overflow],
                  State.apoptosis.name, "\t reward: %4.2f" % self.V[State.apoptosis])
            strategyN = self.improve(T)  # policy improvement
            if self.compare(self.strategy, strategyN):
                print("\n ----- final policy -----\n")
                for state in self.X:
                    print(" state ", state, "\t action: ", self.strategy[state])
                break
            else:
                self.strategy = strategyN


if __name__ == "__main__":
    T = 10
    agent = LearningAgent()
    agent.learn(T)
References:
Zhou Zhihua, Machine Learning (机器学习), Chapter 16 "Reinforcement Learning" (notes via 51CTO blog / CSDN)