(6) Value Function Approximation - LSPI code (6)

This installment covers domain.py, which defines the abstract Domain interface and the ChainDomain environment that LSPI is tested on.

# -*- coding: utf-8 -*-
"""Contains example domains that LSPI works on."""
# This module contains the domains (environments) that LSPI operates on.

import abc  # abc module, used for defining abstract base classes


from random import randint, random

import numpy as np

from sample import Sample


class Domain(object):

    r"""ABC for domains.
    # Domain is an abstract base class (ABC): it defines only the minimum
    # interface that every reinforcement learning domain must implement.
    Minimum interface for a reinforcement learning domain.
    """

    __metaclass__ = abc.ABCMeta  # Python 2 style: use ABCMeta as the metaclass so the methods below are truly abstract

    @abc.abstractmethod  # abstract method
    def num_actions(self):  # number of actions
        """Return number of possible actions for the given domain.
        # Returns how many actions are available in this domain.
        Actions are indexed from 0 to num_actions - 1.
        # (this is how actions are numbered)
        Returns
        -------
        int
            Number of possible actions.
        """
        pass  # pragma: no cover

    @abc.abstractmethod  # abstract method
    def current_state(self):  # the current state
        """Return the current state of the domain.
        # Returns the domain's current state.
        Returns
        -------
        numpy.array
            The current state of the environment expressed as a numpy array
            of the individual state variables.
        """
        pass  # pragma: no cover

    @abc.abstractmethod  # abstract method
    def apply_action(self, action):  # apply an action
        """Apply action and return a sample.
        # Apply the action and return a Sample.
        Parameters
        ----------
        action: int
            The action index to apply. This should be a number in the range
            [0, num_actions())

        Returns
        -------
        sample.Sample
        # NOTE: every Sample bundles the previous state, the action applied,
        # the resulting state, and the reward that was received.
        # NOTE: this method is what actually advances the state of the system.
            Sample containing the previous state, the action applied, the
            received reward and the resulting state.
        """
        pass  # pragma: no cover

    @abc.abstractmethod  # abstract method
    def reset(self, initial_state=None):  # reset
        """Reset the simulator to initial conditions.
        # Reset the simulator back to an initial state.
        Parameters
        ----------
        initial_state: numpy.array
            Optionally specify the state to reset to. If None then the domain
            should use its default initial set of states. The type will
            generally be a numpy.array, but a subclass may accept other types.
            # In short: this puts the system state back to a starting point.

        """
        pass  # pragma: no cover

    @abc.abstractmethod  # abstract method
    def action_name(self, action):  # return the name of an action
        """Return a string representation of the action.
        # Return the action's name.
        Parameters
        ----------
        action: int
            The action index to apply. This number should be in the range
            [0, num_actions())

        Returns
        -------
        str
            String representation of the action index.
        """
        pass  # pragma: no cover


class ChainDomain(Domain):
    # The chain domain. A "domain" is simply the environment the algorithm
    # acts on; besides this chain it could be a cart on a plane, an inverted
    # pendulum, or a more complex robotic system.

    """Chain domain from LSPI paper.
    # The chain domain used in the LSPI paper.
    # A very simple MDP, used to test the LSPI methods and to demonstrate the
    # interface. The state space is a chain of discrete nodes and there are
    # two actions: left and right.
    Very simple MDP. Used to test LSPI methods and demonstrate the interface.
    The state space is a series of discrete nodes in a chain. There are two
    actions: Left and Right. These actions fail with a configurable
    probability. When an action fails, the opposite action is performed. In
    other words if left is the action applied, but it fails, then the agent
    will actually move right (assuming it is not in the right most state).
    # Each action fails with some probability; on failure the agent moves in
    # the opposite direction instead.
    The default reward for any action in a state is 0. There are 2 special
    states that will give a +1 reward for entering. The two special states can
    be configured to appear at the ends of the chain, in the middle, or
    in the middle of each half of the state space.
    # By default every action yields reward 0; entering one of the two special
    # states yields +1. The special states can sit at the ends of the chain,
    # in the middle, or in the middle of each half.
    Parameters
    ----------
    num_states: int  # number of states
        Number of states in the chain. Must be at least 4.
        Defaults to 10 states.
    reward_location: ChainDomain.RewardLocation  # where the +1 rewards are
        Location of the states with +1 rewards
    failure_probability: float  # probability that an action fails
        The probability that the applied action will fail. Must be in range
        [0, 1]

    """

    class RewardLocation(object):
        # The possible reward locations are represented by a small class of
        # integer constants.

        """Location of states giving +1 reward in the chain.

        Ends:
            Rewards will be given at the ends of the chain.
        Middle:
            Rewards will be given at the middle two states of the chain.
        HalfMiddles:
            Rewards will be given at the middle two states of each half
            of the chain.

        """
        # Why these values? range(3) simply assigns the enum-like integer
        # constants 0, 1 and 2 to Ends, Middle and HalfMiddles respectively.
        Ends, Middle, HalfMiddles = range(3)

    __action_names = ['left', 'right']  # name-mangled class attribute ("private" by convention)

    # constructor
    def __init__(self, num_states=10,
                 reward_location=RewardLocation.Ends,
                 failure_probability=.1):
        """Initialize ChainDomain."""
        if num_states < 4:  # validate the constructor arguments
            raise ValueError('num_states must be >= 4')
        if failure_probability < 0 or failure_probability > 1:
            raise ValueError('failure_probability must be in range [0, 1]')
        # member variables: number of states, reward location, failure probability
        self.num_states = int(num_states)
        self.reward_location = reward_location
        self.failure_probability = failure_probability

        self._state = ChainDomain.__init_random_state(num_states)  # random initial state

    def num_actions(self):  # two actions in total: left and right
        """Return number of actions.

        Chain domain has 2 actions.

        Returns
        -------
        int
            Number of actions

        """
        return 2

    def current_state(self):  # the current state
        """Return the current state of the domain.

        Returns
        -------
        numpy.array
            The current state as a 1D numpy vector of type int.

        """
        return self._state  # simply return the member variable

    def apply_action(self, action):  # apply an action
        """Apply the action to the chain.
        # Apply the action to the chain.
        If left is applied then the occupied state index will decrease by 1.
        Unless the agent is already at 0, in which case the state will not
        change.
        # Moving left decreases the state index by 1, unless the agent is
        # already at state 0, in which case the state does not change.
        If right is applied then the occupied state index will increase by 1.
        Unless the agent is already at num_states-1, in which case the state
        will not change.
        # Conversely, moving right increases the state index by 1.
        The reward function is determined by the reward location specified when
        constructing the domain.
        # The reward depends on whether the new state is one of the reward
        # locations chosen at construction time.
        If failure_probability is > 0 then there is the chance for the left
        and right actions to fail. If the left action fails then the agent
        will move right. Similarly if the right action fails then the agent
        will move left.
        # With some probability the action fails and the agent moves the other
        # way.
        Parameters
        ----------
        action: int
            Action index. Must be in range [0, num_actions())

        Returns
        -------
        sample.Sample  # note which fields a Sample carries
            The sample for the applied action.

        Raises
        ------
        ValueError
            If the action index is outside of the range [0, num_actions())

        """
        if action < 0 or action >= 2:  # validate the action index
            raise ValueError('Action index outside of bounds [0, %d)' %
                             self.num_actions())

        action_failed = False  # did the action fail?
        if random() < self.failure_probability:
            action_failed = True  # random failure is part of the system's own dynamics

        # this assumes that the state has one and only one occupied location

        # the next two branches update the state
        if (action == 0 and not action_failed) \
                or (action == 1 and action_failed):
            new_location = max(0, self._state[0]-1)
        else:
            new_location = min(self.num_states-1, self._state[0]+1)

        next_state = np.array([new_location])

        reward = 0  # assign the reward according to whether a reward state was entered
        if self.reward_location == ChainDomain.RewardLocation.Ends:
            if new_location == 0 or new_location == self.num_states-1:
                reward = 1
        elif self.reward_location == ChainDomain.RewardLocation.Middle:
            if new_location == int(self.num_states/2) \
                    or new_location == int(self.num_states/2 + 1):
                reward = 1
        else:  # HalfMiddles case
            if new_location == int(self.num_states/4) \
                    or new_location == int(3*self.num_states/4):
                reward = 1
        # pack the computed values into a Sample
        sample = Sample(self._state.copy(), action, reward, next_state.copy())

        self._state = next_state

        return sample  # return the sample

    def reset(self, initial_state=None):  # reset the state
        """Reset the domain to initial state or specified state.
        # If no concrete state is given, a random initial state is used.
        If the state is unspecified then it will generate a random state, just
        like when constructing from scratch.
        # The given state must have the same shape as the existing state: a
        # length-1 integer array holding the index of the occupied node.
        State must be the same size as the original state, i.e. a 1D array of
        length 1 containing the index of the occupied node. Whatever the numpy
        array type used, it will be converted to an integer numpy array.

        Parameters
        ----------
        initial_state: numpy.array
            The state to set the simulator to. If None then set to a random
            state.

        Raises
        ------
        ValueError
            If the initial state's shape does not match (1, ). In other words
            the initial state must be a 1D numpy array with the same length as
            the existing state.
        ValueError
            If the state value is outside the range [0, num_states).

        """
        if initial_state is None:
            self._state = ChainDomain.__init_random_state(self.num_states)
        else:
            if initial_state.shape != (1, ):
                raise ValueError('The specified state did not match the '
                                 + 'current state size')
            # np.int is deprecated in newer numpy releases, so use the builtin int
            state = initial_state.astype(int)
            if state[0] < 0 or state[0] >= self.num_states:
                raise ValueError('State value must be in range '
                                 + '[0, num_states)')
            self._state = state

    def action_name(self, action):  # return the name of an action
        """Return string representation of actions.

        0:
            left
        1:
            right

        Returns
        -------
        str
            String representation of action.
        """
        return ChainDomain.__action_names[action]  # look up the name-mangled class attribute

    @staticmethod  # static method: produce a random initial state
    def __init_random_state(num_states):
        """Return randomly initialized state of the specified size."""
        return np.array([randint(0, num_states-1)])
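
To see how the pieces fit together, here is a minimal usage sketch. It is not part of domain.py; it assumes sample.py from the earlier installment is importable and that its Sample fields are named state, next_state and reward. It builds a ChainDomain with the +1 rewards in the middle, resets it to the left end, and rolls out a few random actions, printing each returned sample.

# usage_sketch.py - hypothetical driver for domain.py, for illustration only
from random import randint

import numpy as np

from domain import ChainDomain

# a 10-node chain, +1 reward at the two middle states, 10% chance an action flips
domain = ChainDomain(num_states=10,
                     reward_location=ChainDomain.RewardLocation.Middle,
                     failure_probability=0.1)

domain.reset(np.array([0]))  # start deterministically at the left end

for step in range(5):
    action = randint(0, domain.num_actions() - 1)  # pick left (0) or right (1) at random
    sample = domain.apply_action(action)           # advances the chain and returns a Sample
    # state/next_state/reward are the field names assumed for sample.Sample
    print(step, domain.action_name(action),
          sample.state, sample.next_state, sample.reward)

With reward_location=Middle and num_states=10, the reward check in apply_action gives +1 only when the new location is int(10/2) = 5 or int(10/2 + 1) = 6, so most printed rewards will be 0.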