diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst
index c0557293c8..2afd870378 100644
--- a/docs/misc/changelog.rst
+++ b/docs/misc/changelog.rst
@@ -5,6 +5,12 @@ Changelog
 
 For download links, please look at `Github release page `_.
 
+Release 2.2.0 (2018-11-07)
+--------------------------
+
+- Hotfix for ppo2, the wrong placeholder was used for the value function
+
+
 Release 2.1.2 (2018-11-06)
 --------------------------
 
diff --git a/stable_baselines/__init__.py b/stable_baselines/__init__.py
index e974c952e7..585a64f009 100644
--- a/stable_baselines/__init__.py
+++ b/stable_baselines/__init__.py
@@ -11,7 +11,7 @@
 from stable_baselines.ppo2 import PPO2
 from stable_baselines.trpo_mpi import TRPO
 
-__version__ = "2.1.2"
+__version__ = "2.2.0"
 
 
 # patch Gym spaces to add equality functions, if not implemented
diff --git a/stable_baselines/ppo2/ppo2.py b/stable_baselines/ppo2/ppo2.py
index 69a58d568f..1d93b9078d 100644
--- a/stable_baselines/ppo2/ppo2.py
+++ b/stable_baselines/ppo2/ppo2.py
@@ -140,9 +140,9 @@ def setup_model(self):
                     neglogpac = train_model.proba_distribution.neglogp(self.action_ph)
                     self.entropy = tf.reduce_mean(train_model.proba_distribution.entropy())
 
-                    vpred = train_model.value_fn
+                    vpred = train_model._value
                     vpredclipped = self.old_vpred_ph + tf.clip_by_value(
-                        train_model.value_fn - self.old_vpred_ph, - self.clip_range_ph, self.clip_range_ph)
+                        train_model._value - self.old_vpred_ph, - self.clip_range_ph, self.clip_range_ph)
                     vf_losses1 = tf.square(vpred - self.rewards_ph)
                     vf_losses2 = tf.square(vpredclipped - self.rewards_ph)
                     self.vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
@@ -265,11 +265,11 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_lo
             t_first_start = time.time()
 
             nupdates = total_timesteps // self.n_batch
-            for update in range(nupdates + 1):
+            for update in range(1, nupdates + 1):
                 assert self.n_batch % self.nminibatches == 0
                 n_batch_train = self.n_batch // self.nminibatches
                 t_start = time.time()
-                frac = 1.0 - (update / (nupdates + 1))
+                frac = 1.0 - (update - 1.0) / nupdates
                 lr_now = self.learning_rate(frac)
                 cliprangenow = self.cliprange(frac)
                 # true_reward is the reward without discount
@@ -319,7 +319,7 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_lo
                 if callback is not None:
                     callback(locals(), globals())
 
-                if self.verbose >= 1 and ((update + 1) % log_interval == 0 or update == 0):
+                if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                     explained_var = explained_variance(values, returns)
                     logger.logkv("serial_timesteps", (update + 1) * self.n_steps)
                     logger.logkv("nupdates", (update + 1))
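
The indexing change in `learn` above also affects the annealing schedule passed to `learning_rate` and `cliprange`. Below is a small standalone sketch, not part of the patch, comparing the old and new `frac` expressions from the diff; the toy value of `nupdates` is an assumption chosen only for demonstration.

# Illustrative sketch (not part of the patch): how the schedule fraction
# `frac` used for the learning rate and clip range behaves before and
# after the loop fix in PPO2.learn. nupdates = 4 is a toy value.
nupdates = 4

# Old code: update ran over range(nupdates + 1), i.e. 0 .. nupdates,
# giving one optimisation pass more than total_timesteps // n_batch.
old_fracs = [1.0 - (update / (nupdates + 1)) for update in range(nupdates + 1)]

# New code: update runs over 1 .. nupdates, so there are exactly
# nupdates passes and the first one uses the full rate (frac = 1.0).
new_fracs = [1.0 - (update - 1.0) / nupdates for update in range(1, nupdates + 1)]

print(len(old_fracs), old_fracs)  # 5 schedule points (one too many), ~[1.0, 0.8, 0.6, 0.4, 0.2]
print(len(new_fracs), new_fracs)  # 4 schedule points, ~[1.0, 0.75, 0.5, 0.25]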