Commit

Merge pull request #76 from hill-a/hotfix-ppo2
Hotfix ppo2
araffin authored Nov 7, 2018
2 parents 795877e + 33c418b commit 9f36c9a
Showing 3 changed files with 12 additions and 6 deletions.
6 changes: 6 additions & 0 deletions docs/misc/changelog.rst
@@ -5,6 +5,12 @@ Changelog

For download links, please look at `Github release page <https://github.com/hill-a/stable-baselines/releases>`_.

+Release 2.2.0 (2018-11-07)
+--------------------------
+
+- Hotfix for ppo2, the wrong placeholder was used for the value function
+
+
Release 2.1.2 (2018-11-06)
--------------------------

2 changes: 1 addition & 1 deletion stable_baselines/__init__.py
@@ -11,7 +11,7 @@
from stable_baselines.ppo2 import PPO2
from stable_baselines.trpo_mpi import TRPO

__version__ = "2.1.2"
__version__ = "2.2.0"


# patch Gym spaces to add equality functions, if not implemented
10 changes: 5 additions & 5 deletions stable_baselines/ppo2/ppo2.py
@@ -140,9 +140,9 @@ def setup_model(self):
neglogpac = train_model.proba_distribution.neglogp(self.action_ph)
self.entropy = tf.reduce_mean(train_model.proba_distribution.entropy())

-vpred = train_model.value_fn
+vpred = train_model._value
vpredclipped = self.old_vpred_ph + tf.clip_by_value(
-    train_model.value_fn - self.old_vpred_ph, - self.clip_range_ph, self.clip_range_ph)
+    train_model._value - self.old_vpred_ph, - self.clip_range_ph, self.clip_range_ph)
vf_losses1 = tf.square(vpred - self.rewards_ph)
vf_losses2 = tf.square(vpredclipped - self.rewards_ph)
self.vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
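
For reference, here is a minimal NumPy sketch of the clipped value loss built in this hunk (the function name and argument names are illustrative; `values`, `old_values`, `returns`, and `clip_range` stand in for `vpred`, `old_vpred_ph`, `rewards_ph`, and `clip_range_ph`):

    import numpy as np

    def clipped_value_loss(values, old_values, returns, clip_range):
        # Clip the new value prediction so it moves at most +/- clip_range from the old one
        values_clipped = old_values + np.clip(values - old_values, -clip_range, clip_range)
        vf_losses1 = np.square(values - returns)          # unclipped squared error
        vf_losses2 = np.square(values_clipped - returns)  # clipped squared error
        # Element-wise maximum of the two errors, then the mean (pessimistic bound, as in PPO)
        return 0.5 * np.mean(np.maximum(vf_losses1, vf_losses2))

    # Example: the first prediction moved further than clip_range from its old value
    print(clipped_value_loss(np.array([1.5, 0.2]), np.array([1.0, 0.1]),
                             np.array([1.2, 0.3]), clip_range=0.2))
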
@@ -265,11 +265,11 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_lo
t_first_start = time.time()

nupdates = total_timesteps // self.n_batch
-for update in range(nupdates + 1):
+for update in range(1, nupdates + 1):
assert self.n_batch % self.nminibatches == 0
n_batch_train = self.n_batch // self.nminibatches
t_start = time.time()
-frac = 1.0 - (update / (nupdates + 1))
+frac = 1.0 - (update - 1.0) / nupdates
lr_now = self.learning_rate(frac)
cliprangenow = self.cliprange(frac)
# true_reward is the reward without discount
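
As a quick standalone check of the corrected schedule (the `nupdates` value below is arbitrary), the 1-based loop runs exactly `nupdates` updates and `frac` decays linearly from 1.0 on the first update down to `1/nupdates` on the last:

    nupdates = 4  # arbitrary small value for illustration

    # Corrected schedule from this hunk: 1-based update index, frac in (0, 1]
    for update in range(1, nupdates + 1):
        frac = 1.0 - (update - 1.0) / nupdates
        print(update, frac)
    # prints: 1 1.0, 2 0.75, 3 0.5, 4 0.25
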
@@ -319,7 +319,7 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_lo
if callback is not None:
callback(locals(), globals())

-if self.verbose >= 1 and ((update + 1) % log_interval == 0 or update == 0):
+if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
explained_var = explained_variance(values, returns)
logger.logkv("serial_timesteps", (update + 1) * self.n_steps)
logger.logkv("nupdates", (update + 1))