From 693465a5c58eadc002c77820f631e594ad534562 Mon Sep 17 00:00:00 2001 From: Dmitry Nikulin <1315874+dniku@users.noreply.github.com> Date: Sun, 12 Apr 2020 23:11:51 +0300 Subject: [PATCH] Miscellaneous fixes to spring20 (#360) * [Week 1] Revert use of Video from IPython since it does not exist in Colab * [Week 1] Add policy() function and improve style of the Gym Interface notebook * [Week 4] Improve formatting of the DQN Atari TF notebook * [Week 7] Fix typo in POMDP TF practice notebook * [Week 4] Replace broken import of scipy.misc.imresize with skimage.transform.resize * [Week 6] Style fixes for Reinforce TF notebook * Add spaces around plus sign in video file creation * [Week 1] Reformulate gym_interface notebook for seminar compatibility * [Week 4] Undo more redundant line breaks in Week 4 Atari DQN TF --- week01_intro/deep_crossentropy_method.ipynb | 13 +- week01_intro/seminar_gym_interface.ipynb | 51 ++++++-- week04_approx_rl/homework_lasagne.ipynb | 4 +- week04_approx_rl/homework_pytorch_main.ipynb | 2 +- week04_approx_rl/homework_tf.ipynb | 122 ++++++++---------- week04_approx_rl/seminar_lasagne.ipynb | 2 +- week04_approx_rl/seminar_pytorch.ipynb | 2 +- week04_approx_rl/seminar_tf.ipynb | 2 +- week06_policy_based/reinforce_lasagne.ipynb | 2 +- week06_policy_based/reinforce_pytorch.ipynb | 2 +- .../reinforce_tensorflow.ipynb | 25 ++-- week07_seq2seq/practice_tf.ipynb | 2 +- week08_pomdp/practice_pytorch.ipynb | 4 +- week08_pomdp/practice_tensorflow.ipynb | 4 +- 14 files changed, 127 insertions(+), 110 deletions(-) diff --git a/week01_intro/deep_crossentropy_method.ipynb b/week01_intro/deep_crossentropy_method.ipynb index dc2e66ab2..1fcf88b79 100644 --- a/week01_intro/deep_crossentropy_method.ipynb +++ b/week01_intro/deep_crossentropy_method.ipynb @@ -270,12 +270,19 @@ "outputs": [], "source": [ "# show video\n", - "from IPython.display import Video\n", + "from IPython.display import HTML\n", "import os\n", "\n", - "video_names = list(filter(lambda s: 
s.endswith(\".mp4\"), os.listdir(\"./videos/\")))\n", + "video_names = [\n", + " s for s in os.listdir(\"./videos/\")\n", + " if s.endswith(\".mp4\")\n", + "]\n", "\n", - "Video(\"./videos/\"+video_names[-1]) # this may or may not be _last_ video. Try other indices" + "HTML(\"\"\"\n", + "\n", + "\"\"\".format(\"./videos/\" + video_names[-1])) # this may or may not be the _last_ video. Try other indices" ] }, { diff --git a/week01_intro/seminar_gym_interface.ipynb b/week01_intro/seminar_gym_interface.ipynb index 7fede606c..d3cac0589 100644 --- a/week01_intro/seminar_gym_interface.ipynb +++ b/week01_intro/seminar_gym_interface.ipynb @@ -115,15 +115,11 @@ "source": [ "### Play with it\n", "\n", - "Below is the code that drives the car to the right. \n", - "\n", - "However, it doesn't reach the flag at the far right due to gravity. \n", + "Below is the code that drives the car to the right. However, if you simply use the default policy, the car will not reach the flag at the far right due to gravity.\n", "\n", "__Your task__ is to fix it. Find a strategy that reaches the flag. \n", "\n", - "You're not required to build any sophisticated algorithms for now, feel free to hard-code :)\n", - "\n", - "__Hint__: your action at each step should depend either on `t` or on `s`." + "You are not required to build any sophisticated algorithms for now, feel free to hard-code :)" ] }, { @@ -134,25 +130,52 @@ "source": [ "from IPython import display\n", "\n", - "# create env manually to set time limit. Please don't change this.\n", + "# Create env manually to set time limit. 
Please don't change this.\n", "TIME_LIMIT = 250\n", "env = gym.wrappers.TimeLimit(\n", " gym.envs.classic_control.MountainCarEnv(),\n", " max_episode_steps=TIME_LIMIT + 1,\n", ")\n", - "s = env.reset()\n", - "actions = {'left': 0, 'stop': 1, 'right': 2}\n", - "\n", + "actions = {'left': 0, 'stop': 1, 'right': 2}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def policy(obs, t):\n", + " # Write the code for your policy here. You can use the observation\n", + " # (a tuple of position and velocity), the current time step, or both,\n", + " # if you want.\n", + " position, velocity = obs\n", + " \n", + " # This is an example policy. You can try running it, but it will not work.\n", + " # Your goal is to fix that.\n", + " return actions['right']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "plt.figure(figsize=(4, 3))\n", "display.clear_output(wait=True)\n", "\n", + "obs = env.reset()\n", "for t in range(TIME_LIMIT):\n", " plt.gca().clear()\n", " \n", - " # change the line below to reach the flag\n", - " s, r, done, _ = env.step(actions['right'])\n", + " action = policy(obs, t) # Call your policy\n", + " obs, reward, done, _ = env.step(action) # Pass the action chosen by the policy to the environment\n", + " \n", + " # We don't do anything with reward here because MountainCar is a very simple environment,\n", + " # and reward is a constant -1. 
Therefore, your goal is to end the episode as quickly as possible.\n", "\n", - " # draw game image on display\n", + " # Draw game image on display.\n", " plt.imshow(env.render('rgb_array'))\n", " \n", " display.clear_output(wait=True)\n", @@ -173,7 +196,7 @@ "metadata": {}, "outputs": [], "source": [ - "assert s[0] > 0.47\n", + "assert obs[0] > 0.47\n", "print(\"You solved it!\")" ] } diff --git a/week04_approx_rl/homework_lasagne.ipynb b/week04_approx_rl/homework_lasagne.ipynb index b8fcf5c2b..2aee7c8dd 100644 --- a/week04_approx_rl/homework_lasagne.ipynb +++ b/week04_approx_rl/homework_lasagne.ipynb @@ -480,7 +480,7 @@ "\n", - "\"\"\".format(\"./records/\"+video_names[-1])) # this may or may not be _last_ video. Try other indices" + "\"\"\".format(\"./records/\" + video_names[-1])) # this may or may not be _last_ video. Try other indices" ] }, { @@ -609,7 +609,7 @@ "\n", - "\"\"\".format(\"./videos/\"+video_names[-1])) # this may or may not be _last_ video. Try other indices" + "\"\"\".format(\"./videos/\" + video_names[-1])) # this may or may not be _last_ video. Try other indices" ] }, { diff --git a/week04_approx_rl/homework_pytorch_main.ipynb b/week04_approx_rl/homework_pytorch_main.ipynb index e20832ac2..2f0f36f03 100644 --- a/week04_approx_rl/homework_pytorch_main.ipynb +++ b/week04_approx_rl/homework_pytorch_main.ipynb @@ -1154,7 +1154,7 @@ "\n", - "\"\"\".format(\"./videos/\"+video_names[-1])) # this may or may not be _last_ video. Try other indices" + "\"\"\".format(\"./videos/\" + video_names[-1])) # this may or may not be _last_ video. 
Try other indices" ] }, { diff --git a/week04_approx_rl/homework_tf.ipynb b/week04_approx_rl/homework_tf.ipynb index 9ef465c0a..24b4d18df 100644 --- a/week04_approx_rl/homework_tf.ipynb +++ b/week04_approx_rl/homework_tf.ipynb @@ -75,7 +75,7 @@ "from gym.core import ObservationWrapper\n", "from gym.spaces import Box\n", "\n", - "from scipy.misc import imresize\n", + "from skimage.transform import resize\n", "\n", "\n", "class PreprocessAtari(ObservationWrapper):\n", @@ -92,8 +92,8 @@ " # Here's what you need to do:\n", " # * crop image, remove irrelevant parts\n", " # * resize image to self.img_size\n", - " # (use imresize imported above or any library you want,\n", - " # e.g. opencv, skimage, PIL, keras)\n", + " # (use resize imported above or any library you want,\n", + " # e.g. opencv, PIL, keras)\n", " # * cast image to grayscale\n", " # * convert image pixels to (0,1) range, float32 type\n", "\n", @@ -123,8 +123,7 @@ "assert obs.shape == observation_shape\n", "assert obs.dtype == 'float32'\n", "assert len(np.unique(obs)) > 2, \"your image must not be binary\"\n", - "assert 0 <= np.min(obs) and np.max(\n", - " obs) <= 1, \"convert image pixels to (0,1) range\"\n", + "assert 0 <= np.min(obs) and np.max(obs) <= 1, \"convert image pixels to (0,1) range\"\n", "\n", "print(\"Formal tests seem fine. 
Here's an example of what you'll get.\")\n", "\n", @@ -228,8 +227,7 @@ " \n", "\n", " # prepare a graph for agent step\n", - " self.state_t = tf.placeholder(\n", - " 'float32', [None, ] + list(state_shape))\n", + " self.state_t = tf.placeholder('float32', [None, ] + list(state_shape))\n", " self.qvalues_t = self.get_symbolic_qvalues(self.state_t)\n", "\n", " self.weights = tf.get_collection(\n", @@ -242,8 +240,7 @@ " qvalues = \n", "\n", " assert tf.is_numeric_tensor(qvalues) and qvalues.shape.ndims == 2, \\\n", - " \"please return 2d tf tensor of qvalues [you got %s]\" % repr(\n", - " qvalues)\n", + " \"please return 2d tf tensor of qvalues [you got %s]\" % repr(qvalues)\n", " assert int(qvalues.shape[1]) == n_actions\n", "\n", " return qvalues\n", @@ -259,8 +256,7 @@ " batch_size, n_actions = qvalues.shape\n", " random_actions = np.random.choice(n_actions, size=batch_size)\n", " best_actions = qvalues.argmax(axis=-1)\n", - " should_explore = np.random.choice(\n", - " [0, 1], batch_size, p=[1-epsilon, epsilon])\n", + " should_explore = np.random.choice([0, 1], batch_size, p=[1-epsilon, epsilon])\n", " return np.where(should_explore, random_actions, best_actions)" ] }, @@ -295,8 +291,7 @@ " reward = 0\n", " for _ in range(t_max):\n", " qvalues = agent.get_qvalues([s])\n", - " action = qvalues.argmax(\n", - " axis=-1)[0] if greedy else agent.sample_actions(qvalues)[0]\n", + " action = qvalues.argmax(axis=-1)[0] if greedy else agent.sample_actions(qvalues)[0]\n", " s, r, done, _ = env.step(action)\n", " reward += r\n", " if done:\n", @@ -345,11 +340,9 @@ "exp_replay = ReplayBuffer(10)\n", "\n", "for _ in range(30):\n", - " exp_replay.add(env.reset(), env.action_space.sample(),\n", - " 1.0, env.reset(), done=False)\n", + " exp_replay.add(env.reset(), env.action_space.sample(), 1.0, env.reset(), done=False)\n", "\n", - "obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(\n", - " 5)\n", + "obs_batch, act_batch, reward_batch, 
next_obs_batch, is_done_batch = exp_replay.sample(5)\n", "\n", "assert len(exp_replay) == 10, \"experience replay size should be 10 because that's what maximum capacity is\"" ] @@ -390,28 +383,26 @@ "\n", "# if you're using your own experience replay buffer, some of those tests may need correction.\n", "# just make sure you know what your code does\n", - "assert len(exp_replay) == 10000, \"play_and_record should have added exactly 10000 steps, \"\\\n", - " \"but instead added %i\" % len(exp_replay)\n", + "assert len(exp_replay) == 10000, (\n", + " \"play_and_record should have added exactly 10000 steps, \" +\n", + " \"but instead added %i\") % len(exp_replay)\n", "is_dones = list(zip(*exp_replay._storage))[-1]\n", "\n", - "assert 0 < np.mean(is_dones) < 0.1, \"Please make sure you restart the game whenever it is 'done' and record the is_done correctly into the buffer.\"\\\n", - " \"Got %f is_done rate over %i steps. [If you think it's your tough luck, just re-run the test]\" % (\n", - " np.mean(is_dones), len(exp_replay))\n", + "assert 0 < np.mean(is_dones) < 0.1, (\n", + " \"Please make sure you restart the game whenever it is 'done' \" +\n", + " \"and record the is_done correctly into the buffer.\" +\n", + " \"Got %f is_done rate over %i steps. 
[If you think it's your tough luck, just re-run the test]\"\n", + ") % (np.mean(is_dones), len(exp_replay))\n", "\n", "for _ in range(100):\n", " obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(\n", " 10)\n", " assert obs_batch.shape == next_obs_batch.shape == (10,) + state_dim\n", - " assert act_batch.shape == (\n", - " 10,), \"actions batch should have shape (10,) but is instead %s\" % str(act_batch.shape)\n", - " assert reward_batch.shape == (\n", - " 10,), \"rewards batch should have shape (10,) but is instead %s\" % str(reward_batch.shape)\n", - " assert is_done_batch.shape == (\n", - " 10,), \"is_done batch should have shape (10,) but is instead %s\" % str(is_done_batch.shape)\n", - " assert [int(i) in (0, 1)\n", - " for i in is_dones], \"is_done should be strictly True or False\"\n", - " assert [\n", - " 0 <= a <= n_actions for a in act_batch], \"actions should be within [0, n_actions]\"\n", + " assert act_batch.shape == (10,), \"actions batch should have shape (10,) but is instead %s\" % str(act_batch.shape)\n", + " assert reward_batch.shape == (10,), \"rewards batch should have shape (10,) but is instead %s\" % str(reward_batch.shape)\n", + " assert is_done_batch.shape == (10,), \"is_done batch should have shape (10,) but is instead %s\" % str(is_done_batch.shape)\n", + " assert [int(i) in (0, 1) for i in is_dones], \"is_done should be strictly True or False\"\n", + " assert [0 <= a <= n_actions for a in act_batch], \"actions should be within [0, n_actions]\"\n", "\n", "print(\"Well done!\")" ] @@ -466,8 +457,7 @@ "copy_step = load_weigths_into_target_network(agent, target_network)\n", "sess.run(copy_step)\n", "# check that it works\n", - "sess.run([tf.assert_equal(w, w_target)\n", - " for w, w_target in zip(agent.weights, target_network.weights)])\n", + "sess.run([tf.assert_equal(w, w_target) for w, w_target in zip(agent.weights, target_network.weights)])\n", "print(\"It works!\")" ] }, @@ -510,8 +500,7 @@ "outputs": 
[], "source": [ "current_qvalues = agent.get_symbolic_qvalues(obs_ph)\n", - "current_action_qvalues = tf.reduce_sum(tf.one_hot(\n", - " actions_ph, n_actions) * current_qvalues, axis=1)" + "current_action_qvalues = tf.reduce_sum(tf.one_hot(actions_ph, n_actions) * current_qvalues, axis=1)" ] }, { @@ -546,8 +535,7 @@ "td_loss = (current_action_qvalues - reference_qvalues) ** 2\n", "td_loss = tf.reduce_mean(td_loss)\n", "\n", - "train_step = tf.train.AdamOptimizer(\n", - " 1e-3).minimize(td_loss, var_list=agent.weights)" + "train_step = tf.train.AdamOptimizer(1e-3).minimize(td_loss, var_list=agent.weights)" ] }, { @@ -568,17 +556,12 @@ "for chk_grad in tf.gradients(reference_qvalues, agent.weights):\n", " error_msg = \"Reference q-values should have no gradient w.r.t. agent weights. Make sure you used target_network qvalues! \"\n", " error_msg += \"If you know what you're doing, ignore this assert.\"\n", - " assert chk_grad is None or np.allclose(\n", - " sess.run(chk_grad), sess.run(chk_grad * 0)), error_msg\n", - "\n", - "assert tf.gradients(reference_qvalues, is_not_done)[\n", - " 0] is not None, \"make sure you used is_not_done\"\n", - "assert tf.gradients(reference_qvalues, rewards_ph)[\n", - " 0] is not None, \"make sure you used rewards\"\n", - "assert tf.gradients(reference_qvalues, next_obs_ph)[\n", - " 0] is not None, \"make sure you used next states\"\n", - "assert tf.gradients(reference_qvalues, obs_ph)[\n", - " 0] is None, \"reference qvalues shouldn't depend on current observation!\" # ignore if you're certain it's ok\n", + " assert chk_grad is None or np.allclose(sess.run(chk_grad), sess.run(chk_grad * 0)), error_msg\n", + "\n", + "assert tf.gradients(reference_qvalues, is_not_done)[0] is not None, \"make sure you used is_not_done\"\n", + "assert tf.gradients(reference_qvalues, rewards_ph)[0] is not None, \"make sure you used rewards\"\n", + "assert tf.gradients(reference_qvalues, next_obs_ph)[0] is not None, \"make sure you used next states\"\n", + 
"assert tf.gradients(reference_qvalues, obs_ph)[0] is None, \"reference qvalues shouldn't depend on current observation!\" # ignore if you're certain it's ok\n", "print(\"Splendid!\")" ] }, @@ -598,13 +581,14 @@ "outputs": [], "source": [ "from tqdm import trange\n", + "import pandas as pd\n", "from IPython.display import clear_output\n", "import matplotlib.pyplot as plt\n", - "from pandas import DataFrame\n", - "moving_average = lambda x, span=100, **kw: DataFrame(\n", - " {'x': np.asarray(x)}).x.ewm(span=span, **kw).mean().values\n", "%matplotlib inline\n", "\n", + "def moving_average(x, span=100, **kw):\n", + " return pd.DataFrame({'x': np.asarray(x)}).x.ewm(span=span, **kw).mean().values\n", + "\n", "mean_rw_history = []\n", "td_loss_history = []" ] @@ -620,11 +604,13 @@ "\n", "\n", "def sample_batch(exp_replay, batch_size):\n", - " obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(\n", - " batch_size)\n", + " obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(batch_size)\n", " return {\n", - " obs_ph: obs_batch, actions_ph: act_batch, rewards_ph: reward_batch,\n", - " next_obs_ph: next_obs_batch, is_done_ph: is_done_batch\n", + " obs_ph: obs_batch,\n", + " actions_ph: act_batch,\n", + " rewards_ph: reward_batch,\n", + " next_obs_ph: next_obs_batch,\n", + " is_done_ph: is_done_batch,\n", " }" ] }, @@ -635,29 +621,27 @@ "outputs": [], "source": [ "for i in trange(10**5):\n", - "\n", " # play\n", " play_and_record(agent, env, exp_replay, 10)\n", "\n", " # train\n", - " _, loss_t = sess.run([train_step, td_loss],\n", - " sample_batch(exp_replay, batch_size=64))\n", + " _, loss_t = sess.run([train_step, td_loss], sample_batch(exp_replay, batch_size=64))\n", " td_loss_history.append(loss_t)\n", "\n", " # adjust agent parameters\n", " if i % 500 == 0:\n", - " #load_weigths_into_target_network(agent, target_network)\n", - " # calling 'load_weights_into_target_network' repeatedly cause creating tf 
copy operator\n", - " # again and again, which bloat memory consumption along training step\n", - " # create'copy_step' once\n", + " # You could think that loading weights onto a target network is simply\n", + " # load_weigths_into_target_network(agent, target_network)\n", + " # but actually calling this function repeatedly creates a TF copy operator\n", + " # again and again, which bloats memory consumption with each training step.\n", + " # Instead, you should create 'copy_step' once.\n", " sess.run(copy_step)\n", " agent.epsilon = max(agent.epsilon * 0.99, 0.01)\n", " mean_rw_history.append(evaluate(make_env(), agent, n_games=3))\n", "\n", " if i % 100 == 0:\n", " clear_output(True)\n", - " print(\"buffer size = %i, epsilon = %.5f\" %\n", - " (len(exp_replay), agent.epsilon))\n", + " print(\"buffer size = %i, epsilon = %.5f\" % (len(exp_replay), agent.epsilon))\n", "\n", " plt.subplot(1, 2, 1)\n", " plt.title(\"mean reward per game\")\n", @@ -668,8 +652,7 @@ " plt.figure(figsize=[12, 4])\n", " plt.subplot(1, 2, 2)\n", " plt.title(\"TD loss history (moving average)\")\n", - " plt.plot(moving_average(\n", - " np.array(td_loss_history), span=100, min_periods=100))\n", + " plt.plot(moving_average(np.array(td_loss_history), span=100, min_periods=100))\n", " plt.grid()\n", " plt.show()" ] @@ -748,14 +731,13 @@ "from IPython.display import HTML\n", "import os\n", "\n", - "video_names = list(\n", - " filter(lambda s: s.endswith(\".mp4\"), os.listdir(\"./videos/\")))\n", + "video_names = list(filter(lambda s: s.endswith(\".mp4\"), os.listdir(\"./videos/\")))\n", "\n", "HTML(\"\"\"\n", "\n", - "\"\"\".format(\"./videos/\"+video_names[-1])) # this may or may not be _last_ video. Try other indices" + "\"\"\".format(\"./videos/\" + video_names[-1])) # this may or may not be _last_ video. 
Try other indices" ] }, { diff --git a/week04_approx_rl/seminar_lasagne.ipynb b/week04_approx_rl/seminar_lasagne.ipynb index 7a36e968d..d80665ed7 100644 --- a/week04_approx_rl/seminar_lasagne.ipynb +++ b/week04_approx_rl/seminar_lasagne.ipynb @@ -328,7 +328,7 @@ "\n", - "\"\"\".format(\"./videos/\"+video_names[-1])) # this may or may not be _last_ video. Try other indices" + "\"\"\".format(\"./videos/\" + video_names[-1])) # this may or may not be _last_ video. Try other indices" ] }, { diff --git a/week04_approx_rl/seminar_pytorch.ipynb b/week04_approx_rl/seminar_pytorch.ipynb index e2a5d2dd9..5371147f6 100644 --- a/week04_approx_rl/seminar_pytorch.ipynb +++ b/week04_approx_rl/seminar_pytorch.ipynb @@ -365,7 +365,7 @@ "\n", - "\"\"\".format(\"./videos/\"+video_names[-1])) # this may or may not be _last_ video. Try other indices" + "\"\"\".format(\"./videos/\" + video_names[-1])) # this may or may not be _last_ video. Try other indices" ] }, { diff --git a/week04_approx_rl/seminar_tf.ipynb b/week04_approx_rl/seminar_tf.ipynb index 6358022ce..75813c8b1 100644 --- a/week04_approx_rl/seminar_tf.ipynb +++ b/week04_approx_rl/seminar_tf.ipynb @@ -354,7 +354,7 @@ "\n", - "\"\"\".format(\"./videos/\"+video_names[-1])) #this may or may not be _last_ video. Try other indices" + "\"\"\".format(\"./videos/\" + video_names[-1])) #this may or may not be _last_ video. Try other indices" ] }, { diff --git a/week06_policy_based/reinforce_lasagne.ipynb b/week06_policy_based/reinforce_lasagne.ipynb index 9487ad1b4..3dcb9cefe 100644 --- a/week06_policy_based/reinforce_lasagne.ipynb +++ b/week06_policy_based/reinforce_lasagne.ipynb @@ -429,7 +429,7 @@ "\n", - "\"\"\".format(\"./videos/\"+video_names[-1])) # this may or may not be _last_ video. Try other indices" + "\"\"\".format(\"./videos/\" + video_names[-1])) # this may or may not be _last_ video. 
Try other indices" ] }, { diff --git a/week06_policy_based/reinforce_pytorch.ipynb b/week06_policy_based/reinforce_pytorch.ipynb index 019838259..0ec8f5308 100644 --- a/week06_policy_based/reinforce_pytorch.ipynb +++ b/week06_policy_based/reinforce_pytorch.ipynb @@ -373,7 +373,7 @@ "\n", - "\"\"\".format(\"./videos/\"+video_names[-1])) # this may or may not be the _last_ video. Try other indices" + "\"\"\".format(\"./videos/\" + video_names[-1])) # this may or may not be the _last_ video. Try other indices" ] }, { diff --git a/week06_policy_based/reinforce_tensorflow.ipynb b/week06_policy_based/reinforce_tensorflow.ipynb index eb67a6af1..73b9cf48d 100644 --- a/week06_policy_based/reinforce_tensorflow.ipynb +++ b/week06_policy_based/reinforce_tensorflow.ipynb @@ -114,7 +114,8 @@ "outputs": [], "source": [ "# utility function to pick action in one given state\n", - "def get_action_proba(s): return policy.eval({states: [s]})[0]" + "def get_action_proba(s):\n", + " return policy.eval({states: [s]})[0]" ] }, { @@ -143,7 +144,7 @@ "metadata": {}, "outputs": [], "source": [ - "# get probabilities for parti\n", + "# select log-probabilities for chosen actions, log pi(a_i|s_i)\n", "indices = tf.stack([tf.range(tf.shape(log_policy)[0]), actions], axis=-1)\n", "log_policy_for_actions = tf.gather_nd(log_policy, indices)" ] @@ -226,12 +227,15 @@ "outputs": [], "source": [ "assert len(get_cumulative_rewards(range(100))) == 100\n", - "assert np.allclose(get_cumulative_rewards([0, 0, 1, 0, 0, 1, 0], gamma=0.9), [\n", - " 1.40049, 1.5561, 1.729, 0.81, 0.9, 1.0, 0.0])\n", - "assert np.allclose(get_cumulative_rewards(\n", - " [0, 0, 1, -2, 3, -4, 0], gamma=0.5), [0.0625, 0.125, 0.25, -1.5, 1.0, -4.0, 0.0])\n", - "assert np.allclose(get_cumulative_rewards(\n", - " [0, 0, 1, 2, 3, 4, 0], gamma=0), [0, 0, 1, 2, 3, 4, 0])\n", + "assert np.allclose(\n", + " get_cumulative_rewards([0, 0, 1, 0, 0, 1, 0], gamma=0.9),\n", + " [1.40049, 1.5561, 1.729, 0.81, 0.9, 1.0, 0.0])\n", + "assert 
np.allclose(\n", + " get_cumulative_rewards([0, 0, 1, -2, 3, -4, 0], gamma=0.5),\n", + " [0.0625, 0.125, 0.25, -1.5, 1.0, -4.0, 0.0])\n", + "assert np.allclose(\n", + " get_cumulative_rewards([0, 0, 1, 2, 3, 4, 0], gamma=0),\n", + " [0, 0, 1, 2, 3, 4, 0])\n", "print(\"looks good!\")" ] }, @@ -289,6 +293,7 @@ "\n", " train_step(states, actions, rewards)\n", "\n", + " # technical: return session rewards to print them later\n", " return sum(rewards)" ] }, @@ -308,7 +313,7 @@ " print(\"mean reward:%.3f\" % (np.mean(rewards)))\n", "\n", " if np.mean(rewards) > 300:\n", - " print(\"You Win!\")\n", + " print(\"You Win!\") # but you can train even further\n", " break" ] }, @@ -349,7 +354,7 @@ "\n", - "\"\"\".format(\"./videos/\"+video_names[-1])) # this may or may not be _last_ video. Try other indices" + "\"\"\".format(\"./videos/\" + video_names[-1])) # this may or may not be _last_ video. Try other indices" ] }, { diff --git a/week07_seq2seq/practice_tf.ipynb b/week07_seq2seq/practice_tf.ipynb index 62cfa8225..32fd72689 100644 --- a/week07_seq2seq/practice_tf.ipynb +++ b/week07_seq2seq/practice_tf.ipynb @@ -10,7 +10,7 @@ "\n", " * word (sequence of letters in source language) -> translation (sequence of letters in target language)\n", "\n", - "Unlike what most deep learning practicioners do, we won't only train it to maximize likelihood of correct translation, but also employ reinforcement learning to actually teach it to translate with as few errors as possible.\n", + "Unlike what most deep learning practitioners do, we won't only train it to maximize likelihood of correct translation, but also employ reinforcement learning to actually teach it to translate with as few errors as possible.\n", "\n", "\n", "### About the task\n", diff --git a/week08_pomdp/practice_pytorch.ipynb b/week08_pomdp/practice_pytorch.ipynb index 7e240348c..0f00681e6 100644 --- a/week08_pomdp/practice_pytorch.ipynb +++ b/week08_pomdp/practice_pytorch.ipynb @@ -312,7 +312,7 @@ "\n", - 
"\"\"\".format(\"./kungfu_videos/\"+video_names[-1])) # this may or may not be _last_ video. Try other indices" + "\"\"\".format(\"./kungfu_videos/\" + video_names[-1])) # this may or may not be _last_ video. Try other indices" ] }, { @@ -646,7 +646,7 @@ "\n", - "\"\"\".format(\"./kungfu_videos/\"+video_names[-1])) # this may or may not be _last_ video. Try other indices" + "\"\"\".format(\"./kungfu_videos/\" + video_names[-1])) # this may or may not be _last_ video. Try other indices" ] } ], diff --git a/week08_pomdp/practice_tensorflow.ipynb b/week08_pomdp/practice_tensorflow.ipynb index 0208029ec..d322a36f0 100644 --- a/week08_pomdp/practice_tensorflow.ipynb +++ b/week08_pomdp/practice_tensorflow.ipynb @@ -337,7 +337,7 @@ "\n", - "\"\"\".format(\"./kungfu_videos/\"+video_names[-1])) # this may or may not be _last_ video. Try other indices" + "\"\"\".format(\"./kungfu_videos/\" + video_names[-1])) # this may or may not be _last_ video. Try other indices" ] }, { @@ -585,7 +585,7 @@ "\n", - "\"\"\".format(\"./kungfu_videos/\"+video_names[-1])) # this may or may not be _last_ video. Try other indices" + "\"\"\".format(\"./kungfu_videos/\" + video_names[-1])) # this may or may not be _last_ video. Try other indices" ] }, {