Added more German comments. Updated README.
chucnorrisful committed Mar 8, 2019
1 parent 3fcb431 commit 90f15a8
Showing 8 changed files with 205 additions and 115 deletions.
52 changes: 44 additions & 8 deletions README.md
@@ -8,18 +8,54 @@ and modding it to use the [Rainbow-DQN](https://arxiv.org/abs/1710.02298) alg

- [x] Naive DQN with basic keras-rl dqn agent
- [x] Fully-conv network with 2 outputs (described in [this deepmind paper](https://deepmind.com/documents/110/sc2le.pdf))
- [x] Double DQN
- [x] Dueling DQN
- [x] Prioritized experience replay
- [x] Multi-step learning
- [x] Noisy nets
- [ ] Distributional RL
- [ ] Final [rainbow agent](https://arxiv.org/pdf/1710.02298.pdf)
- [x] Double DQN (described [here](https://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/viewPaper/12389))
- [x] Dueling DQN (described [here](https://arxiv.org/abs/1511.06581); see the sketch after this list)
- [x] Prioritized experience replay (described [here](https://arxiv.org/abs/1511.05952))
- [x] Multi-step learning (described [here](https://arxiv.org/pdf/1710.02298.pdf))
- [x] Noisy nets (described [here](https://arxiv.org/abs/1706.10295))
- [ ] Distributional RL - runs, but does not learn (described [here](https://dl.acm.org/citation.cfm?id=3305428))
- [x] Final [rainbow agent](https://arxiv.org/pdf/1710.02298.pdf) without Distributional RL
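
As a quick reference for the dueling entry above, here is a minimal Keras sketch of a dueling head that combines a state-value stream and an advantage stream via Q = V + A - mean(A). It is an illustration only, not the network used in this repository; the input size, hidden size, and action count are made-up placeholders.

```python
# Illustrative dueling head (Wang et al., 2016); not this repository's network.
# The feature size (256), hidden size (64), and action count (4) are placeholders.
from keras.layers import Input, Dense, Lambda
from keras.models import Model
import keras.backend as K

trunk_out = Input(shape=(256,))                                  # features from a shared trunk (assumed)
value = Dense(1)(Dense(64, activation='relu')(trunk_out))        # state value V(s)
advantage = Dense(4)(Dense(64, activation='relu')(trunk_out))    # advantages A(s, a)

# Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)
q_values = Lambda(lambda t: t[0] + t[1] - K.mean(t[1], axis=1, keepdims=True))([value, advantage])
dueling_head = Model(inputs=trunk_out, outputs=q_values)
```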

---
### Installation:

Make sure you have Python 3.6.

Follow the instructions on the [pysc2](https://github.com/deepmind/pysc2) repository
for installing it, as well as StarCraft2 and the required mini_games maps.

Follow the instructions on the [keras-rl](https://github.com/keras-rl/keras-rl) repository for installation.

Follow the instructions on the [baselines](https://github.com/openai/baselines) repository for installation.

You will also need the following python packages installed:
- tensorflow 1.12 (newer versions currently do not work with CUDA support for me)
- keras 2.2.4
- numpy
- matplotlib

If you want to use a CUDA-capable GPU, install tensorflow-gpu and keras-gpu as well. Make sure you have a
compatible driver, the CUDA toolkit (9.0 works for me), and the cuDNN library (7.1.2 works for me) installed.
This provides a 5x to 20x speedup and is therefore recommended for training.
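
A quick way to check that TensorFlow actually sees the GPU before starting a long training run (a minimal sketch, assuming tensorflow-gpu 1.12 is installed):

```python
# Prints the TensorFlow version and whether a CUDA-capable GPU is visible.
# Assumes TensorFlow 1.12; newer versions expose this via tf.config instead.
import tensorflow as tf

print(tf.VERSION)                  # e.g. "1.12.0"
print(tf.test.is_gpu_available())  # True only if driver, CUDA toolkit, and cuDNN are set up correctly
```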

Running on Linux is also recommended for training, because it is required for running the game headless,
which gives up to a 2x speedup.

Download the project files:
```bash
git clone https://github.com/chucnorrisful/dqn.git
```

The entry point is exec.py - just set some hyperparameters and run it!

The plot.py file provides some visualisation, but you have to manually enter the
path to a log file (created during execution).
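
The exact log format depends on how exec.py is configured; as a hypothetical example, if a run writes a keras-rl FileLogger-style JSON file containing an episode_reward list, a minimal plot could look like this (the file path and key name are assumptions, not necessarily the project's actual format):

```python
# Hypothetical plotting snippet; the real log path and structure may differ.
import json
import matplotlib.pyplot as plt

with open("dqn_log.json") as f:       # placeholder path to a log file created by a run
    log = json.load(f)

plt.plot(log["episode_reward"])       # assumed key; adjust to the actual log contents
plt.xlabel("episode")
plt.ylabel("episode reward")
plt.show()
```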


---
### Challenges and Benchmarks (Deepmind SC2 minigames)

- [x] MoveToBeacon [mean: 26, max: 32]
- [x] MoveToBeacon [mean: 25.64, max: 34]
- [x] CollectMineralShards [mean: 89, max: 120]
- [ ] FindAndDefeatZerglings
- [ ] DefeatRoaches
30 changes: 25 additions & 5 deletions agent2.py
@@ -1,10 +1,8 @@
# -*- coding: utf-8 -*-
import warnings
from copy import deepcopy

import numpy as np
from keras.callbacks import History

from keras.callbacks import History
from rl.callbacks import (
CallbackList,
TestLogger,
@@ -14,8 +12,22 @@
)


# This is a modified version of the keras-rl Agent class, which implements the main loop of the learning
# algorithm, most notably in the fit() function. Only details were modified; the structure of the algorithm is
# unchanged. Only Agent3 is of interest here; Agent2 is merely used as a base class for older versions and
# has not been deleted yet purely for historical reasons.

class Agent3(object):
"""Abstract base class for all implemented agents.
"""Modifizierte Version des Keras-rl core/Agent
Änderungen:
- backward() hat als zusätzliches Argument observation_1;
- Außerdem Änderung des Ende-der-Episode Codes in fit(), welcher nun den RingBuffer der State-Action-Paare leert.
Anmerkung: Da der Code von Keras-rl stammt, habe ich ihn nicht weiter mit Kommentaren versehen abseits
meiner Änderungen.
Abstract base class for all implemented agents.
Each agent interacts with the environment (as defined by the `Env` class) by first observing the
state of the environment. Based on this observation the agent changes the environment by performing
@@ -54,6 +66,7 @@ def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000,
nb_max_episode_steps=None):
"""Trains the agent on the given environment.
Modified, see the note at the top of the class.
# Arguments
env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
@@ -191,6 +204,7 @@ def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
# Force a terminal state.
done = True
# backward now takes the additional argument observation_1
metrics = self.backward(reward, terminal=done, observation_1=observation)
episode_reward += reward

@@ -227,6 +241,7 @@ def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,

episode += 1
observation = None
# Clear the ring buffer!
for _ in range(self.recent.maxlen):
self.recent.append(None)
episode_step = None
@@ -241,6 +256,7 @@ def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,

return history

# Changes to support the modified version of backward(); otherwise unchanged.
def test(self, env, nb_episodes=1, action_repetition=1, callbacks=None, visualize=True,
nb_max_episode_steps=None, nb_max_start_steps=0, start_step_policy=None, verbose=1):
"""Callback that is called before training begins.
@@ -417,6 +433,7 @@ def forward(self, observation):
"""
raise NotImplementedError()

# additional argument observation_1 added!
def backward(self, reward, terminal, observation_1):
"""Updates the agent after having executed the action returned by `forward`.
If the policy is implemented by a neural network, this corresponds to a weight update using back-prop.
@@ -500,7 +517,10 @@ def _on_test_end(self):


class Agent2(object):
"""Abstract base class for all implemented agents.
"""Modifizierte Version des Keras-rl core/Agent
Änderung: backward() hat nun das zusätzliche Argument observation_1.
Abstract base class for all implemented agents.
Each agent interacts with the environment (as defined by the `Env` class) by first observing the
state of the environment. Based on this observation the agent changes the environment by performing
136 changes: 75 additions & 61 deletions env.py
@@ -1,5 +1,4 @@
from rl.core import Env

from pysc2.env import sc2_env
from pysc2.lib import features
from pysc2.lib import actions
@@ -8,7 +7,16 @@
FUNCTIONS = actions.FUNCTIONS


class Sc2Env1Output(Env):
# Environment wrapper for StarCraft2 (pysc2 library).
# Expects as its action the output format of the FullyConv network architecture: a tuple of two arrays:
# - a linear one containing Q-values for each distinct action
# - a two-dimensional one containing Q-values for each coordinate on the screen
# The method action_to_sc2 converts this output into actions usable by pysc2.
# The kind of observation is also defined here; currently two feature layers are passed:
# - feature_screen.player_relative (integer classes 0-3 for (nothing, player, enemy, neutral))
# - feature_screen.selected (1 for the selected unit, 0 for the rest)
# The class implements the keras-rl core/Env interface.
class Sc2Env2Outputs(Env):
last_obs = None

def __init__(self, screen=16, visualize=False, env_name="MoveToBeacon", training=False):
@@ -36,22 +44,23 @@ def __init__(self, screen=16, visualize=False, env_name="MoveToBeacon", training
)

def action_to_sc2(self, act):

real_action = FUNCTIONS.no_op()

# hacked to only move_screen
if 0 < act <= self._SCREEN * self._SCREEN:
if act.action == 1:
if 331 in self.last_obs.observation.available_actions:
arg = act - 1
x = int(arg / self._SCREEN)
y = arg % self._SCREEN
real_action = FUNCTIONS.Move_screen("now", (y, x))

elif self._SCREEN * self._SCREEN < act < self._SCREEN * self._SCREEN * 2:
# if FUNCTIONS.select_point.id in self.last_obs.observation.available_actions:
arg = act - 1 - self._SCREEN * self._SCREEN
x = int(arg / self._SCREEN)
y = arg % self._SCREEN
real_action = FUNCTIONS.select_point("toggle", (y, x))
real_action = FUNCTIONS.Move_screen("now", (act.coords[1], act.coords[0]))

elif act.action == 2:

real_action = FUNCTIONS.select_point("toggle", (act.coords[1], act.coords[0]))

elif act.action == 0:
pass
else:
print(act.action, "wtf")
assert False

return real_action

@@ -62,21 +71,27 @@ def step(self, action):

observation = self.env.step(actions=(real_action,))
self.last_obs = observation[0]
small_observation = [observation[0].observation.feature_screen.player_relative, observation[0].observation.feature_screen.selected]

# small_observation = observation[0].observation.feature_screen.unit_density
small_observation = [observation[0].observation.feature_screen.player_relative,
observation[0].observation.feature_screen.selected]

return small_observation, observation[0].reward, observation[0].last(), {}

def reset(self):
observation = self.env.reset()

if self._TRAINING and np.random.random_integers(1, 1) == 1:
if self._TRAINING and np.random.random_integers(0, 1) == 4:
ys, xs = np.where(observation[0].observation.feature_screen.player_relative == 1)
observation = self.env.step(actions=(FUNCTIONS.select_point("toggle", (xs[0], ys[0])),))

# observation = self.env.step(actions=(FUNCTIONS.select_army()))
observation = self.env.step(actions=(FUNCTIONS.select_army(0),))

self.last_obs = observation[0]
small_observation = np.array([observation[0].observation.feature_screen.player_relative, observation[0].observation.feature_screen.selected])

# small_observation = observation[0].observation.feature_screen.unit_density
small_observation = [observation[0].observation.feature_screen.player_relative,
observation[0].observation.feature_screen.selected]

return small_observation

@@ -117,8 +132,14 @@ def set_visualize(self, visualize: bool):
def set_minimap(self, minimap: int):
self._MINIMAP = minimap

@property
def screen(self):
return self._SCREEN


class Sc2Env2Outputs(Env):
# Same as Sc2Env2Outputs, but with a different output:
# returns ALL screen feature layers (was useful as an experiment, but is currently not used).
class Sc2Env2OutputsFull(Env):
last_obs = None

def __init__(self, screen=16, visualize=False, env_name="MoveToBeacon", training=False):
@@ -167,33 +188,33 @@ def action_to_sc2(self, act):
return real_action

def step(self, action):
# print(action, " ACTION")

real_action = self.action_to_sc2(action)

observation = self.env.step(actions=(real_action,))
self.last_obs = observation[0]

# small_observation = observation[0].observation.feature_screen.unit_density
small_observation = [observation[0].observation.feature_screen.player_relative,
observation[0].observation.feature_screen.selected]
small_observation = observation[0].observation.feature_screen
# small_observation = [observation[0].observation.feature_screen.player_relative,
# observation[0].observation.feature_screen.selected]

return small_observation, observation[0].reward, observation[0].last(), {}

def reset(self):
observation = self.env.reset()
self.env.reset()

if self._TRAINING and np.random.random_integers(0, 1) == 4:
ys, xs = np.where(observation[0].observation.feature_screen.player_relative == 1)
observation = self.env.step(actions=(FUNCTIONS.select_point("toggle", (xs[0], ys[0])),))
# if self._TRAINING and np.random.random_integers(0, 1) == 4:
# ys, xs = np.where(observation[0].observation.feature_screen.player_relative == 1)
# observation = self.env.step(actions=(FUNCTIONS.select_point("toggle", (xs[0], ys[0])),))

observation = self.env.step(actions=(FUNCTIONS.select_army(0),))

self.last_obs = observation[0]

# small_observation = observation[0].observation.feature_screen.unit_density
small_observation = [observation[0].observation.feature_screen.player_relative,
observation[0].observation.feature_screen.selected]
small_observation = observation[0].observation.feature_screen

# small_observation = [observation[0].observation.feature_screen.player_relative,
# observation[0].observation.feature_screen.selected]

return small_observation

@@ -239,7 +260,11 @@ def screen(self):
return self._SCREEN


class Sc2Env2OutputsFull(Env):
# Environment wrapper for StarCraft2 (pysc2 library).
# This version expects as its action a single vector of Q-values for the corresponding actions.
# With the FullyConv architecture, which provides 2 outputs of different dimensions, this is no longer usable.
# Not yet deleted, for historical reasons.
class Sc2Env1Output(Env):
last_obs = None

def __init__(self, screen=16, visualize=False, env_name="MoveToBeacon", training=False):
@@ -267,23 +292,22 @@ def __init__(self, screen=16, visualize=False, env_name="MoveToBeacon", training
)

def action_to_sc2(self, act):

real_action = FUNCTIONS.no_op()

if act.action == 1:
# hacked to only move_screen
if 0 < act <= self._SCREEN * self._SCREEN:
if 331 in self.last_obs.observation.available_actions:
arg = act - 1
x = int(arg / self._SCREEN)
y = arg % self._SCREEN
real_action = FUNCTIONS.Move_screen("now", (y, x))

real_action = FUNCTIONS.Move_screen("now", (act.coords[1], act.coords[0]))

elif act.action == 2:

real_action = FUNCTIONS.select_point("toggle", (act.coords[1], act.coords[0]))

elif act.action == 0:
pass
else:
print(act.action, "wtf")
assert False
elif self._SCREEN * self._SCREEN < act < self._SCREEN * self._SCREEN * 2:
# if FUNCTIONS.select_point.id in self.last_obs.observation.available_actions:
arg = act - 1 - self._SCREEN * self._SCREEN
x = int(arg / self._SCREEN)
y = arg % self._SCREEN
real_action = FUNCTIONS.select_point("toggle", (y, x))

return real_action

@@ -294,28 +318,21 @@ def step(self, action):

observation = self.env.step(actions=(real_action,))
self.last_obs = observation[0]

small_observation = observation[0].observation.feature_screen
# small_observation = [observation[0].observation.feature_screen.player_relative,
# observation[0].observation.feature_screen.selected]
small_observation = [observation[0].observation.feature_screen.player_relative, observation[0].observation.feature_screen.selected]

return small_observation, observation[0].reward, observation[0].last(), {}

def reset(self):
self.env.reset()
observation = self.env.reset()

# if self._TRAINING and np.random.random_integers(0, 1) == 4:
# ys, xs = np.where(observation[0].observation.feature_screen.player_relative == 1)
# observation = self.env.step(actions=(FUNCTIONS.select_point("toggle", (xs[0], ys[0])),))
if self._TRAINING and np.random.random_integers(1, 1) == 1:
ys, xs = np.where(observation[0].observation.feature_screen.player_relative == 1)
observation = self.env.step(actions=(FUNCTIONS.select_point("toggle", (xs[0], ys[0])),))

observation = self.env.step(actions=(FUNCTIONS.select_army(0),))
# observation = self.env.step(actions=(FUNCTIONS.select_army()))

self.last_obs = observation[0]

small_observation = observation[0].observation.feature_screen

# small_observation = [observation[0].observation.feature_screen.player_relative,
# observation[0].observation.feature_screen.selected]
small_observation = np.array([observation[0].observation.feature_screen.player_relative, observation[0].observation.feature_screen.selected])

return small_observation

@@ -356,6 +373,3 @@ def set_visualize(self, visualize: bool):
def set_minimap(self, minimap: int):
self._MINIMAP = minimap

@property
def screen(self):
return self._SCREEN