From 055724aedb496c816b3dbfdddfbc9cf8f87d24d1 Mon Sep 17 00:00:00 2001
From: Vadim Liventsev <dev@vadim.me>
Date: Wed, 14 Feb 2024 18:05:58 +0100
Subject: [PATCH] Simplified agents

---
 examples/agent.py         |  2 +-
 examples/requirements.txt |  2 +-
 programlib/agent.py       | 61 +++++++++++++++------------------------
 pyproject.toml            |  3 +-
 4 files changed, 26 insertions(+), 42 deletions(-)

diff --git a/examples/agent.py b/examples/agent.py
index 9a7e33e..9373c2b 100644
--- a/examples/agent.py
+++ b/examples/agent.py
@@ -13,7 +13,7 @@
 
 env = gym.make('MountainCarContinuous-v0', max_episode_steps=500, render_mode='human')
 program = Program(source=mountain_car_solver, language='Python')
-agent = program.spawn().rl(env.action_space, env.observation_space)
+agent = program.spawn()
 
 obs, info = env.reset()
 print(obs, info)
diff --git a/examples/requirements.txt b/examples/requirements.txt
index 977cd68..907261c 100644
--- a/examples/requirements.txt
+++ b/examples/requirements.txt
@@ -1,3 +1,3 @@
-gym[classic_control]~=0.26
+gymnasium[classic_control]
 GitPython~=3.1
 programlib
\ No newline at end of file
diff --git a/programlib/agent.py b/programlib/agent.py
index 2a7e5d9..994050e 100644
--- a/programlib/agent.py
+++ b/programlib/agent.py
@@ -1,16 +1,15 @@
 import gymnasium as gym
 import numpy as np
 
-def decode_action(action_space, action):
+def decode_action(action):  # str read from program stdout -> flat (1-D) numpy array
     try:
-        a = eval(action)
-
-        if not isinstance(action_space, gym.spaces.Discrete):
-            return [a]
+        x = np.array(eval(action))  # NOTE(security): eval() of program output; run trusted programs only
     except SyntaxError:
-        return list(map(eval, action.split(r'[\p\s]+')))
+        x = np.array([eval(tok) for tok in action.split()])  # fallback: whitespace-separated values, e.g. "1 0"
+
+    return x.reshape(-1)
     
-def encode_obs(obs_space, obs):
+def encode_obs(obs):
     try:
         obs = obs.tolist()
     except AttributeError:
@@ -39,31 +38,12 @@ def act(self, input_lines):
 
         self.process.expect(self.delimiter)
         return self.process.before.decode()
-
-    def rl(self, action_space, obs_space):
-        return RLAgent(self, action_space, obs_space)
-    
-    def close(self):
-        self.process.close()
-        self.program.exitstatus = self.process.exitstatus
     
-    def __del__(self):
-        self.close()
-
-class RLAgent():
-    """
-    Reinforcement Learning Agent: represents a running program for control in
-    an OpenAI gym environment. Mimics the interface of a stable-baselines model.
-    """
-
-    def __init__(self, agent, action_space, obs_space) -> None:
-        self.agent = agent
-        self.action_space = action_space
-        self.obs_space = obs_space
-
     def predict(self, obs, deterministic=True):
         """
-        Predict what the next action should be given the current observation
+        Predict what the next action should be given the current observation.
+        Same as act(), but designed to work with reinforcement learning envs.
+        Mimics the interface of a stable-baselines model.
 
         The observations will be passed to stdin of the program, and the action
         will be read from stdout.
@@ -71,8 +51,8 @@ def predict(self, obs, deterministic=True):
         Parameters
         ----------
         obs - the current observation
-        deterministic - whether to return the action or a pseudo-stochastic
-        vector of action probabilities (one-hot)
+        deterministic - must be True; accepted only for interface
+        compatibility with stable-baselines
 
         Returns (action, state) tuple
         -------
@@ -80,12 +60,17 @@ def predict(self, obs, deterministic=True):
         state - a reference to the process to examine the execution state
         """
 
-        obs_str = encode_obs(self.obs_space, obs)
-        action_str = self.agent.act(obs_str)
-        action = decode_action(self.action_space, action_str)
+        assert deterministic, "Pseudo-stochastic actions not supported"
 
-        if not deterministic:
-            actions_probs = np.zeros(self.action_space.n)
-            actions_probs[action] = 1.0
+        obs_str = encode_obs(obs)
+        action_str = self.act(obs_str)
+        action = decode_action(action_str)
 
-        return action, self.agent.process
\ No newline at end of file
+        return action, self.process
+    
+    def close(self):  # close the agent's process and record how it exited
+        self.process.close()
+        self.program.exitstatus = self.process.exitstatus  # copy the exit status onto self.program
+    
+    def __del__(self):  # finalizer: delegates to close() when the object is destroyed
+        self.close()
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 02ea841..facdc99 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "programlib"
-version = "9.0.3"
+version = "10.0.0"
 description = "Programs as Objects"
 authors = ["Vadim Liventsev <dev@vadim.me>"]
 license = "MIT"
@@ -14,7 +14,6 @@ classifiers = [
 [tool.poetry.dependencies]
 python = "^3.8"
 pexpect = "^4.8.0"
-gymnasium = " >=0.0.0"
 numpy = "^1.24.2"
 pyte = "^0.8.0"
 contextlib-chdir = {version = "^1.0.2", python = "<3.11"}