Added policy transformation function
Transforms a matrix of action probabilities into a matrix of numerical preferences; this is the representation that Discrete Policy Gradient works with
dagenaik committed Apr 10, 2024
1 parent b890c63 commit cded5a5
Showing 1 changed file with 20 additions and 0 deletions.
20 changes: 20 additions & 0 deletions src/runner.py
@@ -77,6 +77,24 @@ def shape_policy(self, policy, human_input):
    policy = normalize(policy, axis=1, norm='l1')

    return policy

def policy_to_numerical_preferences(self, policy, environment):
    # Convert a (num_states x num_actions) matrix of action probabilities
    # into the numerical preferences (theta) that Discrete Policy Gradient
    # updates: the centered log-probabilities for each state.
    num_states = environment.observation_space.n
    num_actions = environment.action_space.n

    theta = np.zeros((num_states, num_actions))

    for state in range(num_states):
        mu = policy[state]

        # c centers the log-probabilities so the preferences for each state
        # sum to zero; a softmax over theta[state] still recovers mu.
        log_sum = 0
        for action in range(num_actions):
            log_sum += np.log(mu[action])
        c = (-1 / num_actions) * log_sum

        for action in range(num_actions):
            theta[state, action] = np.log(mu[action]) + c

    return theta

def get_action_probabilities(self, environment, state, policy):
    logits = np.zeros(environment.action_space.n)
@@ -120,6 +138,8 @@ def discrete_policy_grad(self, human_input=None):
    policy = self.shape_policy(policy, human_input)
    logging.debug(policy)

    # policy = self.policy_to_numerical_preferences(policy, environment)

    total_reward, total_successes = [], 0
    for episode in range(self._MAX_EPISODES):
        state = environment.reset()[0]
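The transformation is the inverse of a softmax up to a per-state constant: centering the log-probabilities makes the preferences in each state sum to zero, while a softmax over them recovers the original policy. A minimal, standalone sketch of that round trip (not part of this commit; it assumes a NumPy-only setting and a toy two-state policy):

import numpy as np

# Toy policy: each row is one state's action-probability distribution.
policy = np.array([[0.10, 0.20, 0.70],
                   [0.25, 0.25, 0.50]])
num_actions = policy.shape[1]

# Same transformation as policy_to_numerical_preferences, vectorized:
# theta[s, a] = log(mu[s, a]) - (1 / num_actions) * sum_a log(mu[s, a])
log_policy = np.log(policy)
theta = log_policy - log_policy.sum(axis=1, keepdims=True) / num_actions

# Preferences are zero-centered per state ...
assert np.allclose(theta.sum(axis=1), 0.0)
# ... and a softmax over them recovers the original probabilities.
recovered = np.exp(theta) / np.exp(theta).sum(axis=1, keepdims=True)
assert np.allclose(recovered, policy)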
