Added policy transformation function
Transforms a matrix of action probabilities into a matrix of numerical preferences; this is the representation that Discrete Policy Gradient works with
dagenaik committed Apr 10, 2024
1 parent b890c63 commit cded5a5
Showing 1 changed file with 20 additions and 0 deletions.
20 changes: 20 additions & 0 deletions src/runner.py
@@ -77,6 +77,24 @@ def shape_policy(self, policy, human_input):
    policy = normalize(policy, axis=1, norm='l1')

    return policy

def policy_to_numerical_preferences(self, policy, environment):
    # Convert a (num_states x num_actions) matrix of action probabilities
    # into the numerical preferences (theta) that Discrete Policy Gradient
    # updates: the centered log-probabilities for each state.
    num_states = environment.observation_space.n
    num_actions = environment.action_space.n

    theta = np.zeros((num_states, num_actions))

    for state in range(num_states):
        mu = policy[state]

        # c centers the log-probabilities so the preferences for each state
        # sum to zero; a softmax over theta[state] still recovers mu.
        log_sum = 0
        for action in range(num_actions):
            log_sum += np.log(mu[action])
        c = (-1 / num_actions) * log_sum

        for action in range(num_actions):
            theta[state, action] = np.log(mu[action]) + c

    return theta

def get_action_probabilities(self, environment, state, policy):
    logits = np.zeros(environment.action_space.n)
@@ -120,6 +138,8 @@ def discrete_policy_grad(self, human_input=None):
    policy = self.shape_policy(policy, human_input)
    logging.debug(policy)

    # policy = self.policy_to_numerical_preferences(policy, environment)

    total_reward, total_successes = [], 0
    for episode in range(self._MAX_EPISODES):
        state = environment.reset()[0]
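The transformation is the inverse of a softmax up to a per-state constant: centering the log-probabilities makes the preferences in each state sum to zero, while a softmax over them recovers the original policy. A minimal, standalone sketch of that round trip (not part of this commit; it assumes a NumPy-only setting and a toy two-state policy):

import numpy as np

# Toy policy: each row is one state's action-probability distribution.
policy = np.array([[0.10, 0.20, 0.70],
                   [0.25, 0.25, 0.50]])
num_actions = policy.shape[1]

# Same transformation as policy_to_numerical_preferences, vectorized:
# theta[s, a] = log(mu[s, a]) - (1 / num_actions) * sum_a log(mu[s, a])
log_policy = np.log(policy)
theta = log_policy - log_policy.sum(axis=1, keepdims=True) / num_actions

# Preferences are zero-centered per state ...
assert np.allclose(theta.sum(axis=1), 0.0)
# ... and a softmax over them recovers the original probabilities.
recovered = np.exp(theta) / np.exp(theta).sum(axis=1, keepdims=True)
assert np.allclose(recovered, policy)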
