-
Notifications
You must be signed in to change notification settings - Fork 23
/
model_setup.py
21 lines (18 loc) · 988 Bytes
/
model_setup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import tensorflow as tf
class Agent(object):
"""
A simple neural agent. It consists of a set of values for each of the bandits.
Each value is an estimate of the value of the return from choosing the bandit.
We use a policy gradient method to update the agent by moving the value for
the selected action toward the received reward.
"""
def __init__(self, n_bandits, learning_rate=0.001):
# Establish the feed-forward part of the network. This does the actual choosing.
self.w = tf.Variable(tf.ones([n_bandits]))
self.a = tf.argmax(self.w, 0)
self.reward_ph = tf.placeholder(shape=[1], dtype=tf.float32)
self.action_ph = tf.placeholder(shape=[1], dtype=tf.int32)
self.responsible_weight = tf.slice(self.w, self.action_ph, [1])
loss = -tf.log(self.responsible_weight) * self.reward_ph
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
self.update_op = optimizer.minimize(loss)