reinforccedAgent.py
import tensorflow as tf
import numpy as np
import time
from tf_agents.networks import network
from tf_agents.environments import tf_py_environment
from tf_agents.policies import q_policy
from tf_agents.metrics import tf_metrics
from tf_agents.drivers import dynamic_episode_driver
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.agents.dqn import dqn_agent
from tf_agents.utils import common
from tf_agents.policies import policy_saver
from battleship2_env import Battleship2
tf.compat.v1.enable_v2_behavior()
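# Build the custom Battleship environment and wrap it as a TFPyEnvironment so
# the TF-Agents driver and agent can interact with it through tensors.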
environment = Battleship2()
tf_env = tf_py_environment.TFPyEnvironment(environment)
action_spec = tf_env.action_spec()
num_actions = action_spec.maximum - action_spec.minimum + 1
# MODEL TWEAKS
FILTERS = 64
REG = None
CHANNEL_TYPE = "channels_last"
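# QNetwork flattens the board observation and passes it through two Dense layers
# (no activation is specified, so they are linear) to a sigmoid output with one
# Q-value per action. The commented-out Conv2D/BatchNorm stack is a disabled
# alternative architecture.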
class QNetwork(network.Network):
    def __init__(self, input_tensor_spec, action_spec, num_actions=num_actions, name=None):
        super(QNetwork, self).__init__(
            input_tensor_spec=input_tensor_spec,
            state_spec=(),
            name=name)
        self._sub_layers = [
            # tf.keras.layers.Conv2D(filters=FILTERS, kernel_size=(3, 3), padding="same", use_bias=False, activation='linear', kernel_regularizer=REG, data_format=CHANNEL_TYPE),
            # tf.keras.layers.BatchNormalization(axis=-1),
            # tf.keras.layers.LeakyReLU(),
            # tf.keras.layers.Conv2D(filters=FILTERS, kernel_size=(3, 3), padding="same", use_bias=False, activation='linear', kernel_regularizer=REG, data_format=CHANNEL_TYPE),
            # tf.keras.layers.BatchNormalization(axis=-1),
            # tf.keras.layers.LeakyReLU(),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(288),
            tf.keras.layers.Dense(416),
            tf.keras.layers.Dense(num_actions, activation='sigmoid'),  # tweak later
        ]

    def call(self, inputs, step_type=None, network_state=()):
        del step_type
        inputs = tf.cast(inputs, tf.float32)
        for layer in self._sub_layers:
            inputs = layer(inputs)
        return inputs, network_state
timSpec = tf_env.time_step_spec()
obsSpec = tf_env.observation_spec()
q_net = QNetwork(input_tensor_spec=obsSpec,action_spec=action_spec)
q_net.create_variables(obsSpec, training=True)
q_net.summary()  # summary() already prints the model; wrapping it in print() just echoes "None"
collect_policy = q_policy.QPolicy(timSpec, action_spec, q_network=q_net)  # renamed so the imported q_policy module is not shadowed
global_step = tf.compat.v1.train.get_or_create_global_step()
# AGENT DEF
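# DqnAgent with 4-step updates: train() will expect trajectories of
# n_step_update + 1 = 5 transitions, matching num_steps=5 in the dataset below.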
agent = dqn_agent.DqnAgent(
    timSpec,
    action_spec,
    n_step_update=4,  # safe?
    q_network=q_net,
    optimizer=tf.keras.optimizers.Adam(0.001),
    train_step_counter=global_step)  # tf.compat.v1.train.AdamOptimizer(0.001)
# BUFFER TWEAKS
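# Note: batch_size and data_spec below are not used; the replay buffer takes its
# spec from agent.collect_data_spec and its batch size from tf_env.batch_size.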
batch_size = 32
MAX_LENGTH = 1000
NUM_GAMES = 15
EPOCHS = 100
data_spec = (action_spec,obsSpec)
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    agent.collect_data_spec, batch_size=tf_env.batch_size, max_length=MAX_LENGTH)
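# Collection metrics: episode count, environment-step count, and a running mean
# of the training loss.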
num_episodes = tf_metrics.NumberOfEpisodes()
env_steps = tf_metrics.EnvironmentSteps()
lossAvg = tf.keras.metrics.Mean()
observers = [num_episodes, env_steps, replay_buffer.add_batch]
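# The driver plays NUM_GAMES full episodes per run() call, updating the metrics
# and pushing each collected trajectory into the replay buffer via add_batch.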
driver = dynamic_episode_driver.DynamicEpisodeDriver(tf_env, collect_policy, observers, num_episodes=NUM_GAMES)  # switch from episode to step
final_time_step, policy_state = driver.run()
# print('final_time_step', final_time_step)
# print('Number of Steps: ', env_steps.result().numpy())
# print('Number of Episodes: ', num_episodes.result().numpy())
# Decorated Function
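# common.function wraps train() in a tf.function so the training step runs in graph mode.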
agent.train = common.function(agent.train)#move
# dataset = replay_buffer.as_dataset(sample_batch_size=1,num_steps=2,single_deterministic_pass=True) # do I want single_deterministic sample_batch_size=1,
# iterator = iter(dataset)
# for _ in range(replay_buffer.num_frames()//2):
# trajectories, _ = next(iterator)
# loss = agent.train(experience=trajectories)
# lossAvg.update_state(loss[0])
# # print(loss[0].numpy())
# replay_buffer.clear()
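# Training loop: collect episodes, train on the buffered trajectories in a single
# deterministic pass of 5-step windows, then clear the buffer before the next round.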
for epoch in range(EPOCHS):
    final_time_step, policy_state = driver.run(final_time_step, policy_state)
    # dataset = replay_buffer.as_dataset(sample_batch_size=1,num_steps=2,single_deterministic_pass=True) # do I want single_deterministic sample_batch_size=1,
    dataset = replay_buffer.as_dataset(sample_batch_size=5, num_steps=5, single_deterministic_pass=True)  # do I want single_deterministic sample_batch_size=1,
    iterator = iter(dataset)
    for comp in iterator:
        trajectories, _ = comp
        loss = agent.train(experience=trajectories)
        lossAvg.update_state(loss[0])
        # print(loss[0].numpy())
    replay_buffer.clear()
    if (epoch + 1) % (NUM_GAMES // 10) == 0:  # NUM_GAMES // 10 == 1, so this logs every epoch; EPOCHS // 10 may have been intended
        print(f"epoch {epoch + 1}: avg loss {lossAvg.result().numpy()}, avg steps/episode {env_steps.result().numpy() / num_episodes.result().numpy()}")
        lossAvg.reset_states()
        env_steps.reset()
        num_episodes.reset()  # reset both counters so the steps-per-episode average covers only this logging interval
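# Persist the full training state (agent, policy, replay buffer, global step) and
# export the trained policy as a SavedModel.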
train_checkpointer = common.Checkpointer(
    ckpt_dir='saved_model/checkpoint2/cp',
    max_to_keep=1,
    agent=agent,
    policy=agent.policy,
    replay_buffer=replay_buffer,
    global_step=global_step)
train_checkpointer.save(global_step)
print("saved checkpoint")
tf_policy_saver = policy_saver.PolicySaver(agent.policy)
tf_policy_saver.save('saved_model/checkpoint2/policy')
print("saved policy")