main.py · 212 lines (161 loc) · 7 KB
# main function that sets up environments
# perform training loop
import envs
from buffer import ReplayBuffer
from maddpg import MADDPG
import torch
import numpy as np
from tensorboardX import SummaryWriter
import os
from utilities import transpose_list, transpose_to_tensor
# keep training awake
from workspace_utils import keep_awake
# for saving gif
import imageio
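# NOTE: envs, buffer, maddpg, utilities and workspace_utils are project-local
# modules rather than pip packages; this script assumes they live alongside
# main.py on the import path.
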
def seeding(seed=1):
    np.random.seed(seed)
    torch.manual_seed(seed)

def pre_process(entity, batchsize):
    # regroup a batch of per-sample [agent0, agent1, agent2] entries into one
    # tensor per agent
    processed_entity = []
    for j in range(3):
        per_agent = []
        for i in range(batchsize):
            b = entity[i][j]
            per_agent.append(b)
        c = torch.Tensor(per_agent)
        processed_entity.append(c)
    return processed_entity
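
# A minimal usage sketch for pre_process (hypothetical names, assuming each
# sample in `batch` holds one entry per agent):
#   batch = [[obs_a0, obs_a1, obs_a2] for _ in range(batchsize)]
#   per_agent_tensors = pre_process(batch, batchsize)  # list of 3 tensors, each of length batchsize
# pre_process is kept for reference; it is not called anywhere in this script.
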
def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # increase this to experiment, e.g. 30000.
    number_of_episodes = 1000

    # The two methods below appear to be copied from the DDPGAgent class
    # (note the super(DDPGAgent, ...) call). Network, OUNoise, hard_update,
    # optim and device are defined in the agent module, not imported here,
    # and nothing in main() ever calls these definitions.
    def __init__(self, in_actor, hidden_in_actor, hidden_out_actor, out_actor,
                 in_critic, hidden_in_critic, hidden_out_critic,
                 lr_actor=1.0e-2, lr_critic=1.0e-2):
        super(DDPGAgent, self).__init__()
        self.actor = Network(in_actor, hidden_in_actor,
                             hidden_out_actor, out_actor, actor=True).to(device)
        self.critic = Network(in_critic, hidden_in_critic,
                              hidden_out_critic, 1).to(device)
        self.target_actor = Network(
            in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.target_critic = Network(
            in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)
        self.noise = OUNoise(out_actor, scale=1.0)
        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = optim.Adam(
            self.critic.parameters(), lr=lr_critic, weight_decay=1.e-5)

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(obs) + noise * self.noise.noise()
        return action

    episode_length = 80
    batchsize = 1000
    # how many episodes to save policy and gif
    save_interval = 1000
    t = 0

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999
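    # noise is multiplied by noise_reduction once per inner time step below, so
    # the exploration scale decays exponentially; it halves after roughly
    # 6,900 decay steps (ln 0.5 / ln 0.9999 ≈ 6931), an approximate figure
    # added here for orientation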

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000*episode_length))
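    # 5000 episodes * 80 steps = 400,000 stored transitions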

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)

    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = ['episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
              pb.Percentage(), ' ', pb.ETA(), ' ', pb.Bar(marker=pb.RotatingMarker()), ' ']
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()
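
    # the loop below advances `episode` by parallel_envs per iteration because
    # each iteration steps parallel_envs environments in lockstep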
    # use keep_awake to keep workspace from disconnecting
    for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):

        timer.update(episode)

        # for calculating rewards for this particular episode - addition of all time steps
        reward_this_episode = np.zeros((parallel_envs, 3))
        all_obs = env.reset()
        obs, obs_full = transpose_list(all_obs)

        # save info or not
        save_info = ((episode) % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):

            t += parallel_envs

            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of list
            # flip the first two indices
            # input to step requires the first index to correspond to number of parallel agents
            actions_for_env = np.rollaxis(actions_array, 1)
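            # assuming each per-agent action tensor is (parallel_envs, action_dim),
            # actions_array is (num_agents, parallel_envs, action_dim) and the
            # rollaxis above yields (parallel_envs, num_agents, action_dim)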

            # step forward one frame
            next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env,
                          rewards, next_obs, next_obs_full, dones)
            buffer.push(transition)

            reward_this_episode += rewards
            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1
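
        # note: episode advances by parallel_envs (4) per iteration and
        # episode_per_update is 2 * parallel_envs (8), so the update block
        # below runs on every other iteration, i.e. once every 8 episodes,
        # once the buffer holds more than batchsize transitions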
        # update once after every episode_per_update
        if len(buffer) > batchsize and episode % episode_per_update < parallel_envs:
            for a_i in range(3):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            maddpg.update_targets()  # soft update the target networks towards the actual networks

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - parallel_envs:
            avg_rewards = [np.mean(agent0_reward), np.mean(agent1_reward),
                           np.mean(agent2_reward)]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)
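        # the logged scalars can be inspected with TensorBoard, e.g.:
        #   tensorboard --logdir ./log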

        # saving model
        save_dict_list = []
        if save_info:
            for i in range(3):
                save_dict = {'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                             'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                             'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                             'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict()}
                save_dict_list.append(save_dict)

            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
                            frames, duration=.04)

    env.close()
    logger.close()
    timer.finish()

if __name__ == '__main__':
    main()
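
# A minimal sketch (an assumption, not executed by this script) of how one of
# the saved checkpoints could be restored later, mirroring the save_dict keys
# written above:
#   checkpoint = torch.load(os.path.join(model_dir, 'episode-0.pt'))  # or any saved episode
#   for i, agent_dict in enumerate(checkpoint):
#       maddpg.maddpg_agent[i].actor.load_state_dict(agent_dict['actor_params'])
#       maddpg.maddpg_agent[i].actor_optimizer.load_state_dict(agent_dict['actor_optim_params'])
#       maddpg.maddpg_agent[i].critic.load_state_dict(agent_dict['critic_params'])
#       maddpg.maddpg_agent[i].critic_optimizer.load_state_dict(agent_dict['critic_optim_params'])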