mappo_train.py
import torch
import numpy as np
from torch.utils.tensorboard import SummaryWriter
import argparse
from normalization import Normalization, RewardScaling
from replay_buffer import ReplayBuffer
from mappo_agent_gpu import MAPPO_MPE
from env import myEnv
from genMatrix.generate import generateAdj
import wandb
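
# Training entry point for MAPPO on the satellite/data-flow environment (myEnv):
# Runner_MAPPO_MPE collects full episodes into a replay buffer, triggers a MAPPO
# update once `batch_size` episodes have been gathered, and logs evaluation
# rewards to TensorBoard and Weights & Biases.
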
class Runner_MAPPO_MPE:
    def __init__(self, args, env_name, number, seed):
        self.args = args
        # Identifiers used only for logging, file naming, and seeding
        self.env_name = env_name
        self.number = number
        self.seed = seed
        # Set random seed
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        # Create the custom environment; it needs the satellite count, the number of agents (data flows), and the adjacency matrix
        self.env = myEnv(args)
        self.args.N = self.env.agent_num  # Number of agents
        # self.args.obs_dim_n = [self.env.observation_space.shape[0] for i in range(self.args.N)]  # obs dimensions of N agents
        self.args.obs_dim_n = [self.env.observation_space for _ in range(self.args.N)]
        self.args.action_dim_n = [self.env.action_space for _ in range(self.args.N)]
        # self.args.action_dim_n = [self.env.action_space[i].n for i in range(self.args.N)]  # action dimensions of N agents
        # Only for homogeneous-agent environments like Spread in MPE: all agents share the same observation and action space dimensions
        self.args.obs_dim = self.args.obs_dim_n[0]  # The dimension of an agent's observation space
        self.args.action_dim = self.args.action_dim_n[0]  # The dimension of an agent's action space
        self.args.state_dim = np.sum(self.args.obs_dim_n)  # The dimension of the global state space (sum of the local observation dimensions of all agents)
        print("observation_space=", self.env.observation_space)
        print("obs_dim_n={}".format(self.args.obs_dim_n))
        print("action_space=", self.env.action_space)
        print("action_dim_n={}".format(self.args.action_dim_n))

        # Create N agents
        self.agent_n = MAPPO_MPE(self.args)
        self.replay_buffer = ReplayBuffer(self.args)

        # Create a tensorboard writer
        self.writer = SummaryWriter(log_dir='runs/MAPPO/MAPPO_env_{}_number_{}_seed_{}'.format(self.env_name, self.number, self.seed))

        self.evaluate_rewards = []  # Record the rewards during evaluation
        self.total_steps = 0
        if self.args.use_reward_norm:
            print("------use reward norm------")
            self.reward_norm = Normalization(shape=self.args.N)
        elif self.args.use_reward_scaling:
            print("------use reward scaling------")
            self.reward_scaling = RewardScaling(shape=self.args.N, gamma=self.args.gamma)
    def run(self):
        evaluate_num = -1  # Record the number of evaluations
        while self.total_steps < self.args.max_train_steps:
            if self.total_steps // self.args.evaluate_freq > evaluate_num:
                self.evaluate_policy()  # Evaluate the policy every 'evaluate_freq' steps
                evaluate_num += 1

            _, episode_steps = self.run_episode_mpe(evaluate=False)  # Run an episode
            self.total_steps += episode_steps

            if self.replay_buffer.episode_num == self.args.batch_size:
                self.agent_n.train(self.replay_buffer, self.total_steps)  # Training
                self.replay_buffer.reset_buffer()

        self.evaluate_policy()
        # self.env.close()
    def evaluate_policy(self):
        evaluate_reward = 0
        for _ in range(self.args.evaluate_times):
            episode_reward, _ = self.run_episode_mpe(evaluate=True)
            evaluate_reward += episode_reward

        evaluate_reward = evaluate_reward / self.args.evaluate_times
        self.evaluate_rewards.append(evaluate_reward)
        wandb.log({"total_step": self.total_steps, "reward": evaluate_reward})
        print("total_steps:{} \t evaluate_reward:{}".format(self.total_steps, evaluate_reward))
        self.writer.add_scalar('evaluate_step_rewards_{}'.format(self.env_name), evaluate_reward, global_step=self.total_steps)
        # Save the rewards and models
        np.save('./data_train/MAPPO_env_{}_number_{}_seed_{}.npy'.format(self.env_name, self.number, self.seed), np.array(self.evaluate_rewards))
        self.agent_n.save_model(self.env_name, self.number, self.seed, self.total_steps)
    def run_episode_mpe(self, evaluate=False):
        episode_reward = 0
        obs_n = self.env.reset()
        if self.args.use_reward_scaling:
            self.reward_scaling.reset()
        if self.args.use_rnn:  # If RNN is used, reset the rnn_hidden of the actor and critic before each episode
            self.agent_n.actor.rnn_hidden = None
            self.agent_n.critic.rnn_hidden = None
        for episode_step in range(self.args.episode_limit):
            a_n, a_logprob_n = self.agent_n.choose_action(obs_n, evaluate=evaluate)  # Get actions and the corresponding log probabilities of N agents
            s = np.array(obs_n).flatten()  # In MPE, the global state is the concatenation of all agents' local observations
            v_n = self.agent_n.get_value(s)  # Get the state values (V(s)) of N agents
            obs_next_n, r_n, done_n, _ = self.env.step(a_n)
            episode_reward += r_n[0]

            if not evaluate:
                if self.args.use_reward_norm:
                    r_n = self.reward_norm(r_n)
                elif self.args.use_reward_scaling:
                    r_n = self.reward_scaling(r_n)
                # Store the transition
                self.replay_buffer.store_transition(episode_step, obs_n, s, v_n, a_n, a_logprob_n, r_n, done_n)

            obs_n = obs_next_n
            if all(done_n):
                break

        if not evaluate:
            # The episode is over; store v_n of the last step
            s = np.array(obs_n).flatten()
            v_n = self.agent_n.get_value(s)
            self.replay_buffer.store_last_value(episode_step + 1, v_n)

        return episode_reward, episode_step + 1
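
# Command-line arguments: PPO/MAPPO hyperparameters, the "Trick N" switches, and the
# environment-specific options (satellite_num, agent_num, adj).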
if __name__ == '__main__':
    parser = argparse.ArgumentParser("Hyperparameters Setting for MAPPO in MPE environment")
    parser.add_argument("--max_train_steps", type=int, default=int(3e6), help="Maximum number of training steps")
    parser.add_argument("--episode_limit", type=int, default=100, help="Maximum number of steps per episode")
    parser.add_argument("--evaluate_freq", type=float, default=50, help="Evaluate the policy every 'evaluate_freq' steps")
    parser.add_argument("--evaluate_times", type=int, default=1, help="Number of episodes per evaluation")
    parser.add_argument("--batch_size", type=int, default=32, help="Batch size (the number of episodes)")
    parser.add_argument("--mini_batch_size", type=int, default=8, help="Minibatch size (the number of episodes)")
    parser.add_argument("--rnn_hidden_dim", type=int, default=64, help="The number of neurons in the hidden layers of the RNN")
    parser.add_argument("--mlp_hidden_dim", type=int, default=64, help="The number of neurons in the hidden layers of the MLP")
    parser.add_argument("--lr", type=float, default=(5e-4) * 0.01, help="Learning rate")
    parser.add_argument("--gamma", type=float, default=0.99, help="Discount factor")
    parser.add_argument("--lamda", type=float, default=0.95, help="GAE parameter")
    parser.add_argument("--epsilon", type=float, default=0.2, help="PPO clip parameter")
    parser.add_argument("--K_epochs", type=int, default=15, help="Number of PPO update epochs per batch")
    parser.add_argument("--use_adv_norm", type=bool, default=True, help="Trick 1: advantage normalization")
    parser.add_argument("--use_reward_norm", type=bool, default=True, help="Trick 3: reward normalization")
    parser.add_argument("--use_reward_scaling", type=bool, default=False, help="Trick 4: reward scaling. Here, we do not use it.")
    parser.add_argument("--entropy_coef", type=float, default=0.01, help="Trick 5: policy entropy")
    parser.add_argument("--use_lr_decay", type=bool, default=True, help="Trick 6: learning rate decay")
    parser.add_argument("--use_grad_clip", type=bool, default=True, help="Trick 7: gradient clip")
    parser.add_argument("--use_orthogonal_init", type=bool, default=True, help="Trick 8: orthogonal initialization")
    parser.add_argument("--set_adam_eps", type=bool, default=True, help="Trick 9: set Adam epsilon=1e-5")
    parser.add_argument("--use_relu", type=bool, default=False, help="Whether to use ReLU; if False, tanh is used")
    parser.add_argument("--use_rnn", type=bool, default=True, help="Whether to use RNN")
    parser.add_argument("--add_agent_id", type=bool, default=False, help="Whether to add agent_id. Here, we do not use it.")
    parser.add_argument("--use_value_clip", type=bool, default=False, help="Whether to use value clip.")
    # Environment-specific arguments
    parser.add_argument("--satellite_num", type=int, default=50, help="Number of satellites")
    parser.add_argument("--agent_num", type=int, default=5, help="Number of agents (data flows)")
    parser.add_argument("--adj", default=generateAdj(), help="Adjacency matrix of the satellite network")
    args = parser.parse_args()
    print(args)

    wandb.init(
        # Set the wandb project where this run will be logged
        project="my-awesome-project",
        # Track the hyperparameters actually used by this run
        config={
            "learning_rate": args.lr,
            "gamma": args.gamma,
            "batch_size": args.batch_size,
            "satellite_num": args.satellite_num,
            "agent_num": args.agent_num,
        },
    )

    runner = Runner_MAPPO_MPE(args, env_name="simple_spread", number=1, seed=0)
    runner.run()