diff --git a/evogym/envs/base.py b/evogym/envs/base.py index 41ba7333..82022ce4 100644 --- a/evogym/envs/base.py +++ b/evogym/envs/base.py @@ -392,15 +392,6 @@ def __init__( EvoGymBase.__init__(self, world=world, render_mode=render_mode, render_options=render_options) self.default_viewer.track_objects('robot') - - def step(self, action): - - action_copy = {} - - for robot_name, a in action.items(): - action_copy[robot_name] = a + 1 - - return super().step(action_copy) def pos_at_time(self, time): return super().pos_at_time(time)*self.VOXEL_SIZE diff --git a/examples/bo/run.py b/examples/bo/run.py index 7cc0707a..9cd5e6f4 100644 --- a/examples/bo/run.py +++ b/examples/bo/run.py @@ -1,9 +1,8 @@ -from distutils.command.config import config import os from re import X import shutil -import random import numpy as np +import argparse from GPyOpt.core.task.space import Design_space from GPyOpt.models import GPModel @@ -13,17 +12,9 @@ from GPyOpt.core.evaluators import ThompsonBatch from .optimizer import Objective, Optimization -import sys -curr_dir = os.path.dirname(os.path.abspath(__file__)) -root_dir = os.path.join(curr_dir, '..') -external_dir = os.path.join(root_dir, 'externals') -sys.path.insert(0, root_dir) -sys.path.insert(1, os.path.join(external_dir, 'pytorch_a2c_ppo_acktr_gail')) - +from ppo.run import run_ppo import evogym.envs from evogym import is_connected, has_actuator, get_full_connectivity -from utils.algo_utils import TerminationCondition -from ppo import run_ppo def get_robot_from_genome(genome, config): ''' @@ -36,6 +27,8 @@ def get_robot_from_genome(genome, config): def eval_genome_cost(genome, config, genome_id, generation): robot = get_robot_from_genome(genome, config) + args, env_name = config['args'], config['env_name'] + if not (is_connected(robot) and has_actuator(robot)): return 10 else: @@ -45,9 +38,7 @@ def eval_genome_cost(genome, config, genome_id, generation): save_path_controller = os.path.join(save_path_generation, 'controller') np.savez(save_path_structure, robot, connectivity) fitness = run_ppo( - structure=(robot, connectivity), - termination_condition=TerminationCondition(config['train_iters']), - saving_convention=(save_path_controller, genome_id), + args, robot, env_name, save_path_controller, f'{genome_id}', connectivity ) cost = -fitness return cost @@ -61,20 +52,23 @@ def eval_genome_constraint(genomes, config): return np.array(all_violation) def run_bo( - experiment_name, - structure_shape, - pop_size, - max_evaluations, - train_iters, - num_cores, - ): - - save_path = os.path.join(root_dir, 'saved_data', experiment_name) + args: argparse.Namespace, +): + exp_name, env_name, pop_size, structure_shape, max_evaluations, num_cores = ( + args.exp_name, + args.env_name, + args.pop_size, + args.structure_shape, + args.max_evaluations, + args.num_cores, + ) + + save_path = os.path.join('saved_data', exp_name) try: os.makedirs(save_path) except: - print(f'THIS EXPERIMENT ({experiment_name}) ALREADY EXISTS') + print(f'THIS EXPERIMENT ({exp_name}) ALREADY EXISTS') print('Override? 
(y/n): ', end='') ans = input() if ans.lower() == 'y': @@ -88,13 +82,13 @@ def run_bo( with open(save_path_metadata, 'w') as f: f.write(f'POP_SIZE: {pop_size}\n' \ f'STRUCTURE_SHAPE: {structure_shape[0]} {structure_shape[1]}\n' \ - f'MAX_EVALUATIONS: {max_evaluations}\n' \ - f'TRAIN_ITERS: {train_iters}\n') + f'MAX_EVALUATIONS: {max_evaluations}\n') config = { 'structure_shape': structure_shape, - 'train_iters': train_iters, 'save_path': save_path, + 'args': args, # args for run_ppo + 'env_name': env_name, } def constraint_func(genome): diff --git a/examples/cppn_neat/run.py b/examples/cppn_neat/run.py index 91c78f45..d0649793 100644 --- a/examples/cppn_neat/run.py +++ b/examples/cppn_neat/run.py @@ -1,9 +1,9 @@ import os import shutil -import random import numpy as np import torch import neat +import argparse import sys curr_dir = os.path.dirname(os.path.abspath(__file__)) @@ -11,16 +11,14 @@ external_dir = os.path.join(root_dir, 'externals') sys.path.insert(0, root_dir) sys.path.insert(1, os.path.join(external_dir, 'PyTorch-NEAT')) -sys.path.insert(1, os.path.join(external_dir, 'pytorch_a2c_ppo_acktr_gail')) from pytorch_neat.cppn import create_cppn from .parallel import ParallelEvaluator from .population import Population -from utils.algo_utils import TerminationCondition -from ppo import run_ppo -from evogym import is_connected, has_actuator, get_full_connectivity, hashable +from ppo.run import run_ppo import evogym.envs +from evogym import is_connected, has_actuator, get_full_connectivity, hashable def get_cppn_input(structure_shape): @@ -43,15 +41,16 @@ def get_robot_from_genome(genome, config): def eval_genome_fitness(genome, config, genome_id, generation): robot = get_robot_from_genome(genome, config) + args, env_name = config.extra_info['args'], config.extra_info['env_name'] + connectivity = get_full_connectivity(robot) save_path_generation = os.path.join(config.extra_info['save_path'], f'generation_{generation}') save_path_structure = os.path.join(save_path_generation, 'structure', f'{genome_id}') save_path_controller = os.path.join(save_path_generation, 'controller') np.savez(save_path_structure, robot, connectivity) + fitness = run_ppo( - structure=(robot, connectivity), - termination_condition=TerminationCondition(config.extra_info['train_iters']), - saving_convention=(save_path_controller, genome_id), + args, robot, env_name, save_path_controller, f'{genome_id}', connectivity ) return fitness @@ -93,20 +92,23 @@ def post_evaluate(self, config, population, species, best_genome): f.write(out) def run_cppn_neat( - experiment_name, - structure_shape, - pop_size, - max_evaluations, - train_iters, - num_cores, - ): + args: argparse.Namespace +): + exp_name, env_name, pop_size, structure_shape, max_evaluations, num_cores = ( + args.exp_name, + args.env_name, + args.pop_size, + args.structure_shape, + args.max_evaluations, + args.num_cores, + ) - save_path = os.path.join(root_dir, 'saved_data', experiment_name) + save_path = os.path.join('saved_data', exp_name) try: os.makedirs(save_path) except: - print(f'THIS EXPERIMENT ({experiment_name}) ALREADY EXISTS') + print(f'THIS EXPERIMENT ({exp_name}) ALREADY EXISTS') print('Override? 
(y/n): ', end='') ans = input() if ans.lower() == 'y': @@ -120,8 +122,7 @@ def run_cppn_neat( with open(save_path_metadata, 'w') as f: f.write(f'POP_SIZE: {pop_size}\n' \ f'STRUCTURE_SHAPE: {structure_shape[0]} {structure_shape[1]}\n' \ - f'MAX_EVALUATIONS: {max_evaluations}\n' \ - f'TRAIN_ITERS: {train_iters}\n') + f'MAX_EVALUATIONS: {max_evaluations}\n') structure_hashes = {} @@ -134,9 +135,10 @@ def run_cppn_neat( config_path, extra_info={ 'structure_shape': structure_shape, - 'train_iters': train_iters, 'save_path': save_path, 'structure_hashes': structure_hashes, + 'args': args, # args for run_ppo + 'env_name': env_name, }, custom_config=[ ('NEAT', 'pop_size', pop_size), diff --git a/examples/ga/run.py b/examples/ga/run.py index fdb98201..5d033f4b 100644 --- a/examples/ga/run.py +++ b/examples/ga/run.py @@ -3,34 +3,40 @@ import shutil import random import math +import argparse +from typing import List -import sys -curr_dir = os.path.dirname(os.path.abspath(__file__)) -root_dir = os.path.join(curr_dir, '..') -external_dir = os.path.join(root_dir, 'externals') -sys.path.insert(0, root_dir) -sys.path.insert(1, os.path.join(external_dir, 'pytorch_a2c_ppo_acktr_gail')) - -from ppo import run_ppo +from ppo.run import run_ppo +import evogym.envs from evogym import sample_robot, hashable import utils.mp_group as mp -from utils.algo_utils import get_percent_survival_evals, mutate, TerminationCondition, Structure +from utils.algo_utils import get_percent_survival_evals, mutate, Structure -def run_ga(experiment_name, structure_shape, pop_size, max_evaluations, train_iters, num_cores): +def run_ga( + args: argparse.Namespace, +): print() - - ### STARTUP: MANAGE DIRECTORIES ### - home_path = os.path.join(root_dir, "saved_data", experiment_name) + + exp_name, env_name, pop_size, structure_shape, max_evaluations, num_cores = ( + args.exp_name, + args.env_name, + args.pop_size, + args.structure_shape, + args.max_evaluations, + args.num_cores, + ) + + ### MANAGE DIRECTORIES ### + home_path = os.path.join("saved_data", exp_name) start_gen = 0 - ### DEFINE TERMINATION CONDITION ### - tc = TerminationCondition(train_iters) + ### DEFINE TERMINATION CONDITION ### is_continuing = False try: os.makedirs(home_path) except: - print(f'THIS EXPERIMENT ({experiment_name}) ALREADY EXISTS') + print(f'THIS EXPERIMENT ({exp_name}) ALREADY EXISTS') print("Override? 
(y/n/c): ", end="") ans = input() if ans.lower() == "y": @@ -46,10 +52,10 @@ def run_ga(experiment_name, structure_shape, pop_size, max_evaluations, train_it ### STORE META-DATA ## if not is_continuing: - temp_path = os.path.join(root_dir, "saved_data", experiment_name, "metadata.txt") + temp_path = os.path.join("saved_data", exp_name, "metadata.txt") try: - os.makedirs(os.path.join(root_dir, "saved_data", experiment_name)) + os.makedirs(os.path.join("saved_data", exp_name)) except: pass @@ -57,11 +63,10 @@ def run_ga(experiment_name, structure_shape, pop_size, max_evaluations, train_it f.write(f'POP_SIZE: {pop_size}\n') f.write(f'STRUCTURE_SHAPE: {structure_shape[0]} {structure_shape[1]}\n') f.write(f'MAX_EVALUATIONS: {max_evaluations}\n') - f.write(f'TRAIN_ITERS: {train_iters}\n') f.close() else: - temp_path = os.path.join(root_dir, "saved_data", experiment_name, "metadata.txt") + temp_path = os.path.join("saved_data", exp_name, "metadata.txt") f = open(temp_path, "r") count = 0 for line in f: @@ -71,18 +76,15 @@ def run_ga(experiment_name, structure_shape, pop_size, max_evaluations, train_it structure_shape = (int(line.split()[1]), int(line.split()[2])) if count == 2: max_evaluations = int(line.split()[1]) - if count == 3: - train_iters = int(line.split()[1]) - tc.change_target(train_iters) count += 1 print(f'Starting training with pop_size {pop_size}, shape ({structure_shape[0]}, {structure_shape[1]}), ' + - f'max evals: {max_evaluations}, train iters {train_iters}.') + f'max evals: {max_evaluations}.') f.close() ### GENERATE // GET INITIAL POPULATION ### - structures = [] + structures: List[Structure] = [] population_structure_hashes = {} num_evaluations = 0 generation = 0 @@ -103,7 +105,7 @@ def run_ga(experiment_name, structure_shape, pop_size, max_evaluations, train_it else: for g in range(start_gen+1): for i in range(pop_size): - save_path_structure = os.path.join(root_dir, "saved_data", experiment_name, "generation_" + str(g), "structure", str(i) + ".npz") + save_path_structure = os.path.join("saved_data", exp_name, "generation_" + str(g), "structure", str(i) + ".npz") np_data = np.load(save_path_structure) structure_data = [] for key, value in np_data.items(): @@ -125,8 +127,8 @@ def run_ga(experiment_name, structure_shape, pop_size, max_evaluations, train_it ### MAKE GENERATION DIRECTORIES ### - save_path_structure = os.path.join(root_dir, "saved_data", experiment_name, "generation_" + str(generation), "structure") - save_path_controller = os.path.join(root_dir, "saved_data", experiment_name, "generation_" + str(generation), "controller") + save_path_structure = os.path.join("saved_data", exp_name, "generation_" + str(generation), "structure") + save_path_controller = os.path.join("saved_data", exp_name, "generation_" + str(generation), "controller") try: os.makedirs(save_path_structure) @@ -150,19 +152,20 @@ def run_ga(experiment_name, structure_shape, pop_size, max_evaluations, train_it for structure in structures: if structure.is_survivor: - save_path_controller_part = os.path.join(root_dir, "saved_data", experiment_name, "generation_" + str(generation), "controller", - "robot_" + str(structure.label) + "_controller" + ".pt") - save_path_controller_part_old = os.path.join(root_dir, "saved_data", experiment_name, "generation_" + str(generation-1), "controller", - "robot_" + str(structure.prev_gen_label) + "_controller" + ".pt") + save_path_controller_part = os.path.join("saved_data", exp_name, "generation_" + str(generation), "controller", + f"{structure.label}.zip") + 
save_path_controller_part_old = os.path.join("saved_data", exp_name, "generation_" + str(generation-1), "controller", + f"{structure.prev_gen_label}.zip") print(f'Skipping training for {save_path_controller_part}.\n') try: shutil.copy(save_path_controller_part_old, save_path_controller_part) except: print(f'Error coppying controller for {save_path_controller_part}.\n') - else: - ppo_args = ((structure.body, structure.connections), tc, (save_path_controller, structure.label)) + else: + ppo_args = (args, structure.body, env_name, save_path_controller, f'{structure.label}', structure.connections) group.add_job(run_ppo, ppo_args, callback=structure.set_reward) + group.run_jobs(num_cores) @@ -177,7 +180,7 @@ def run_ga(experiment_name, structure_shape, pop_size, max_evaluations, train_it structures = sorted(structures, key=lambda structure: structure.fitness, reverse=True) #SAVE RANKING TO FILE - temp_path = os.path.join(root_dir, "saved_data", experiment_name, "generation_" + str(generation), "output.txt") + temp_path = os.path.join("saved_data", exp_name, "generation_" + str(generation), "output.txt") f = open(temp_path, "w") out = "" diff --git a/examples/ppo/__init__.py b/examples/ppo/__init__.py deleted file mode 100644 index 9108e19a..00000000 --- a/examples/ppo/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from ppo import arguments -from ppo.run import run_ppo \ No newline at end of file diff --git a/examples/ppo/args.py b/examples/ppo/args.py new file mode 100644 index 00000000..b0b2f8a6 --- /dev/null +++ b/examples/ppo/args.py @@ -0,0 +1,60 @@ +import argparse + +def add_ppo_args(parser: argparse.ArgumentParser) -> None: + """ + Add PPO arguments to the parser + """ + + ppo_parser: argparse.ArgumentParser = parser.add_argument_group('ppo arguments') + + ppo_parser.add_argument( + '--verbose-ppo', default=1, type=int, help='Verbosity level for PPO: 0 for no output, 1 for info messages (such as device or wrappers used), 2 for debug messages (default: 1)' + ) + ppo_parser.add_argument( + '--learning-rate', default=2.5e-4, type=float, help='Learning rate for PPO (default: 2.5e-4)' + ) + ppo_parser.add_argument( + '--n-steps', default=128, type=int, help='The number of steps to run for each environment per update for PPO (i.e. 
rollout buffer size is n_steps * n_envs where n_envs is number of environment copies running in parallel) (default: 128)' + ) + ppo_parser.add_argument( + '--batch-size', default=4, type=int, help='Mini-batch size for PPO (default: 4)' + ) + ppo_parser.add_argument( + '--n-epochs', default=4, type=int, help='Number of epochs when optimizing the surrogate objective for PPO (default: 4)' + ) + ppo_parser.add_argument( + '--gamma', default=0.99, type=float, help='Discount factor for PPO (default: 0.99)' + ) + ppo_parser.add_argument( + '--gae-lambda', default=0.95, type=float, help='Lambda parameter for Generalized Advantage Estimation for PPO (default: 0.95)' + ) + ppo_parser.add_argument( + '--vf-coef', default=0.5, type=float, help='Value function coefficient for PPO loss calculation (default: 0.5)' + ) + ppo_parser.add_argument( + '--max-grad-norm', default=0.5, type=float, help='The maximum value of the gradient clipping for PPO (default: 0.5)' + ) + ppo_parser.add_argument( + '--ent-coef', default=0.01, type=float, help='Entropy coefficient for PPO loss calculation (default: 0.01)' + ) + ppo_parser.add_argument( + '--clip-range', default=0.1, type=float, help='Clipping parameter for PPO (default: 0.1)' + ) + ppo_parser.add_argument( + '--total-timesteps', default=1e6, type=int, help='Total number of timesteps for PPO (default: 1e6)' + ) + ppo_parser.add_argument( + '--log-interval', default=50, type=int, help='Episodes before logging PPO (default: 50)' + ) + ppo_parser.add_argument( + '--n-envs', default=1, type=int, help='Number of parallel environments for PPO (default: 1)' + ) + ppo_parser.add_argument( + '--n-eval-envs', default=1, type=int, help='Number of parallel environments for PPO evaluation (default: 1)' + ) + ppo_parser.add_argument( + '--n-evals', default=1, type=int, help='Number of times to run the environment during each eval (default: 1)' + ) + ppo_parser.add_argument( + '--eval-interval', default=1e5, type=int, help='Number of steps before evaluating PPO model (default: 1e5)' + ) \ No newline at end of file diff --git a/examples/ppo/arguments.py b/examples/ppo/arguments.py deleted file mode 100644 index d99c6e6a..00000000 --- a/examples/ppo/arguments.py +++ /dev/null @@ -1,167 +0,0 @@ -import argparse -import torch - -# Derived from -# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail - -def get_args(): - parser = argparse.ArgumentParser(description='RL') - parser.add_argument( - '--algo', default='ppo', help='algorithm to use: a2c | ppo | acktr') - parser.add_argument( - '--gail', - action='store_true', - default=False, - help='do imitation learning with gail') - parser.add_argument( - '--gail-experts-dir', - default='./gail_experts', - help='directory that contains expert demonstrations for gail') - parser.add_argument( - '--gail-batch-size', - type=int, - default=128, - help='gail batch size (default: 128)') - parser.add_argument( - '--gail-epoch', type=int, default=5, help='gail epochs (default: 5)') - parser.add_argument( - '--lr', type=float, default=2.5e-4, help='learning rate (default: 2.5e-4)') - parser.add_argument( - '--eps', - type=float, - default=1e-5, - help='RMSprop optimizer epsilon (default: 1e-5)') - parser.add_argument( - '--alpha', - type=float, - default=0.99, - help='RMSprop optimizer apha (default: 0.99)') - parser.add_argument( - '--gamma', - type=float, - default=0.99, - help='discount factor for rewards (default: 0.99)') - parser.add_argument( - '--use-gae', - action='store_true', - default=True, - help='use generalized advantage 
estimation') - parser.add_argument( - '--gae-lambda', - type=float, - default=0.95, - help='gae lambda parameter (default: 0.95)') - parser.add_argument( - '--entropy-coef', - type=float, - default=0.01, - help='entropy term coefficient (default: 0.01)') - parser.add_argument( - '--value-loss-coef', - type=float, - default=0.5, - help='value loss coefficient (default: 0.5)') - parser.add_argument( - '--max-grad-norm', - type=float, - default=0.5, - help='max norm of gradients (default: 0.5)') - parser.add_argument( - '--seed', type=int, default=1, help='random seed (default: 1)') - parser.add_argument( - '--cuda-deterministic', - action='store_true', - default=False, - help="sets flags for determinism when using CUDA (potentially slow!)") - parser.add_argument( - '--num-processes', - type=int, - default=1, - help='how many training CPU processes to use (default: 1)') - parser.add_argument( - '--num-steps', - type=int, - default=128, - help='number of forward steps in A2C / num steps to use in PPO (default: 128)') - parser.add_argument( - '--ppo-epoch', - type=int, - default=4, - help='number of ppo epochs (default: 4)') - parser.add_argument( - '--num-mini-batch', - type=int, - default=4, - help='number of batches for ppo (default: 4)') - parser.add_argument( - '--clip-param', - type=float, - default=0.1, - help='ppo clip parameter (default: 0.1)') - parser.add_argument( - '--log-interval', - type=int, - default=10, - help='log interval, one log per n updates (default: 10)') - parser.add_argument( - '--save-interval', - type=int, - default=100, - help='save interval, one save per n updates (default: 100)') - parser.add_argument( - '--num-evals', - type=int, - default=1, - help='number of times to evaluate each controller (for evaluation purposes not training). 
(default: 1) as most Evolution Gym environments are deterministic.') - parser.add_argument( - '--eval-interval', - type=int, - default=None, - help='eval interval, one eval per n updates (default: None)') - parser.add_argument( - '--num-env-steps', - type=int, - default=10e6, - help='number of environment steps to train (default: 10e6)') - parser.add_argument( - '--env-name', - default='roboticgamedesign-v0', - help='environment to train on (default: roboticgamedesign-v0)') - parser.add_argument( - '--log-dir', - default='/tmp/gym/', - help='directory to save agent logs (default: /tmp/gym)') - parser.add_argument( - '--save-dir', - default='./trained_models/', - help='directory to save agent logs (default: ./trained_models/)') - parser.add_argument( - '--no-cuda', - action='store_true', - default=False, - help='disables CUDA training') - parser.add_argument( - '--use-proper-time-limits', - action='store_true', - default=False, - help='compute returns taking into account time limits') - parser.add_argument( - '--recurrent-policy', - action='store_true', - default=False, - help='use a recurrent policy') - parser.add_argument( - '--use-linear-lr-decay', - action='store_true', - default=True, - help='use a linear schedule on the learning rate') - args = parser.parse_args() - - args.cuda = not args.no_cuda and torch.cuda.is_available() - - assert args.algo in ['a2c', 'ppo', 'acktr'] - if args.recurrent_policy: - assert args.algo in ['a2c', 'ppo'], \ - 'Recurrent policy is not implemented for ACKTR' - - return args diff --git a/examples/ppo/callback.py b/examples/ppo/callback.py new file mode 100644 index 00000000..a6e6b72b --- /dev/null +++ b/examples/ppo/callback.py @@ -0,0 +1,116 @@ +import os +from typing import List, Optional +import numpy as np +from ppo.eval import eval_policy +from stable_baselines3.common.callbacks import BaseCallback + +class EvalCallback(BaseCallback): + """ + A custom callback that derives from ``BaseCallback``. 
+ + :param verbose: Verbosity level: 0 for no output, 1 for info messages, 2 for debug messages + """ + def __init__( + self, + body: np.ndarray, + env_name: str, + eval_every: int, + n_evals: int, + n_envs: int, + model_save_dir: str, + model_save_name: str, + connections: Optional[np.ndarray] = None, + verbose: int = 0 + ): + super().__init__(verbose) + # Those variables will be accessible in the callback + # (they are defined in the base class) + # The RL model + # self.model = None # type: BaseAlgorithm + # An alias for self.model.get_env(), the environment used for training + # self.training_env # type: VecEnv + # Number of time the callback was called + # self.n_calls = 0 # type: int + # num_timesteps = n_envs * n times env.step() was called + # self.num_timesteps = 0 # type: int + # local and global variables + # self.locals = {} # type: Dict[str, Any] + # self.globals = {} # type: Dict[str, Any] + # The logger object, used to report things in the terminal + # self.logger # type: stable_baselines3.common.logger.Logger + # Sometimes, for event callback, it is useful + # to have access to the parent object + # self.parent = None # type: Optional[BaseCallback] + + self.body = body + self.connections = connections + self.env_name = env_name + self.eval_every = eval_every + self.n_evals = n_evals + self.n_envs = n_envs + self.model_save_dir = model_save_dir + self.model_save_name = model_save_name + + if not os.path.exists(model_save_dir): + os.makedirs(model_save_dir) + + self.best_reward = -float('inf') + + def _on_training_start(self) -> None: + """ + This method is called before the first rollout starts. + """ + pass + + def _on_rollout_start(self) -> None: + """ + A rollout is the collection of environment interaction + using the current policy. + This event is triggered before collecting new samples. + """ + pass + + def _on_step(self) -> bool: + """ + This method will be called by the model after each call to `env.step()`. + + For child callback (of an `EventCallback`), this will be called + when the event is triggered. + + :return: If the callback returns False, training is aborted early. + """ + + if self.num_timesteps % self.eval_every == 0: + self._validate_and_save() + return True + + def _on_rollout_end(self) -> None: + """ + This event is triggered before updating the policy. + """ + pass + + def _on_training_end(self) -> None: + """ + This event is triggered before exiting the `learn()` method. 
+ """ + self._validate_and_save() + + def _validate_and_save(self) -> None: + rewards = eval_policy( + model=self.model, + body=self.body, + connections=self.connections, + env_name=self.env_name, + n_evals=self.n_evals, + n_envs=self.n_envs, + ) + out = f"[{self.model_save_name}] Mean: {np.mean(rewards):.3}, Std: {np.std(rewards):.3}, Min: {np.min(rewards):.3}, Max: {np.max(rewards):.3}" + mean_reward = np.mean(rewards).item() + if mean_reward > self.best_reward: + out += f" NEW BEST ({mean_reward:.3} > {self.best_reward:.3})" + self.best_reward = mean_reward + self.model.save(os.path.join(self.model_save_dir, self.model_save_name)) + if self.verbose > 0: + print(out) + \ No newline at end of file diff --git a/examples/ppo/envs.py b/examples/ppo/envs.py deleted file mode 100644 index ab23424d..00000000 --- a/examples/ppo/envs.py +++ /dev/null @@ -1,258 +0,0 @@ -import os -import gym -import numpy as np -import torch -from gym.spaces.box import Box - -from stable_baselines3.common.monitor import Monitor -from stable_baselines3.common.atari_wrappers import ( - NoopResetEnv, MaxAndSkipEnv, EpisodicLifeEnv, FireResetEnv, WarpFrame, ClipRewardEnv) -from stable_baselines3.common.vec_env import VecEnvWrapper, DummyVecEnv, SubprocVecEnv -from stable_baselines3.common.vec_env.vec_normalize import \ - VecNormalize as VecNormalize_ - -try: - import dm_control2gym -except ImportError: - pass - -try: - import roboschool -except ImportError: - pass - -try: - import pybullet_envs -except ImportError: - pass - -# Derived from -# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail - -def make_env(env_id, robot_structure, seed, rank, log_dir, allow_early_resets): - def _thunk(): - if env_id.startswith("dm"): - _, domain, task = env_id.split('.') - env = dm_control2gym.make(domain_name=domain, task_name=task) - else: - env = gym.make(env_id, body = robot_structure[0], connections = robot_structure[1]) - - is_atari = hasattr(gym.envs, 'atari') and isinstance( - env.unwrapped, gym.envs.atari.atari_env.AtariEnv) - if is_atari: - env = NoopResetEnv(env, noop_max=30) - env = MaxAndSkipEnv(env, skip=4) - - env.seed(seed + rank) - - if str(env.__class__.__name__).find('TimeLimit') >= 0: - env = TimeLimitMask(env) - - if log_dir is not None: - env = Monitor( - env, - os.path.join(log_dir, str(rank)), - allow_early_resets=allow_early_resets) - - if is_atari: - if len(env.observation_space.shape) == 3: - env = EpisodicLifeEnv(env) - if "FIRE" in env.unwrapped.get_action_meanings(): - env = FireResetEnv(env) - env = WarpFrame(env, width=84, height=84) - env = ClipRewardEnv(env) - elif len(env.observation_space.shape) == 3: - #ASK ABOUT THIS - pass - # raise NotImplementedError( - # "CNN models work only for atari,\n" - # "please use a custom wrapper for a custom pixel input env.\n" - # "See wrap_deepmind for an example.") - - # If the input has shape (W,H,3), wrap for PyTorch convolutions - obs_shape = env.observation_space.shape - if len(obs_shape) == 3 and obs_shape[2] in [1, 3]: - env = TransposeImage(env, op=[2, 0, 1]) - - return env - - return _thunk - - -def make_vec_envs(env_name, - robot_structure, - seed, - num_processes, - gamma, - log_dir, - device, - allow_early_resets, - num_frame_stack=None): - envs = [ - make_env(env_name, robot_structure, seed, i, log_dir, allow_early_resets) - for i in range(num_processes) - ] - - if len(envs) > 1: - envs = SubprocVecEnv(envs) - else: - envs = DummyVecEnv(envs) - - if len(envs.observation_space.shape) == 1: - if gamma is None: - envs = VecNormalize(envs, 
norm_reward=False) - else: - envs = VecNormalize(envs, gamma=gamma) - - envs = VecPyTorch(envs, device) - - if num_frame_stack is not None: - envs = VecPyTorchFrameStack(envs, num_frame_stack, device) - elif len(envs.observation_space.shape) == 3: - envs = VecPyTorchFrameStack(envs, 4, device) - - return envs - - -# Checks whether done was caused my timit limits or not -class TimeLimitMask(gym.Wrapper): - def step(self, action): - obs, rew, done, info = self.env.step(action) - if done and self.env._max_episode_steps == self.env._elapsed_steps: - info['bad_transition'] = True - - return obs, rew, done, info - - def reset(self, **kwargs): - return self.env.reset(**kwargs) - - -# Can be used to test recurrent policies for Reacher-v2 -class MaskGoal(gym.ObservationWrapper): - def observation(self, observation): - if self.env._elapsed_steps > 0: - observation[-2:] = 0 - return observation - - -class TransposeObs(gym.ObservationWrapper): - def __init__(self, env=None): - """ - Transpose observation space (base class) - """ - super(TransposeObs, self).__init__(env) - - -class TransposeImage(TransposeObs): - def __init__(self, env=None, op=[2, 0, 1]): - """ - Transpose observation space for images - """ - super(TransposeImage, self).__init__(env) - assert len(op) == 3, "Error: Operation, " + str(op) + ", must be dim3" - self.op = op - obs_shape = self.observation_space.shape - self.observation_space = Box( - self.observation_space.low[0, 0, 0], - self.observation_space.high[0, 0, 0], [ - obs_shape[self.op[0]], obs_shape[self.op[1]], - obs_shape[self.op[2]] - ], - dtype=self.observation_space.dtype) - - def observation(self, ob): - return ob.transpose(self.op[0], self.op[1], self.op[2]) - - -class VecPyTorch(VecEnvWrapper): - def __init__(self, venv, device): - """Return only every `skip`-th frame""" - super(VecPyTorch, self).__init__(venv) - self.device = device - # TODO: Fix data types - - def reset(self): - obs = self.venv.reset() - obs = torch.from_numpy(obs).float().to(self.device) - return obs - - def step_async(self, actions): - if isinstance(actions, torch.LongTensor): - # Squeeze the dimension for discrete actions - actions = actions.squeeze(1) - actions = actions.cpu().numpy() - self.venv.step_async(actions) - - def step_wait(self): - obs, reward, done, info = self.venv.step_wait() - obs = torch.from_numpy(obs).float().to(self.device) - reward = torch.from_numpy(reward).unsqueeze(dim=1).float() - return obs, reward, done, info - - -class VecNormalize(VecNormalize_): - def __init__(self, *args, **kwargs): - super(VecNormalize, self).__init__(*args, **kwargs) - self.training = True - - def _obfilt(self, obs, update=True): - if self.obs_rms: - if self.training and update: - self.obs_rms.update(obs) - obs = np.clip((obs - self.obs_rms.mean) / - np.sqrt(self.obs_rms.var + self.epsilon), - -self.clipob, self.clipob) - return obs - else: - return obs - - def train(self): - self.training = True - - def eval(self): - self.training = False - - -# Derived from -# https://github.com/openai/baselines/blob/master/baselines/common/vec_env/vec_frame_stack.py -class VecPyTorchFrameStack(VecEnvWrapper): - def __init__(self, venv, nstack, device=None): - self.venv = venv - self.nstack = nstack - - wos = venv.observation_space # wrapped ob space - self.shape_dim0 = wos.shape[0] - - low = np.repeat(wos.low, self.nstack, axis=0) - high = np.repeat(wos.high, self.nstack, axis=0) - - if device is None: - device = torch.device('cpu') - self.stacked_obs = torch.zeros((venv.num_envs, ) + - low.shape).to(device) - - 
observation_space = gym.spaces.Box( - low=low, high=high, dtype=venv.observation_space.dtype) - VecEnvWrapper.__init__(self, venv, observation_space=observation_space) - - def step_wait(self): - obs, rews, news, infos = self.venv.step_wait() - self.stacked_obs[:, :-self.shape_dim0] = \ - self.stacked_obs[:, self.shape_dim0:].clone() - for (i, new) in enumerate(news): - if new: - self.stacked_obs[i] = 0 - self.stacked_obs[:, -self.shape_dim0:] = obs - return self.stacked_obs, rews, news, infos - - def reset(self): - obs = self.venv.reset() - if torch.backends.cudnn.deterministic: - self.stacked_obs = torch.zeros(self.stacked_obs.shape) - else: - self.stacked_obs.zero_() - self.stacked_obs[:, -self.shape_dim0:] = obs - return self.stacked_obs - - def close(self): - self.venv.close() diff --git a/examples/ppo/eval.py b/examples/ppo/eval.py new file mode 100644 index 00000000..3d1f6e2b --- /dev/null +++ b/examples/ppo/eval.py @@ -0,0 +1,66 @@ +from typing import List, Optional +import numpy as np +from stable_baselines3 import PPO +from stable_baselines3.common.env_util import make_vec_env + +def eval_policy( + model: PPO, + body: np.ndarray, + env_name: str, + n_evals: int = 1, + n_envs: int = 1, + connections: Optional[np.ndarray] = None, + render_mode: Optional[str] = None, + deterministic_policy: bool = False, + seed: int = 42, + verbose: bool = False, +) -> List[float]: + """ + Evaluate the controller for the robot in the environment. + Returns the result of `n_evals` evaluations. + """ + + def run_evals(n: int) -> List[float]: + """ + Run `n` evaluations in parallel. + """ + + # Parallel environments + vec_env = make_vec_env(env_name, n_envs=n, seed=seed, env_kwargs={ + 'body': body, + 'connections': connections, + "render_mode": render_mode, + }) + + # Evaluate + rewards = [] + obs = vec_env.reset() + cum_done = np.array([False]*n) + while not np.all(cum_done): + action, _states = model.predict(obs, deterministic=deterministic_policy) + obs, reward, done, info = vec_env.step(action) + + # Track when environments terminate + if verbose: + for i, (d, cd) in enumerate(zip(done, cum_done)): + if d and not cd: + print(f"Environment {i} terminated after {len(rewards)} steps") + + # Keep track of done environments + cum_done = np.logical_or(cum_done, done) + + # Update rewards -- done environments will not be updated + reward[cum_done] = 0 + rewards.append(reward) + vec_env.close() + + # Sum rewards over time + rewards = np.asarray(rewards) + return np.sum(rewards, axis=0) + + # Run evaluations n_envs at a time + rewards = [] + for i in range(np.ceil(n_evals/n_envs).astype(int)): + rewards.extend(run_evals(min(n_envs, n_evals - i*n_envs))) + + return rewards diff --git a/examples/ppo/evaluate.py b/examples/ppo/evaluate.py deleted file mode 100644 index 7a138da6..00000000 --- a/examples/ppo/evaluate.py +++ /dev/null @@ -1,59 +0,0 @@ -import numpy as np -import torch -from ppo import utils -from ppo.envs import make_vec_envs - -# Derived from -# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail - -def evaluate( - num_evals, - actor_critic, - obs_rms, - env_name, - robot_structure, - seed, - num_processes, - eval_log_dir, - device): - - num_processes = min(num_processes, num_evals) - - eval_envs = make_vec_envs(env_name, robot_structure, seed + num_processes, num_processes, - None, eval_log_dir, device, True) - - vec_norm = utils.get_vec_normalize(eval_envs) - if vec_norm is not None: - vec_norm.eval() - vec_norm.obs_rms = obs_rms - - eval_episode_rewards = [] - - obs = eval_envs.reset() - 
eval_recurrent_hidden_states = torch.zeros( - num_processes, actor_critic.recurrent_hidden_state_size, device=device) - eval_masks = torch.zeros(num_processes, 1, device=device) - - while len(eval_episode_rewards) < num_evals: - with torch.no_grad(): - _, action, _, eval_recurrent_hidden_states = actor_critic.act( - obs, - eval_recurrent_hidden_states, - eval_masks, - deterministic=True) - - # Obser reward and next obs - obs, _, done, infos = eval_envs.step(action) - - eval_masks = torch.tensor( - [[0.0] if done_ else [1.0] for done_ in done], - dtype=torch.float32, - device=device) - - for info in infos: - if 'episode' in info.keys(): - eval_episode_rewards.append(info['episode']['r']) - - eval_envs.close() - - return np.mean(eval_episode_rewards) diff --git a/examples/ppo/group_ppo.py b/examples/ppo/group_ppo.py index 4f815172..899c13bc 100644 --- a/examples/ppo/group_ppo.py +++ b/examples/ppo/group_ppo.py @@ -1,33 +1,24 @@ +import os import numpy as np import json import shutil -import random +import argparse -import sys -import os -curr_dir = os.path.dirname(os.path.abspath(__file__)) -root_dir = os.path.join(curr_dir, '..') -external_dir = os.path.join(root_dir, 'externals') -sys.path.insert(0, root_dir) -sys.path.insert(1, os.path.join(external_dir, 'PyTorch-NEAT')) -sys.path.insert(1, os.path.join(external_dir, 'pytorch_a2c_ppo_acktr_gail')) - -from ppo import run_ppo +from ppo.run import run_ppo +from ppo.args import add_ppo_args import utils.mp_group as mp -from utils.algo_utils import * - +import evogym.envs from evogym import WorldObject class SimJob(): - def __init__(self, name, robots, envs, train_iters): + def __init__(self, name, robots, envs): self.name = name self.robots = robots self.envs = envs - self.train_iters = train_iters def get_data(self,): - return {'robots': self.robots, 'envs': self.envs, 'train_iters': self.train_iters} + return {'robots': self.robots, 'envs': self.envs} class RunData(): @@ -41,14 +32,13 @@ def set_reward(self, reward): self.reward = reward def read_robot_from_file(file_name): - global root_dir possible_paths = [ os.path.join(file_name), os.path.join(f'{file_name}.npz'), os.path.join(f'{file_name}.json'), - os.path.join(root_dir, 'world_data', file_name), - os.path.join(root_dir, 'world_data', f'{file_name}.npz'), - os.path.join(root_dir, 'world_data', f'{file_name}.json'), + os.path.join('world_data', file_name), + os.path.join('world_data', f'{file_name}.npz'), + os.path.join('world_data', f'{file_name}.json'), ] best_path = None @@ -78,14 +68,18 @@ def clean_name(name): return name def run_group_ppo(experiment_name, sim_jobs): + ### ARGS ### + parser = argparse.ArgumentParser(description='Arguments for group PPO script') + add_ppo_args(parser) + args = parser.parse_args() - ### STARTUP: MANAGE DIRECTORIES ### - exp_path = os.path.join(root_dir, "saved_data", experiment_name) + ### MANAGE DIRECTORIES ### + exp_path = os.path.join("saved_data", experiment_name) try: os.makedirs(exp_path) except: print(f'THIS EXPERIMENT ({experiment_name}) ALREADY EXISTS') - print("Override? (y/n): ", end="") + print("Delete and override? 
(y/n): ", end="") ans = input() if ans.lower() == "y": shutil.rmtree(exp_path) @@ -114,8 +108,6 @@ def run_group_ppo(experiment_name, sim_jobs): except: pass - tc = TerminationCondition(job.train_iters) - count = 0 for env_name in job.envs: out[job.name][env_name] = {} @@ -127,9 +119,10 @@ def run_group_ppo(experiment_name, sim_jobs): temp_path = os.path.join(save_path_structure, f'{clean_name(robot_name)}_{env_name}.npz') np.savez(temp_path, structure[0], structure[1]) - - ppo_args = ((structure[0], structure[1]), tc, (save_path_controller, f'{clean_name(robot_name)}_{env_name}'), env_name, True) + + ppo_args = (args, structure[0], env_name, save_path_controller, f'{clean_name(robot_name)}_{env_name}', structure[1]) group.add_job(run_ppo, ppo_args, callback=run_data[-1].set_reward) + group.run_jobs(2) ### SAVE RANKING TO FILE ## diff --git a/examples/ppo/run.py b/examples/ppo/run.py index 5c017744..f054f360 100644 --- a/examples/ppo/run.py +++ b/examples/ppo/run.py @@ -1,242 +1,64 @@ -import os, sys -sys.path.insert(1, os.path.join(sys.path[0], 'externals', 'pytorch_a2c_ppo_acktr_gail')) - +import argparse +from typing import Optional import numpy as np -import time -from collections import deque -import torch - -from ppo import utils -from ppo.arguments import get_args -from ppo.evaluate import evaluate -from ppo.envs import make_vec_envs - -from a2c_ppo_acktr import algo -from a2c_ppo_acktr.algo import gail -from a2c_ppo_acktr.model import Policy -from a2c_ppo_acktr.storage import RolloutStorage +from stable_baselines3 import PPO +from stable_baselines3.common.env_util import make_vec_env -import evogym.envs - -# Derived from -# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail +from ppo.eval import eval_policy +from ppo.callback import EvalCallback def run_ppo( - structure, - termination_condition, - saving_convention, - override_env_name = None, - verbose = True): - - assert (structure == None) == (termination_condition == None) and (structure == None) == (saving_convention == None) - - print(f'Starting training on \n{structure}\nat {saving_convention}...\n') - - args = get_args() - if override_env_name: - args.env_name = override_env_name - - torch.manual_seed(args.seed) - torch.cuda.manual_seed_all(args.seed) - - if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: - torch.backends.cudnn.benchmark = False - torch.backends.cudnn.deterministic = True - - log_dir = args.log_dir - if saving_convention != None: - log_dir = os.path.join(saving_convention[0], log_dir, "robot_" + str(saving_convention[1])) - eval_log_dir = log_dir + "_eval" - utils.cleanup_log_dir(log_dir) - utils.cleanup_log_dir(eval_log_dir) - - torch.set_num_threads(1) - device = torch.device("cuda:0" if args.cuda else "cpu") - - envs = make_vec_envs(args.env_name, structure, args.seed, args.num_processes, - args.gamma, args.log_dir, device, False) - - actor_critic = Policy( - envs.observation_space.shape, - envs.action_space, - base_kwargs={'recurrent': args.recurrent_policy}) - actor_critic.to(device) - - if args.algo == 'a2c': - print('Warning: this code has only been tested with ppo.') - agent = algo.A2C_ACKTR( - actor_critic, - args.value_loss_coef, - args.entropy_coef, - lr=args.lr, - eps=args.eps, - alpha=args.alpha, - max_grad_norm=args.max_grad_norm) - elif args.algo == 'ppo': - agent = algo.PPO( - actor_critic, - args.clip_param, - args.ppo_epoch, - args.num_mini_batch, - args.value_loss_coef, - args.entropy_coef, - lr=args.lr, - eps=args.eps, - max_grad_norm=args.max_grad_norm) - elif 
args.algo == 'acktr': - print('Warning: this code has only been tested with ppo.') - agent = algo.A2C_ACKTR( - actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) - - if args.gail: - assert len(envs.observation_space.shape) == 1 - discr = gail.Discriminator( - envs.observation_space.shape[0] + envs.action_space.shape[0], 100, - device) - file_name = os.path.join( - args.gail_experts_dir, "trajs_{}.pt".format( - args.env_name.split('-')[0].lower())) - - expert_dataset = gail.ExpertDataset( - file_name, num_trajectories=4, subsample_frequency=20) - drop_last = len(expert_dataset) > args.gail_batch_size - gail_train_loader = torch.utils.data.DataLoader( - dataset=expert_dataset, - batch_size=args.gail_batch_size, - shuffle=True, - drop_last=drop_last) - - rollouts = RolloutStorage(args.num_steps, args.num_processes, - envs.observation_space.shape, envs.action_space, - actor_critic.recurrent_hidden_state_size) - - obs = envs.reset() - rollouts.obs[0].copy_(obs) - rollouts.to(device) - - episode_rewards = deque(maxlen=10) - - start = time.time() - num_updates = int( - args.num_env_steps) // args.num_steps // args.num_processes - - rewards_tracker = [] - avg_rewards_tracker = [] - sliding_window_size = 10 - max_determ_avg_reward = float('-inf') - - for j in range(num_updates): - - if args.use_linear_lr_decay: - # decrease learning rate linearly - utils.update_linear_schedule( - agent.optimizer, j, num_updates, - agent.optimizer.lr if args.algo == "acktr" else args.lr) - - for step in range(args.num_steps): - # Sample actions - with torch.no_grad(): - value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( - rollouts.obs[step], rollouts.recurrent_hidden_states[step], - rollouts.masks[step]) - - # Obser reward and next obs - obs, reward, done, infos = envs.step(action) - - # track rewards - for info in infos: - if 'episode' in info.keys(): - episode_rewards.append(info['episode']['r']) - rewards_tracker.append(info['episode']['r']) - if len(rewards_tracker) < 10: - avg_rewards_tracker.append(np.average(np.array(rewards_tracker))) - else: - avg_rewards_tracker.append(np.average(np.array(rewards_tracker[-10:]))) - - # If done then clean the history of observations. 
- masks = torch.FloatTensor( - [[0.0] if done_ else [1.0] for done_ in done]) - bad_masks = torch.FloatTensor( - [[0.0] if 'bad_transition' in info.keys() else [1.0] - for info in infos]) - rollouts.insert(obs, recurrent_hidden_states, action, - action_log_prob, value, reward, masks, bad_masks) - - with torch.no_grad(): - next_value = actor_critic.get_value( - rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], - rollouts.masks[-1]).detach() - - if args.gail: - if j >= 10: - envs.venv.eval() - - gail_epoch = args.gail_epoch - if j < 10: - gail_epoch = 100 # Warm up - for _ in range(gail_epoch): - discr.update(gail_train_loader, rollouts, - utils.get_vec_normalize(envs)._obfilt) - - for step in range(args.num_steps): - rollouts.rewards[step] = discr.predict_reward( - rollouts.obs[step], rollouts.actions[step], args.gamma, - rollouts.masks[step]) - - rollouts.compute_returns(next_value, args.use_gae, args.gamma, - args.gae_lambda, args.use_proper_time_limits) - - value_loss, action_loss, dist_entropy = agent.update(rollouts) - - rollouts.after_update() - - # print status - if j % args.log_interval == 0 and len(episode_rewards) > 1 and verbose: - total_num_steps = (j + 1) * args.num_processes * args.num_steps - end = time.time() - print( - "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" - .format(j, total_num_steps, - int(total_num_steps / (end - start)), - len(episode_rewards), np.mean(episode_rewards), - np.median(episode_rewards), np.min(episode_rewards), - np.max(episode_rewards), dist_entropy, value_loss, - action_loss)) - - # evaluate the controller and save it if it does the best so far - if (args.eval_interval is not None and len(episode_rewards) > 1 - and j % args.eval_interval == 0): - - obs_rms = utils.get_vec_normalize(envs).obs_rms - determ_avg_reward = evaluate(args.num_evals, actor_critic, obs_rms, args.env_name, structure, args.seed, - args.num_processes, eval_log_dir, device) - - if verbose: - if saving_convention != None: - print(f'Evaluated {saving_convention[1]} using {args.num_evals} episodes. Mean reward: {np.mean(determ_avg_reward)}\n') - else: - print(f'Evaluated using {args.num_evals} episodes. 
Mean reward: {np.mean(determ_avg_reward)}\n') - - if determ_avg_reward > max_determ_avg_reward: - max_determ_avg_reward = determ_avg_reward - - temp_path = os.path.join(args.save_dir, args.algo, args.env_name + ".pt") - if saving_convention != None: - temp_path = os.path.join(saving_convention[0], "robot_" + str(saving_convention[1]) + "_controller" + ".pt") - - if verbose: - print(f'Saving {temp_path} with avg reward {max_determ_avg_reward}\n') - torch.save([ - actor_critic, - getattr(utils.get_vec_normalize(envs), 'obs_rms', None) - ], temp_path) - - # return upon reaching the termination condition - if not termination_condition == None: - if termination_condition(j): - if verbose: - print(f'{saving_convention} has met termination condition ({j})...terminating...\n') - return max_determ_avg_reward - -#python ppo_main_test.py --env-name "roboticgamedesign-v0" --algo ppo --use-gae --lr 2.5e-4 --clip-param 0.1 --value-loss-coef 0.5 --num-processes 1 --num-steps 128 --num-mini-batch 4 --log-interval 1 --use-linear-lr-decay --entropy-coef 0.01 -#python ppo.py --env-name "roboticgamedesign-v0" --algo ppo --use-gae --lr 2.5e-4 --clip-param 0.1 --value-loss-coef 0.5 --num-processes 8 --num-steps 128 --num-mini-batch 4 --log-interval 1 --use-linear-lr-decay --entropy-coef 0.01 --log-dir "logs/" + args: argparse.Namespace, + body: np.ndarray, + env_name: str, + model_save_dir: str, + model_save_name: str, + connections: Optional[np.ndarray] = None, + seed: int = 42, +) -> float: + """ + Run ppo and return the best reward achieved during evaluation. + """ + + # Parallel environments + vec_env = make_vec_env(env_name, n_envs=1, seed=seed, env_kwargs={ + 'body': body, + 'connections': connections, + }) + + # Eval Callback + callback = EvalCallback( + body=body, + connections=connections, + env_name=env_name, + eval_every=args.eval_interval, + n_evals=args.n_evals, + n_envs=args.n_eval_envs, + model_save_dir=model_save_dir, + model_save_name=model_save_name, + verbose=args.verbose_ppo, + ) + + # Train + model = PPO( + "MlpPolicy", + vec_env, + verbose=args.verbose_ppo, + learning_rate=args.learning_rate, + n_steps=args.n_steps, + batch_size=args.batch_size, + n_epochs=args.n_epochs, + gamma=args.gamma, + gae_lambda=args.gae_lambda, + vf_coef=args.vf_coef, + max_grad_norm=args.max_grad_norm, + ent_coef=args.ent_coef, + clip_range=args.clip_range + ) + model.learn( + total_timesteps=args.total_timesteps, + callback=callback, + log_interval=args.log_interval + ) + + return callback.best_reward \ No newline at end of file diff --git a/examples/ppo/utils.py b/examples/ppo/utils.py deleted file mode 100644 index a023c7bd..00000000 --- a/examples/ppo/utils.py +++ /dev/null @@ -1,69 +0,0 @@ -import glob -import os - -import torch -import torch.nn as nn - -from ppo.envs import VecNormalize - -# Derived from -# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail - -# Get a render function -def get_render_func(venv): - if hasattr(venv, 'envs'): - return venv.envs[0].render - elif hasattr(venv, 'venv'): - return get_render_func(venv.venv) - elif hasattr(venv, 'env'): - return get_render_func(venv.env) - - return None - - -def get_vec_normalize(venv): - if isinstance(venv, VecNormalize): - return venv - elif hasattr(venv, 'venv'): - return get_vec_normalize(venv.venv) - - return None - - -# Necessary for my KFAC implementation. 
-class AddBias(nn.Module): - def __init__(self, bias): - super(AddBias, self).__init__() - self._bias = nn.Parameter(bias.unsqueeze(1)) - - def forward(self, x): - if x.dim() == 2: - bias = self._bias.t().view(1, -1) - else: - bias = self._bias.t().view(1, -1, 1, 1) - - return x + bias - - -def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr): - """Decreases the learning rate linearly""" - lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs))) - for param_group in optimizer.param_groups: - param_group['lr'] = lr - - -def init(module, weight_init, bias_init, gain=1): - weight_init(module.weight.data, gain=gain) - bias_init(module.bias.data) - return module - - -def cleanup_log_dir(log_dir): - - try: - os.makedirs(log_dir) - except: - pass - # files = glob.glob(os.path.join(log_dir, '*.monitor.csv')) - # for f in files: - # os.remove(f) diff --git a/examples/run_bo.py b/examples/run_bo.py index 030c464e..38aa3916 100644 --- a/examples/run_bo.py +++ b/examples/run_bo.py @@ -1,21 +1,26 @@ import random import numpy as np +import argparse from bo.run import run_bo +from ppo.args import add_ppo_args if __name__ == '__main__': seed = 0 random.seed(seed) np.random.seed(seed) + + parser = argparse.ArgumentParser(description='Arguments for ga script') + parser.add_argument('--exp_name', type=str, default='test_bo', help='Name of the experiment (default: test_bo)') + parser.add_argument('--env_name', type=str, default='Walker-v0', help='Name of the environment (default: Walker-v0)') + parser.add_argument('--pop_size', type=int, default=3, help='Population size (default: 3)') + parser.add_argument('--structure_shape', type=tuple, default=(5,5), help='Shape of the structure (default: (5,5))') + parser.add_argument('--max_evaluations', type=int, default=6, help='Maximum number of robots that will be evaluated (default: 6)') + parser.add_argument('--num_cores', type=int, default=3, help='Number of robots to evaluate simultaneously (default: 3)') + add_ppo_args(parser) + args = parser.parse_args() - best_robot, best_fitness = run_bo( - experiment_name='test_bo', - structure_shape=(5, 5), - pop_size=3, - max_evaluations=6, - train_iters=50, - num_cores=3, - ) + best_robot, best_fitness = run_bo(args) print('Best robot:') print(best_robot) diff --git a/examples/run_cppn_neat.py b/examples/run_cppn_neat.py index 18437c85..8f21cf94 100644 --- a/examples/run_cppn_neat.py +++ b/examples/run_cppn_neat.py @@ -1,21 +1,26 @@ import random import numpy as np +import argparse from cppn_neat.run import run_cppn_neat +from ppo.args import add_ppo_args if __name__ == '__main__': seed = 0 random.seed(seed) np.random.seed(seed) + + parser = argparse.ArgumentParser(description='Arguments for ga script') + parser.add_argument('--exp_name', type=str, default='test_cppn', help='Name of the experiment (default: test_cppn)') + parser.add_argument('--env_name', type=str, default='Walker-v0', help='Name of the environment (default: Walker-v0)') + parser.add_argument('--pop_size', type=int, default=3, help='Population size (default: 3)') + parser.add_argument('--structure_shape', type=tuple, default=(5,5), help='Shape of the structure (default: (5,5))') + parser.add_argument('--max_evaluations', type=int, default=6, help='Maximum number of robots that will be evaluated (default: 6)') + parser.add_argument('--num_cores', type=int, default=3, help='Number of robots to evaluate simultaneously (default: 3)') + add_ppo_args(parser) + args = parser.parse_args() - best_robot, best_fitness = 
run_cppn_neat( - experiment_name='test_cppn', - structure_shape=(5, 5), - pop_size=3, - max_evaluations=6, - train_iters=50, - num_cores=3, - ) + best_robot, best_fitness = run_cppn_neat(args) print('Best robot:') print(best_robot) diff --git a/examples/run_ga.py b/examples/run_ga.py index 10e36450..63bbdd26 100644 --- a/examples/run_ga.py +++ b/examples/run_ga.py @@ -1,18 +1,23 @@ import random import numpy as np +import argparse from ga.run import run_ga +from ppo.args import add_ppo_args if __name__ == "__main__": seed = 0 random.seed(seed) np.random.seed(seed) - run_ga( - pop_size = 3, - structure_shape = (5,5), - experiment_name = "test_ga", - max_evaluations = 6, - train_iters = 50, - num_cores = 3, - ) \ No newline at end of file + parser = argparse.ArgumentParser(description='Arguments for ga script') + parser.add_argument('--exp_name', type=str, default='test_ga', help='Name of the experiment (default: test_ga)') + parser.add_argument('--env_name', type=str, default='Walker-v0', help='Name of the environment (default: Walker-v0)') + parser.add_argument('--pop_size', type=int, default=3, help='Population size (default: 3)') + parser.add_argument('--structure_shape', type=tuple, default=(5,5), help='Shape of the structure (default: (5,5))') + parser.add_argument('--max_evaluations', type=int, default=6, help='Maximum number of robots that will be evaluated (default: 6)') + parser.add_argument('--num_cores', type=int, default=3, help='Number of robots to evaluate simultaneously (default: 3)') + add_ppo_args(parser) + args = parser.parse_args() + + run_ga(args) \ No newline at end of file diff --git a/examples/run_group_ppo.py b/examples/run_group_ppo.py index 87c66e98..d00f8348 100644 --- a/examples/run_group_ppo.py +++ b/examples/run_group_ppo.py @@ -9,13 +9,11 @@ name = 'big', robots = ['speed_bot', 'carry_bot'], envs = ['Walker-v0', 'Carrier-v0'], - train_iters = 50 ), SimJob( name = 'small', robots = ['speed_bot'], envs = ['Walker-v0'], - train_iters = 50 ) ] ) \ No newline at end of file diff --git a/examples/run_ppo.py b/examples/run_ppo.py new file mode 100644 index 00000000..4b0908f9 --- /dev/null +++ b/examples/run_ppo.py @@ -0,0 +1,79 @@ +import os +import shutil +import json +import argparse +import numpy as np +from stable_baselines3 import PPO +import evogym.envs +from evogym import WorldObject + +from new_ppo.args import add_ppo_args +from new_ppo.run import run_ppo +from new_ppo.eval import eval_policy + +if __name__ == "__main__": + + # Args + parser = argparse.ArgumentParser(description='Arguments for PPO script') + parser.add_argument( + "--env-name", default="Walker-v0", type=str, help="Environment name (default: Walker-v0)" + ) + parser.add_argument( + "--save-dir", default="saved_data", type=str, help="Directory to save agent logs (default: saved_data)" + ) + parser.add_argument( + "--exp-name", default="ppo_test", type=str, help="Name of the model to save (default: ppo_test)" + ) + parser.add_argument( + "--robot-path", default=os.path.join("world_data", "speed_bot.json"), type=str, help="Path to the robot json file (default: world_data/speed_bot.json)" + ) + add_ppo_args(parser) + args = parser.parse_args() + + # Manage dirs + exp_dir = os.path.join(args.save_dir, args.exp_name) + if os.path.exists(exp_dir): + print(f'THIS EXPERIMENT ({args.exp_name}) ALREADY EXISTS') + print("Delete and override? 
(y/n): ", end="") + ans = input() + if ans.lower() != "y": + exit() + shutil.rmtree(exp_dir) + model_save_dir = os.path.join(args.save_dir, args.exp_name, "controller") + structure_save_dir = os.path.join(args.save_dir, args.exp_name, "structure") + save_name = f"{args.env_name}" + + # Get Robot + robot = WorldObject.from_json(args.robot_path) + os.makedirs(structure_save_dir, exist_ok=True) + np.savez(os.path.join(structure_save_dir, save_name), robot.get_structure(), robot.get_connections()) + + # Train + best_reward = run_ppo( + args=args, + body=robot.get_structure(), + connections=robot.get_connections(), + env_name=args.env_name, + model_save_dir=model_save_dir, + model_save_name=save_name, + ) + + # Save result file + with open(os.path.join(args.save_dir, args.exp_name, "ppo_result.json"), "w") as f: + json.dump({ + "best_reward": best_reward, + "env_name": args.env_name, + }, f, indent=4) + + # Evaluate + model = PPO.load(os.path.join(model_save_dir, save_name)) + rewards = eval_policy( + model=model, + body=robot.get_structure(), + connections=robot.get_connections(), + env_name=args.env_name, + n_evals=1, + n_envs=1, + render_mode="human", + ) + print(f"Mean reward: {np.mean(rewards)}") \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index ee64a479..031084c3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,20 +1,24 @@ -glfw==2.5.0 -GPy==1.10.0 -GPyOpt @ git+https://github.com/yunshengtian/GPyOpt@5fc1188ffdefea9a3bc7964a9414d4922603e904 -gym==0.22.0 -h5py==3.6.0 -imageio==2.14.1 -matplotlib==3.5.1 +# glfw==2.5.0 +# GPy==1.10.0 +# gym==0.22.0 +# h5py==3.6.0 +# imageio==2.14.1 +# matplotlib==3.5.1 +# neat-python @ git+https://github.com/yunshengtian/neat-python@2762ab630838520ca6c03a866e8a158f592b0370 +# numpy==1.21.5 +# opencv-python==4.5.5.62 +# Pillow==9.0.0 +# pybind11==2.9.0 +# pygifsicle==1.0.5 +# PyOpenGL==3.1.5 +# PyOpenGL-accelerate==3.1.5 +# torch==1.10.2 +# ttkbootstrap==1.5.1 +# typing==3.7.4.3 + neat-python @ git+https://github.com/yunshengtian/neat-python@2762ab630838520ca6c03a866e8a158f592b0370 -numpy==1.21.5 -opencv-python==4.5.5.62 -Pillow==9.0.0 -pybind11==2.9.0 -pygifsicle==1.0.5 -PyOpenGL==3.1.5 -PyOpenGL-accelerate==3.1.5 -torch==1.10.2 -ttkbootstrap==1.5.1 -typing==3.7.4.3 +GPyOpt @ git+https://github.com/yunshengtian/GPyOpt@5fc1188ffdefea9a3bc7964a9414d4922603e904 +stable-baselines3 + pytest -pytest-xdist +pytest-xdist \ No newline at end of file