From be20a86ca9424057868ae2deeb17047015b2e48a Mon Sep 17 00:00:00 2001 From: Janice Lan Date: Tue, 9 Apr 2024 13:48:51 -0700 Subject: [PATCH] set wandb as default logger (#647) --- TRAIN.md | 6 +++--- configs/is2re/100k/base.yml | 2 +- configs/is2re/10k/base.yml | 2 +- configs/is2re/all/base.yml | 2 +- configs/ocp_example.yml | 2 +- configs/s2ef/200k/base.yml | 2 +- configs/s2ef/20M/base.yml | 2 +- configs/s2ef/2M/base.yml | 2 +- configs/s2ef/2M/dimenet_plus_plus/dpp_relax.yml | 2 +- configs/s2ef/all/base.yml | 2 +- configs/s2ef/all/dimenet_plus_plus/dpp10.7M_forceonly.yml | 2 +- configs/s2ef/all/dimenet_plus_plus/dpp_energyonly.yml | 2 +- configs/s2ef/all/dimenet_plus_plus/dpp_forceonly.yml | 2 +- ocpmodels/common/registry.py | 4 ++-- ocpmodels/common/utils.py | 2 +- ocpmodels/modules/scaling/fit.py | 2 +- ocpmodels/trainers/base_trainer.py | 2 +- ocpmodels/trainers/ocp_trainer.py | 4 ++-- 18 files changed, 22 insertions(+), 22 deletions(-) diff --git a/TRAIN.md b/TRAIN.md index 03719e23d..79ec44b9a 100644 --- a/TRAIN.md +++ b/TRAIN.md @@ -106,8 +106,8 @@ To train a SchNet model for the IS2RE task on the 10k split, run: python main.py --mode train --config-yml configs/is2re/10k/schnet/schnet.yml ``` -Training logs are stored in `logs/tensorboard/[TIMESTAMP]` where `[TIMESTAMP]` is -the starting time-stamp of the run. You can monitor the training process by running: +Training logs are stored in `logs/wandb/[TIMESTAMP]` or `logs/tensorboard/[TIMESTAMP]` where `[TIMESTAMP]` is +the starting time-stamp of the run. For tensorboard, you can monitor the training process by running: ```bash tensorboard --logdir logs/tensorboard/[TIMESTAMP] ``` @@ -187,7 +187,7 @@ To train a SchNet model for the S2EF task on the 2M split using 2 GPUs, run: python -u -m torch.distributed.launch --nproc_per_node=2 main.py \ --mode train --config-yml configs/s2ef/2M/schnet/schnet.yml --num-gpus 2 --distributed ``` -Similar to the IS2RE task, tensorboard logs are stored in `logs/tensorboard/[TIMESTAMP]` and the +Similar to the IS2RE task, logs are stored in `logs/wandb/[TIMESTAMP]` or `logs/tensorboard/[TIMESTAMP]` and the checkpoint is stored in `checkpoints/[TIMESTAMP]/checkpoint.pt`. Next, run this model on the test data: diff --git a/configs/is2re/100k/base.yml b/configs/is2re/100k/base.yml index a76c88c2c..ccfe9575b 100755 --- a/configs/is2re/100k/base.yml +++ b/configs/is2re/100k/base.yml @@ -7,7 +7,7 @@ dataset: target_std: 2.279365062713623 - src: data/is2re/all/val_id/data.lmdb -logger: tensorboard +logger: wandb task: dataset: single_point_lmdb diff --git a/configs/is2re/10k/base.yml b/configs/is2re/10k/base.yml index 07e75c025..ea44c4f57 100755 --- a/configs/is2re/10k/base.yml +++ b/configs/is2re/10k/base.yml @@ -7,7 +7,7 @@ dataset: target_std: 2.279365062713623 - src: data/is2re/all/val_id/data.lmdb -logger: tensorboard +logger: wandb task: dataset: single_point_lmdb diff --git a/configs/is2re/all/base.yml b/configs/is2re/all/base.yml index cf61f8309..cfd817ffc 100755 --- a/configs/is2re/all/base.yml +++ b/configs/is2re/all/base.yml @@ -7,7 +7,7 @@ dataset: target_std: 2.279365062713623 - src: data/is2re/all/val_id/data.lmdb -logger: tensorboard +logger: wandb task: dataset: single_point_lmdb diff --git a/configs/ocp_example.yml b/configs/ocp_example.yml index 439e0fb07..5904f97e4 100644 --- a/configs/ocp_example.yml +++ b/configs/ocp_example.yml @@ -112,7 +112,7 @@ task: # since this can be significantly slower. set_deterministic_scatter: False # True or False -logger: tensorboard # 'wandb' or 'tensorboard' +logger: wandb # 'wandb' or 'tensorboard' loss_functions: # Specify the different terms in the loss function. For each term, the target property must diff --git a/configs/s2ef/200k/base.yml b/configs/s2ef/200k/base.yml index 5c5962496..8c8703e8f 100755 --- a/configs/s2ef/200k/base.yml +++ b/configs/s2ef/200k/base.yml @@ -9,7 +9,7 @@ dataset: grad_target_std: 2.887317180633545 - src: data/s2ef/all/val_id/ -logger: tensorboard +logger: wandb task: dataset: trajectory_lmdb diff --git a/configs/s2ef/20M/base.yml b/configs/s2ef/20M/base.yml index 2dc86b8a2..14ec01bcb 100755 --- a/configs/s2ef/20M/base.yml +++ b/configs/s2ef/20M/base.yml @@ -9,7 +9,7 @@ dataset: grad_target_std: 2.887317180633545 - src: data/s2ef/all/val_id/ -logger: tensorboard +logger: wandb task: dataset: trajectory_lmdb diff --git a/configs/s2ef/2M/base.yml b/configs/s2ef/2M/base.yml index 495341083..4c39b96e0 100755 --- a/configs/s2ef/2M/base.yml +++ b/configs/s2ef/2M/base.yml @@ -9,7 +9,7 @@ dataset: grad_target_std: 2.887317180633545 - src: data/s2ef/all/val_id/ -logger: tensorboard +logger: wandb task: dataset: trajectory_lmdb diff --git a/configs/s2ef/2M/dimenet_plus_plus/dpp_relax.yml b/configs/s2ef/2M/dimenet_plus_plus/dpp_relax.yml index 521a802e3..b5d52bd23 100755 --- a/configs/s2ef/2M/dimenet_plus_plus/dpp_relax.yml +++ b/configs/s2ef/2M/dimenet_plus_plus/dpp_relax.yml @@ -9,7 +9,7 @@ dataset: grad_target_std: 2.887317180633545 - src: data/s2ef/all/val_id/ -logger: tensorboard +logger: wandb task: dataset: trajectory_lmdb diff --git a/configs/s2ef/all/base.yml b/configs/s2ef/all/base.yml index 3a81152c1..712a596db 100755 --- a/configs/s2ef/all/base.yml +++ b/configs/s2ef/all/base.yml @@ -9,7 +9,7 @@ dataset: grad_target_std: 2.887317180633545 - src: data/s2ef/all/val_id/ -logger: tensorboard +logger: wandb task: dataset: trajectory_lmdb diff --git a/configs/s2ef/all/dimenet_plus_plus/dpp10.7M_forceonly.yml b/configs/s2ef/all/dimenet_plus_plus/dpp10.7M_forceonly.yml index add753944..afbc15a61 100755 --- a/configs/s2ef/all/dimenet_plus_plus/dpp10.7M_forceonly.yml +++ b/configs/s2ef/all/dimenet_plus_plus/dpp10.7M_forceonly.yml @@ -9,7 +9,7 @@ dataset: grad_target_std: 2.887317180633545 - src: data/s2ef/all/val_id/ -logger: tensorboard +logger: wandb task: dataset: trajectory_lmdb diff --git a/configs/s2ef/all/dimenet_plus_plus/dpp_energyonly.yml b/configs/s2ef/all/dimenet_plus_plus/dpp_energyonly.yml index c4157c40e..347fdabae 100755 --- a/configs/s2ef/all/dimenet_plus_plus/dpp_energyonly.yml +++ b/configs/s2ef/all/dimenet_plus_plus/dpp_energyonly.yml @@ -9,7 +9,7 @@ dataset: grad_target_std: 2.887317180633545 - src: data/s2ef/all/val_id/ -logger: tensorboard +logger: wandb task: dataset: trajectory_lmdb diff --git a/configs/s2ef/all/dimenet_plus_plus/dpp_forceonly.yml b/configs/s2ef/all/dimenet_plus_plus/dpp_forceonly.yml index 75a0b6ed6..8f21f7c2c 100755 --- a/configs/s2ef/all/dimenet_plus_plus/dpp_forceonly.yml +++ b/configs/s2ef/all/dimenet_plus_plus/dpp_forceonly.yml @@ -9,7 +9,7 @@ dataset: grad_target_std: 2.887317180633545 - src: data/s2ef/all/val_id/ -logger: tensorboard +logger: wandb task: dataset: trajectory_lmdb diff --git a/ocpmodels/common/registry.py b/ocpmodels/common/registry.py index 854b39819..0bd1cae8e 100644 --- a/ocpmodels/common/registry.py +++ b/ocpmodels/common/registry.py @@ -140,8 +140,8 @@ def register_logger(cls, name: str): from ocpmodels.common.registry import registry - @registry.register_logger("tensorboard") - class WandB(): + @registry.register_logger("wandb") + class WandBLogger(): ... """ diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index bdc1544d1..d6d62287c 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -1028,7 +1028,7 @@ class _TrainingContext: is_debug=config.get("is_debug", False), print_every=config.get("print_every", 10), seed=config.get("seed", 0), - logger=config.get("logger", "tensorboard"), + logger=config.get("logger", "wandb"), local_rank=config["local_rank"], amp=config.get("amp", False), cpu=config.get("cpu", False), diff --git a/ocpmodels/modules/scaling/fit.py b/ocpmodels/modules/scaling/fit.py index b8d816492..676ee8956 100644 --- a/ocpmodels/modules/scaling/fit.py +++ b/ocpmodels/modules/scaling/fit.py @@ -46,7 +46,7 @@ def main(*, num_batches: int = 16) -> None: parser = flags.get_parser() args, override_args = parser.parse_known_args() _config = build_config(args, override_args) - _config["logger"] = "tensorboard" + _config["logger"] = "wandb" # endregion assert not args.distributed, "This doesn't work with DDP" diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 0f981c34c..30e80946b 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -62,7 +62,7 @@ def __init__( is_debug: bool = False, print_every: int = 100, seed: Optional[int] = None, - logger: str = "tensorboard", + logger: str = "wandb", local_rank: int = 0, amp: bool = False, cpu: bool = False, diff --git a/ocpmodels/trainers/ocp_trainer.py b/ocpmodels/trainers/ocp_trainer.py index f6ba8b834..0e540805b 100644 --- a/ocpmodels/trainers/ocp_trainer.py +++ b/ocpmodels/trainers/ocp_trainer.py @@ -56,7 +56,7 @@ class OCPTrainer(BaseTrainer): seed (int, optional): Random number seed. (default: :obj:`None`) logger (str, optional): Type of logger to be used. - (default: :obj:`tensorboard`) + (default: :obj:`wandb`) local_rank (int, optional): Local rank of the process, only applicable for distributed training. (default: :obj:`0`) amp (bool, optional): Run using automatic mixed precision. @@ -81,7 +81,7 @@ def __init__( is_debug=False, print_every=100, seed=None, - logger="tensorboard", + logger="wandb", local_rank=0, amp=False, cpu=False,