From 5a95d6686ca0fdd24573965cd787e848d57c9988 Mon Sep 17 00:00:00 2001
From: Logan Ward
Date: Mon, 10 Jun 2024 09:42:30 -0400
Subject: [PATCH] Add more testing scripts (#135)

* Increase training set size as needed
* Initial CP2K test
* Fix how we access charges
* Fix typos, modules in configs
* Enforce single XPU training (#134)
* Remove duplicate import statements
---
 component-tests/cp2k/README.md       |   5 +
 component-tests/cp2k/run_test.py     | 183 +++++++++++++++++++++++++++
 component-tests/training/run_test.py |  31 ++---
 envs/build-aurora.sh                 |   2 +-
 envs/environment-aurora.yml          |   9 ++
 mofa/difflinker_train.py             |   7 +
 6 files changed, 213 insertions(+), 24 deletions(-)
 create mode 100644 component-tests/cp2k/README.md
 create mode 100644 component-tests/cp2k/run_test.py

diff --git a/component-tests/cp2k/README.md b/component-tests/cp2k/README.md
new file mode 100644
index 00000000..b793a78b
--- /dev/null
+++ b/component-tests/cp2k/README.md
@@ -0,0 +1,5 @@
+# CP2K
+
+Test CP2K runtime as a function of layout (nodes, ranks per node)
+and problem size (number of steps).
+Measure the change in the structure and charges with problem size.
diff --git a/component-tests/cp2k/run_test.py b/component-tests/cp2k/run_test.py
new file mode 100644
index 00000000..edec5266
--- /dev/null
+++ b/component-tests/cp2k/run_test.py
@@ -0,0 +1,183 @@
+"""Test CP2K by running structure optimizations with different run lengths and node layouts"""
+from concurrent.futures import as_completed
+from pathlib import Path
+from platform import node
+import argparse
+import json
+
+from tqdm import tqdm
+from ase import Atoms
+import parsl
+from parsl.config import Config
+from parsl.app.python import PythonApp
+from parsl.executors import HighThroughputExecutor
+from parsl.providers import PBSProProvider
+from parsl.launchers import SimpleLauncher
+
+from mofa.model import MOFRecord
+from mofa.scoring.geometry import LatticeParameterChange
+from mofa.simulation.cp2k import compute_partial_charges
+from mofa.utils.conversions import write_to_string
+
+
+def test_function(strc: MOFRecord, cp2k_invocation: str, steps: int) -> tuple[float, tuple[Atoms, Path]]:
+    """Run a CP2K optimization, report runtime and the resultant structure
+
+    Args:
+        strc: MOF to use
+        cp2k_invocation: Command to invoke CP2K
+        steps: Number of optimization steps
+    Returns:
+        - Runtime (s)
+        - Relaxed structure and the path to the run directory
+    """
+    from mofa.simulation.cp2k import CP2KRunner
+    from time import perf_counter
+    from pathlib import Path
+
+    run_dir = Path(f'run-{steps}')
+    run_dir.mkdir(exist_ok=True, parents=True)
+
+    # Run
+    runner = CP2KRunner(cp2k_invocation, run_dir=run_dir)
+    start_time = perf_counter()
+    output = runner.run_optimization(strc, steps=steps)
+    run_time = perf_counter() - start_time
+
+    return run_time, output
+
+
+if __name__ == "__main__":
+    # Get the length of the runs, etc
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--ranks-per-node', help='Number of CP2K ranks to deploy per node', type=int, default=4)
+    parser.add_argument('--num-nodes', help='Number of nodes to use per computation', type=int, default=1)
+    parser.add_argument('--steps', help='Number of optimization steps to run', default=4, type=int)
+    parser.add_argument('--num-to-run', help='Number of MOFs to evaluate', default=4, type=int)
+    parser.add_argument('--config', help='Which compute configuration to use', default='local')
+    args = parser.parse_args()
+
+    # Select the correct configuration
+    if args.config == "local":
+        assert args.num_nodes == 1, 'Only support 1 node for local config'
+        cp2k_cmd = (f'env OMP_NUM_THREADS={12 // args.ranks_per_node} /usr/bin/mpiexec -np {args.ranks_per_node}'
+                    f' /home/lward/Software/cp2k-lward-fork/exe/local/cp2k_shell.psmp')
+        config = Config(executors=[HighThroughputExecutor(max_workers=1)])
+    elif args.config == "polaris":
+        cp2k_cmd = (f'mpiexec -n {args.num_nodes * args.ranks_per_node} --ppn {args.ranks_per_node}'
+                    f' --cpu-bind depth --depth {32 // args.ranks_per_node} -env OMP_NUM_THREADS={32 // args.ranks_per_node} '
+                    '/lus/eagle/projects/ExaMol/cp2k-2024.1/set_affinity_gpu_polaris.sh '
+                    '/lus/eagle/projects/ExaMol/cp2k-2024.1/exe/local_cuda/cp2k_shell.psmp')
+        config = Config(retries=4, executors=[
+            HighThroughputExecutor(
+                max_workers=4,
+                provider=PBSProProvider(
+                    launcher=SimpleLauncher(),
+                    account='ExaMol',
+                    queue='debug',
+                    select_options="ngpus=4",
+                    scheduler_options="#PBS -l filesystems=home:eagle",
+                    worker_init="""
+module load kokkos
+module load nvhpc/23.3
+module list
+source activate /lus/eagle/projects/ExaMol/mofa/mof-generation-at-scale/env-polaris
+
+# Launch MPS daemon
+NNODES=`wc -l < $PBS_NODEFILE`
+mpiexec -n ${NNODES} --ppn 1 /lus/eagle/projects/ExaMol/mofa/mof-generation-at-scale/bin/enable_mps_polaris.sh &
+
+cd $PBS_O_WORKDIR
+pwd
+which python
+hostname
+                    """,
+                    nodes_per_block=1,
+                    init_blocks=1,
+                    min_blocks=0,
+                    max_blocks=1,
+                    cpus_per_node=32,
+                    walltime="1:00:00",
+                )
+            )
+        ])
+    elif args.config == "sunspot":
+        cp2k_cmd = (f'mpiexec -n {args.num_nodes * args.ranks_per_node} --ppn {args.ranks_per_node}'
+                    f' --cpu-bind depth --depth {104 // args.ranks_per_node} -env OMP_NUM_THREADS={104 // args.ranks_per_node} '
+                    '/lus/gila/projects/CSC249ADCD08_CNDA/cp2k/cp2k-2024.1/exe/local/cp2k_shell.psmp')
+        config = Config(
+            retries=2,
+            executors=[
+                HighThroughputExecutor(
+                    label="sunspot_test",
+                    prefetch_capacity=0,
+                    max_workers=1,
+                    provider=PBSProProvider(
+                        account="CSC249ADCD08_CNDA",
+                        queue="workq",
+                        worker_init="""
+source activate /lus/gila/projects/CSC249ADCD08_CNDA/mof-generation-at-scale/env
+module reset
+module use /soft/modulefiles/
+module use /home/ftartagl/graphics-compute-runtime/modulefiles
+module load oneapi/release/2023.12.15.001
+module load intel_compute_runtime/release/775.20
+module load mpich/gnu-all-debug-pmix-gpu/52.2
+module load gcc/12.2.0
+module load fftw
+module list
+
+cd $PBS_O_WORKDIR
+pwd
+which python
+hostname
+                        """,
+                        walltime="1:10:00",
+                        launcher=SimpleLauncher(),
+                        select_options="system=sunspot,place=scatter",
+                        nodes_per_block=1,
+                        min_blocks=0,
+                        max_blocks=1,  # Can increase more to have more parallel batch jobs
+                        cpus_per_node=208,
+                    ),
+                ),
+            ]
+        )
+    else:
+        raise ValueError(f'Configuration not defined: {args.config}')
+
+    # Prepare parsl
+    parsl.load(config)
+    test_app = PythonApp(test_function)
+
+    # Submit each MOF
+    futures = []
+    with open('../lammps-md/example-mofs.json') as fp:
+        for line, _ in zip(fp, range(args.num_to_run)):
+            mof = MOFRecord(**json.loads(line))
+            future = test_app(mof, cp2k_cmd, args.steps)
+            future.mof = mof
+            futures.append(future)
+
+    # Store results
+    for future in tqdm(as_completed(futures), total=len(futures)):
+        if future.exception() is not None:
+            print(f'{future.mof.name} failed: {future.exception()}')
+            continue
+        runtime, (atoms, run_path) = future.result()
+
+        # Get the partial charges
+        charges = compute_partial_charges(run_path).arrays['q']
+        # Store the result
+        with open('runtimes.json', 'a') as fp:
+            print(json.dumps({
+                'host': node(),
+                'nodes': args.num_nodes,
+                'ranks-per-node': args.ranks_per_node,
+                'cp2k_cmd': cp2k_cmd,
+                'steps': args.steps,
+                'mof': mof.name,
+                'runtime': runtime,
+                'charges': charges.tolist(),
+                'strc': write_to_string(atoms, 'vasp')
+            }), file=fp)
diff --git a/component-tests/training/run_test.py b/component-tests/training/run_test.py
index 2ab7d18d..a280cb96 100644
--- a/component-tests/training/run_test.py
+++ b/component-tests/training/run_test.py
@@ -1,5 +1,6 @@
 import json
 import argparse
+from itertools import cycle
 from pathlib import Path
 import gzip
 
@@ -10,7 +11,7 @@
 from parsl.app.python import PythonApp
 from parsl.executors import HighThroughputExecutor
 from parsl.providers import PBSProProvider
-from parsl.launchers import MpiExecLauncher
+from parsl.launchers import SimpleLauncher
 
 from mofa.model import MOFRecord
 
@@ -71,6 +72,8 @@ def test_function(model_path: Path, config_path: Path, training_set: list, num_e
         record = json.loads(line)
         record.pop('_id')
         training_set.append(MOFRecord(**record))
+    if len(training_set) < args.training_size:
+        training_set = [l for l, _ in zip(cycle(training_set), range(args.training_size))]
 
     # Select the correct configuraion
     if args.config == "local":
@@ -108,29 +111,12 @@ def test_function(model_path: Path, config_path: Path, training_set: list, num_e
             )
         ])
     elif args.config.startswith("sunspot"):
-        if args.config == "sunspot":
-            accel_ids = [
-                f"{gid}.{tid}"
-                for gid in range(6)
-                for tid in range(2)
-            ]
-        elif args.config == "sunspot-device":
-            accel_ids = [
-                f"{gid}.0,{gid}.1"
-                for gid in range(6)
-            ]
-        else:
-            raise ValueError(f'Not supported: {args.config}')
         config = Config(
-            retries=2,
             executors=[
                 HighThroughputExecutor(
                     label="sunspot_test",
-                    available_accelerators=accel_ids,  # Ensures one worker per accelerator
-                    cpu_affinity="block",  # Assigns cpus in sequential order
                     prefetch_capacity=0,
-                    max_workers=len(accel_ids),
-                    cores_per_worker=208 // len(accel_ids),
+                    max_workers=1,
                     provider=PBSProProvider(
                         account="CSC249ADCD08_CNDA",
                         queue="workq",
@@ -144,16 +130,15 @@ def test_function(model_path: Path, config_path: Path, training_set: list, num_e
 module load gcc/12.2.0
 module list
 
-{"" if len(accel_ids) == 12 else "export IPEX_TILE_AS_DEVICE=0"}
+python -c "import intel_extension_for_pytorch as ipex; print(ipex.xpu.device_count())"
+
 cd $PBS_O_WORKDIR
 pwd
 which python
 hostname
                         """,
                         walltime="1:10:00",
-                        launcher=MpiExecLauncher(
-                            bind_cmd="--cpu-bind", overrides="--depth=208 --ppn 1"
-                        ),  # Ensures 1 manger per node and allows it to divide work among all 208 threads
+                        launcher=SimpleLauncher(),
                         select_options="system=sunspot,place=scatter",
                         nodes_per_block=1,
                         min_blocks=0,
diff --git a/envs/build-aurora.sh b/envs/build-aurora.sh
index 70acd73d..49278616 100755
--- a/envs/build-aurora.sh
+++ b/envs/build-aurora.sh
@@ -14,7 +14,7 @@ conda activate ./env
 
 # Build torch_ccl locally
 #  Clone from: https://github.com/intel/torch-ccl
-cd libs/torch_ccl
+cd libs/torch-ccl
 COMPUTE_BACKEND=dpcpp pip install -e .
 
 # Now install Corey's stuff
diff --git a/envs/environment-aurora.yml b/envs/environment-aurora.yml
index a3c8957d..01e68dea 100644
--- a/envs/environment-aurora.yml
+++ b/envs/environment-aurora.yml
@@ -38,9 +38,18 @@ dependencies:
   - pytorch==2.1.0
   - intel-extension-for-pytorch==2.1.10
 
+  # Tools to build CCL locally
+  - conda-forge::cmake
+  - ninja
+
   - pip
   - pip:
     - git+https://gitlab.com/ase/ase.git
     - git+https://github.com/exalearn/colmena.git  # Fixes for streaming not yet on PyPI
+
+    # Install ccl manually for now, uncomment when SSL doesn't disagree between
+    #  the following wheel's version and Sunspot/Aurora
+    #- --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+    #  - oneccl_bind_pt==2.1.200+xpu
     - -e ..[test]
diff --git a/mofa/difflinker_train.py b/mofa/difflinker_train.py
index 303da7b4..974f4c2c 100644
--- a/mofa/difflinker_train.py
+++ b/mofa/difflinker_train.py
@@ -5,6 +5,7 @@
 
 from pytorch_lightning import Trainer, callbacks
 from pytorch_lightning.callbacks import TQDMProgressBar
+from pytorch_lightning.strategies import SingleDeviceStrategy
 
 try:
     import intel_extension_for_pytorch as ipex  # noqa: F401
@@ -150,6 +151,11 @@ def main(
     if '.' in args.train_data_prefix:
         context_node_nf += 1
 
+    # Lock XPU to single device for now
+    strategy = 'auto'
+    if args.device == 'xpu':
+        strategy = SingleDeviceStrategy(device='xpu')
+
     checkpoint_callback = [callbacks.ModelCheckpoint(
         dirpath=checkpoints_dir,
         filename='difflinker_{epoch:02d}',
@@ -164,6 +170,7 @@
         accelerator=args.device,
         num_sanity_val_steps=0,
         enable_progress_bar=args.enable_progress_bar,
+        strategy=strategy
     )
 
     # Add a callback for fit setup
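
Note on the single-XPU change above: the patch pins DiffLinker training to one XPU with `SingleDeviceStrategy` instead of letting Lightning spread work across tiles. The following is a minimal, self-contained sketch of that same pattern, not part of MOFA: `TinyModel` is an illustrative stand-in for DiffLinker, the CPU fallback is added so the sketch runs anywhere, and the `xpu` branch assumes an IPEX-enabled PyTorch build that exposes an XPU accelerator to Lightning, as the training script relies on.

    # Sketch: pin training to a single XPU (falls back to CPU without IPEX)
    import torch
    from torch.utils.data import DataLoader, TensorDataset
    from pytorch_lightning import LightningModule, Trainer
    from pytorch_lightning.strategies import SingleDeviceStrategy

    try:
        import intel_extension_for_pytorch as ipex  # noqa: F401
        has_xpu = hasattr(torch, 'xpu') and torch.xpu.device_count() > 0
    except ImportError:
        has_xpu = False


    class TinyModel(LightningModule):
        """Stand-in for DiffLinker: a one-layer regression model"""

        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(8, 1)

        def training_step(self, batch, batch_idx):
            x, y = batch
            return torch.nn.functional.mse_loss(self.layer(x), y)

        def configure_optimizers(self):
            return torch.optim.Adam(self.parameters(), lr=1e-3)


    if __name__ == '__main__':
        # Same pattern as difflinker_train.py: one device, no multi-tile spawning
        accelerator = 'xpu' if has_xpu else 'cpu'
        strategy = SingleDeviceStrategy(device='xpu') if has_xpu else 'auto'

        loader = DataLoader(TensorDataset(torch.randn(64, 8), torch.randn(64, 1)), batch_size=16)
        trainer = Trainer(max_epochs=1, accelerator=accelerator, strategy=strategy,
                          enable_progress_bar=False, logger=False)
        trainer.fit(TinyModel(), loader)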
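
The CP2K README frames the new test as measuring how runtime and partial charges change with the number of optimization steps, and the records that `component-tests/cp2k/run_test.py` appends to `runtimes.json` (one JSON object per line, with the keys written in the script) carry what that comparison needs. The short post-processing script below is a hypothetical companion, not part of the patch, that summarizes runtime per layout and the per-atom charge drift between the shortest and longest runs of each MOF.

    # Hypothetical post-processing for the runtimes.json written by run_test.py
    import json
    from collections import defaultdict

    import numpy as np


    def load_records(path='runtimes.json'):
        """Read the JSON-lines results file into a list of dicts"""
        with open(path) as fp:
            return [json.loads(line) for line in fp]


    def summarize(records):
        """Report runtime per layout and charge drift as a function of steps"""
        # Group charges by MOF so runs with different step counts can be compared
        by_mof = defaultdict(dict)
        for rec in records:
            by_mof[rec['mof']][rec['steps']] = np.array(rec['charges'])
            print(f"{rec['mof']}: {rec['steps']} steps on {rec['nodes']} node(s)"
                  f" x {rec['ranks-per-node']} ranks -> {rec['runtime']:.1f} s")

        # Largest per-atom charge change between the shortest and longest run
        for mof, by_steps in by_mof.items():
            if len(by_steps) < 2:
                continue
            lo, hi = min(by_steps), max(by_steps)
            drift = np.abs(by_steps[hi] - by_steps[lo]).max()
            print(f"{mof}: max |dq| between {lo} and {hi} steps = {drift:.3f} e")


    if __name__ == '__main__':
        summarize(load_records())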