From 5a95d6686ca0fdd24573965cd787e848d57c9988 Mon Sep 17 00:00:00 2001
From: Logan Ward
Date: Mon, 10 Jun 2024 09:42:30 -0400
Subject: [PATCH] Add more testing scripts (#135)

* Increase training set size as needed
* Initial CP2K test
* Fix how we access charges
* Fix typos, modules in configs
* Enforce single XPU training (#134)
* Remove duplicate import statements
---
 component-tests/cp2k/README.md       |   5 +
 component-tests/cp2k/run_test.py     | 183 +++++++++++++++++++++++++++
 component-tests/training/run_test.py |  31 ++---
 envs/build-aurora.sh                 |   2 +-
 envs/environment-aurora.yml          |   9 ++
 mofa/difflinker_train.py             |   7 +
 6 files changed, 213 insertions(+), 24 deletions(-)
 create mode 100644 component-tests/cp2k/README.md
 create mode 100644 component-tests/cp2k/run_test.py

diff --git a/component-tests/cp2k/README.md b/component-tests/cp2k/README.md
new file mode 100644
index 00000000..b793a78b
--- /dev/null
+++ b/component-tests/cp2k/README.md
@@ -0,0 +1,5 @@
+# CP2K
+
+Test CP2K runtime as a function of layout (nodes, ranks per node)
+and problem size (number of steps).
+Measure the change in the structure and charges with problem size.
diff --git a/component-tests/cp2k/run_test.py b/component-tests/cp2k/run_test.py
new file mode 100644
index 00000000..edec5266
--- /dev/null
+++ b/component-tests/cp2k/run_test.py
@@ -0,0 +1,183 @@
+"""Test CP2K by running structure optimizations with different run lengths and node layouts"""
+from concurrent.futures import as_completed
+from pathlib import Path
+from platform import node
+import argparse
+import json
+
+from tqdm import tqdm
+from ase import Atoms
+import parsl
+from parsl.config import Config
+from parsl.app.python import PythonApp
+from parsl.executors import HighThroughputExecutor
+from parsl.providers import PBSProProvider
+from parsl.launchers import SimpleLauncher
+
+from mofa.model import MOFRecord
+from mofa.scoring.geometry import LatticeParameterChange
+from mofa.simulation.cp2k import compute_partial_charges
+from mofa.utils.conversions import write_to_string
+
+
+def test_function(strc: MOFRecord, cp2k_invocation: str, steps: int) -> tuple[float, tuple[Atoms, Path]]:
+    """Run a CP2K optimization, report runtime and the resultant structure
+
+    Args:
+        strc: MOF to use
+        cp2k_invocation: Command to invoke CP2K
+        steps: Number of optimization steps
+    Returns:
+        - Runtime (s)
+        - Relaxed structure and the path to the run directory
+    """
+    from mofa.simulation.cp2k import CP2KRunner
+    from time import perf_counter
+    from pathlib import Path
+
+    run_dir = Path(f'run-{steps}')
+    run_dir.mkdir(exist_ok=True, parents=True)
+
+    # Run
+    runner = CP2KRunner(cp2k_invocation, run_dir=run_dir)
+    start_time = perf_counter()
+    output = runner.run_optimization(strc, steps=steps)
+    run_time = perf_counter() - start_time
+
+    return run_time, output
+
+
+if __name__ == "__main__":
+    # Get the length of the runs, etc
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--ranks-per-node', help='Number of CP2K ranks to deploy per node', type=int, default=4)
+    parser.add_argument('--num-nodes', help='Number of nodes to use per computation', type=int, default=1)
+    parser.add_argument('--steps', help='Number of optimization steps to run', default=4, type=int)
+    parser.add_argument('--num-to-run', help='Number of MOFs to evaluate', default=4, type=int)
+    parser.add_argument('--config', help='Which compute configuration to use', default='local')
+    args = parser.parse_args()
+
+    # Select the correct configuration
+    if args.config == "local":
+        assert args.num_nodes == 1, 'Only support 1 node for local config'
+        cp2k_cmd = (f'env OMP_NUM_THREADS={12 // args.ranks_per_node} /usr/bin/mpiexec -np {args.ranks_per_node}'
+                    f' /home/lward/Software/cp2k-lward-fork/exe/local/cp2k_shell.psmp')
+        config = Config(executors=[HighThroughputExecutor(max_workers=1)])
+    elif args.config == "polaris":
+        cp2k_cmd = (f'mpiexec -n {args.num_nodes * args.ranks_per_node} --ppn {args.ranks_per_node}'
+                    f' --cpu-bind depth --depth {32 // args.ranks_per_node} -env OMP_NUM_THREADS={32 // args.ranks_per_node} '
+                    '/lus/eagle/projects/ExaMol/cp2k-2024.1/set_affinity_gpu_polaris.sh '
+                    '/lus/eagle/projects/ExaMol/cp2k-2024.1/exe/local_cuda/cp2k_shell.psmp')
+        config = Config(retries=4, executors=[
+            HighThroughputExecutor(
+                max_workers=4,
+                provider=PBSProProvider(
+                    launcher=SimpleLauncher(),
+                    account='ExaMol',
+                    queue='debug',
+                    select_options="ngpus=4",
+                    scheduler_options="#PBS -l filesystems=home:eagle",
+                    worker_init="""
+module load kokkos
+module load nvhpc/23.3
+module list
+source activate /lus/eagle/projects/ExaMol/mofa/mof-generation-at-scale/env-polaris
+
+# Launch MPS daemon
+NNODES=`wc -l < $PBS_NODEFILE`
+mpiexec -n ${NNODES} --ppn 1 /lus/eagle/projects/ExaMol/mofa/mof-generation-at-scale/bin/enable_mps_polaris.sh &
+
+cd $PBS_O_WORKDIR
+pwd
+which python
+hostname
+                    """,
+                    nodes_per_block=1,
+                    init_blocks=1,
+                    min_blocks=0,
+                    max_blocks=1,
+                    cpus_per_node=32,
+                    walltime="1:00:00",
+                )
+            )
+        ])
+    elif args.config == "sunspot":
+        cp2k_cmd = (f'mpiexec -n {args.num_nodes * args.ranks_per_node} --ppn {args.ranks_per_node}'
+                    f' --cpu-bind depth --depth {104 // args.ranks_per_node} -env OMP_NUM_THREADS={104 // args.ranks_per_node} '
+                    '/lus/gila/projects/CSC249ADCD08_CNDA/cp2k/cp2k-2024.1/exe/local/cp2k_shell.psmp')
+        config = Config(
+            retries=2,
+            executors=[
+                HighThroughputExecutor(
+                    label="sunspot_test",
+                    prefetch_capacity=0,
+                    max_workers=1,
+                    provider=PBSProProvider(
+                        account="CSC249ADCD08_CNDA",
+                        queue="workq",
+                        worker_init="""
+source activate /lus/gila/projects/CSC249ADCD08_CNDA/mof-generation-at-scale/env
+module reset
+module use /soft/modulefiles/
+module use /home/ftartagl/graphics-compute-runtime/modulefiles
+module load oneapi/release/2023.12.15.001
+module load intel_compute_runtime/release/775.20
+module load mpich/gnu-all-debug-pmix-gpu/52.2
+module load gcc/12.2.0
+module load fftw
+module list
+
+cd $PBS_O_WORKDIR
+pwd
+which python
+hostname
+                        """,
+                        walltime="1:10:00",
+                        launcher=SimpleLauncher(),
+                        select_options="system=sunspot,place=scatter",
+                        nodes_per_block=1,
+                        min_blocks=0,
+                        max_blocks=1,  # Can increase more to have more parallel batch jobs
+                        cpus_per_node=208,
+                    ),
+                ),
+            ]
+        )
+    else:
+        raise ValueError(f'Configuration not defined: {args.config}')
+
+    # Prepare parsl
+    parsl.load(config)
+    test_app = PythonApp(test_function)
+
+    # Submit each MOF
+    futures = []
+    with open('../lammps-md/example-mofs.json') as fp:
+        for line, _ in zip(fp, range(args.num_to_run)):
+            mof = MOFRecord(**json.loads(line))
+            future = test_app(mof, cp2k_cmd, args.steps)
+            future.mof = mof
+            futures.append(future)
+
+    # Store results
+    for future in tqdm(as_completed(futures), total=len(futures)):
+        if future.exception() is not None:
+            print(f'{future.mof.name} failed: {future.exception()}')
+            continue
+        runtime, (atoms, run_path) = future.result()
+
+        # Get the partial charges
+        charges = compute_partial_charges(run_path).arrays['q']
+        # Store the result
+        with open('runtimes.json', 'a') as fp:
+            print(json.dumps({
+                'host': node(),
+                'nodes': args.num_nodes,
+                'ranks-per-node': args.ranks_per_node,
+                'cp2k_cmd': cp2k_cmd,
+                'steps': args.steps,
+                'mof': mof.name,
+                'runtime': runtime,
+                'charges': charges.tolist(),
+                'strc': write_to_string(atoms, 'vasp')
+            }), file=fp)
diff --git a/component-tests/training/run_test.py b/component-tests/training/run_test.py
index 2ab7d18d..a280cb96 100644
--- a/component-tests/training/run_test.py
+++ b/component-tests/training/run_test.py
@@ -1,5 +1,6 @@
 import json
 import argparse
+from itertools import cycle
 from pathlib import Path
 import gzip
 
@@ -10,7 +11,7 @@
 from parsl.app.python import PythonApp
 from parsl.executors import HighThroughputExecutor
 from parsl.providers import PBSProProvider
-from parsl.launchers import MpiExecLauncher
+from parsl.launchers import SimpleLauncher
 
 from mofa.model import MOFRecord
 
@@ -71,6 +72,8 @@ def test_function(model_path: Path, config_path: Path, training_set: list, num_e
         record = json.loads(line)
         record.pop('_id')
         training_set.append(MOFRecord(**record))
+    if len(training_set) < args.training_size:
+        training_set = [l for l, _ in zip(cycle(training_set), range(args.training_size))]
 
     # Select the correct configuraion
     if args.config == "local":
@@ -108,29 +111,12 @@ def test_function(model_path: Path, config_path: Path, training_set: list, num_e
             )
         ])
     elif args.config.startswith("sunspot"):
-        if args.config == "sunspot":
-            accel_ids = [
-                f"{gid}.{tid}"
-                for gid in range(6)
-                for tid in range(2)
-            ]
-        elif args.config == "sunspot-device":
-            accel_ids = [
-                f"{gid}.0,{gid}.1"
-                for gid in range(6)
-            ]
-        else:
-            raise ValueError(f'Not supported: {args.config}')
         config = Config(
-            retries=2,
             executors=[
                 HighThroughputExecutor(
                     label="sunspot_test",
-                    available_accelerators=accel_ids,  # Ensures one worker per accelerator
-                    cpu_affinity="block",  # Assigns cpus in sequential order
                     prefetch_capacity=0,
-                    max_workers=len(accel_ids),
-                    cores_per_worker=208 // len(accel_ids),
+                    max_workers=1,
                     provider=PBSProProvider(
                         account="CSC249ADCD08_CNDA",
                         queue="workq",
@@ -144,16 +130,15 @@ def test_function(model_path: Path, config_path: Path, training_set: list, num_e
 module load gcc/12.2.0
 module list
 
-{"" if len(accel_ids) == 12 else "export IPEX_TILE_AS_DEVICE=0"}
+python -c "import intel_extension_for_pytorch as ipex; print(ipex.xpu.device_count())"
+
 cd $PBS_O_WORKDIR
 pwd
 which python
 hostname
                         """,
                         walltime="1:10:00",
-                        launcher=MpiExecLauncher(
-                            bind_cmd="--cpu-bind", overrides="--depth=208 --ppn 1"
-                        ),  # Ensures 1 manger per node and allows it to divide work among all 208 threads
+                        launcher=SimpleLauncher(),
                         select_options="system=sunspot,place=scatter",
                         nodes_per_block=1,
                         min_blocks=0,
diff --git a/envs/build-aurora.sh b/envs/build-aurora.sh
index 70acd73d..49278616 100755
--- a/envs/build-aurora.sh
+++ b/envs/build-aurora.sh
@@ -14,7 +14,7 @@ conda activate ./env
 
 # Build torch_ccl locally
 #  Clone from: https://github.com/intel/torch-ccl
-cd libs/torch_ccl
+cd libs/torch-ccl
 COMPUTE_BACKEND=dpcpp pip install -e .
 
 # Now install Corey's stuff
diff --git a/envs/environment-aurora.yml b/envs/environment-aurora.yml
index a3c8957d..01e68dea 100644
--- a/envs/environment-aurora.yml
+++ b/envs/environment-aurora.yml
@@ -38,9 +38,18 @@ dependencies:
   - pytorch==2.1.0
   - intel-extension-for-pytorch==2.1.10
 
+  # Tools to build CCL locally
+  - conda-forge::cmake
+  - ninja
+
   - pip
   - pip:
     - git+https://gitlab.com/ase/ase.git
     - git+https://github.com/exalearn/colmena.git  # Fixes for streaming not yet on PyPI
+
+    # Install ccl manually for now, uncomment when SSL doesn't disagree between
+    #  the following wheel's version and Sunspot/Aurora
+    #- --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+    #  - oneccl_bind_pt==2.1.200+xpu
     - -e ..[test]
diff --git a/mofa/difflinker_train.py b/mofa/difflinker_train.py
index 303da7b4..974f4c2c 100644
--- a/mofa/difflinker_train.py
+++ b/mofa/difflinker_train.py
@@ -5,6 +5,7 @@
 
 from pytorch_lightning import Trainer, callbacks
 from pytorch_lightning.callbacks import TQDMProgressBar
+from pytorch_lightning.strategies import SingleDeviceStrategy
 
 try:
     import intel_extension_for_pytorch as ipex  # noqa: F401
@@ -150,6 +151,11 @@ def main(
     if '.' in args.train_data_prefix:
         context_node_nf += 1
 
+    # Lock XPU to single device for now
+    strategy = 'auto'
+    if args.device == 'xpu':
+        strategy = SingleDeviceStrategy(device='xpu')
+
     checkpoint_callback = [callbacks.ModelCheckpoint(
         dirpath=checkpoints_dir,
         filename='difflinker_{epoch:02d}',
@@ -164,6 +170,7 @@
         accelerator=args.device,
         num_sanity_val_steps=0,
         enable_progress_bar=args.enable_progress_bar,
+        strategy=strategy
     )
 
     # Add a callback for fit setup
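
Note on the single-XPU change above: the patch pins DiffLinker training to one XPU with `SingleDeviceStrategy` instead of letting Lightning spread work across tiles. The following is a minimal, self-contained sketch of that same pattern, not part of MOFA: `TinyModel` is an illustrative stand-in for DiffLinker, the CPU fallback is added so the sketch runs anywhere, and the `xpu` branch assumes an IPEX-enabled PyTorch build that exposes an XPU accelerator to Lightning, as the training script relies on.

    # Sketch: pin training to a single XPU (falls back to CPU without IPEX)
    import torch
    from torch.utils.data import DataLoader, TensorDataset
    from pytorch_lightning import LightningModule, Trainer
    from pytorch_lightning.strategies import SingleDeviceStrategy

    try:
        import intel_extension_for_pytorch as ipex  # noqa: F401
        has_xpu = hasattr(torch, 'xpu') and torch.xpu.device_count() > 0
    except ImportError:
        has_xpu = False


    class TinyModel(LightningModule):
        """Stand-in for DiffLinker: a one-layer regression model"""

        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(8, 1)

        def training_step(self, batch, batch_idx):
            x, y = batch
            return torch.nn.functional.mse_loss(self.layer(x), y)

        def configure_optimizers(self):
            return torch.optim.Adam(self.parameters(), lr=1e-3)


    if __name__ == '__main__':
        # Same pattern as difflinker_train.py: one device, no multi-tile spawning
        accelerator = 'xpu' if has_xpu else 'cpu'
        strategy = SingleDeviceStrategy(device='xpu') if has_xpu else 'auto'

        loader = DataLoader(TensorDataset(torch.randn(64, 8), torch.randn(64, 1)), batch_size=16)
        trainer = Trainer(max_epochs=1, accelerator=accelerator, strategy=strategy,
                          enable_progress_bar=False, logger=False)
        trainer.fit(TinyModel(), loader)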
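
The CP2K README frames the new test as measuring how runtime and partial charges change with the number of optimization steps, and the records that `component-tests/cp2k/run_test.py` appends to `runtimes.json` (one JSON object per line, with the keys written in the script) carry what that comparison needs. The short post-processing script below is a hypothetical companion, not part of the patch, that summarizes runtime per layout and the per-atom charge drift between the shortest and longest runs of each MOF.

    # Hypothetical post-processing for the runtimes.json written by run_test.py
    import json
    from collections import defaultdict

    import numpy as np


    def load_records(path='runtimes.json'):
        """Read the JSON-lines results file into a list of dicts"""
        with open(path) as fp:
            return [json.loads(line) for line in fp]


    def summarize(records):
        """Report runtime per layout and charge drift as a function of steps"""
        # Group charges by MOF so runs with different step counts can be compared
        by_mof = defaultdict(dict)
        for rec in records:
            by_mof[rec['mof']][rec['steps']] = np.array(rec['charges'])
            print(f"{rec['mof']}: {rec['steps']} steps on {rec['nodes']} node(s)"
                  f" x {rec['ranks-per-node']} ranks -> {rec['runtime']:.1f} s")

        # Largest per-atom charge change between the shortest and longest run
        for mof, by_steps in by_mof.items():
            if len(by_steps) < 2:
                continue
            lo, hi = min(by_steps), max(by_steps)
            drift = np.abs(by_steps[hi] - by_steps[lo]).max()
            print(f"{mof}: max |dq| between {lo} and {hi} steps = {drift:.3f} e")


    if __name__ == '__main__':
        summarize(load_records())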