Open-Catalyst-Project · jeevster · Apr 22, 2024 · May 15, 2024 · May 15, 2024 · May 15, 2024
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1,2 @@
+.pdb filter=lfs diff=lfs merge=lfs -text
+*.pdb filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
@@ -4,6 +4,7 @@ checkpoints
 results
 logs
 *.traj
+*.pdb
 experimental
 
 # Byte-compiled / optimized / DLL files

diff --git a/electrolytes/run_extraction.sh b/electrolytes/run_extraction.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Extract solvation shells from the example NaCl/water trajectory, once per solute atom name.
+# TODO: automatically extract the names of all solute atoms from the PDB file,
+# so that they do not have to be listed by hand in the loop below.
+
+# Run this script from om-data/electrolytes
+for solute_atom in 'NA0' 'CL0'; do
+    python solvation_shell_extract.py --pdb_file_path 'testfiles/water_nacl_example.pdb' \
+                                      --save_dir 'results' \
+                                      --system_name 'NaCl_Water' \
+                                      --solute_atom "$solute_atom" \
+                                      --min_coord 2 \
+                                      --max_coord 5 \
+                                      --top_n 20
+done
diff --git a/electrolytes/solvation_shell_extract.py b/electrolytes/solvation_shell_extract.py
@@ -0,0 +1,179 @@
+import os
+import logging
+import argparse
+import random
+import numpy as np
+from tqdm import tqdm
+from collections import defaultdict
+import matplotlib.pyplot as plt
+import MDAnalysis as mda
+from solvation_analysis.solute import Solute
+from solvation_analysis._column_names import *
+from pymatgen.core.structure import Molecule
+from solvation_shell_utils import filter_by_rmsd, wrap_positions
+
+
+def extract_solvation_shells(
+    pdb_file_path: str,
+    save_dir: str,
+    system_name: str,
+    solute_atom: str,
+    min_coord: int,
+    max_coord: int,
+    top_n: int,
+):
+    """
+    Given a MD trajectory in a PDB file, perform a solvation analysis
+    on the specified solute to extract the first solvation shell. For each coordination number in the specified range,
+    extract and save the top_n most diverse snapshots based on a RMSD criterion.
+
+    Output is written below ``<save_dir>/<system_name>/<solute_atom>/``: the RDF plot
+    ``solvation_rdf.png`` and, per coordination number, a ``coord=<c>`` directory holding
+    ``size<num_atoms>_selection<idx>.xyz`` files. Only shells whose atom count equals that of
+    the largest shell seen for that coordination number are written.
+
+    Args:
+        pdb_file_path: Path to the PDB file containing the MD trajectory
+        save_dir: Directory in which to save extracted solvation shells
+        system_name: Name of the system - used for naming the save directory
+        solute_atom: Name (in the PDB file) of the solute atom type (e.g NA0) with which to perform the solvation analysis
+        min_coord: Minimum coordination number to consider
+        max_coord: Maximum coordination number to consider (inclusive)
+        top_n: Number of snapshots to extract per coordination number.
+
+    Raises:
+        IndexError: If the second line of the PDB file has fewer than four whitespace-separated fields.
+        ValueError: If the box-length fields on that line are not numbers.
+    """
+
+    # Create save directory
+    solute_dir = os.path.join(save_dir, system_name, solute_atom)
+    os.makedirs(solute_dir, exist_ok=True)
+
+    # Initialize MDA Universe
+    universe = mda.Universe(pdb_file_path)
+
+    # Add PBC box. Only the first two lines are read, so that a large trajectory
+    # file is not loaded into memory just to obtain the box size.
+    # Assumes the second line holds the box lengths in fields 1-3 - TODO confirm.
+    with open(pdb_file_path) as file:
+        file.readline()
+        dimension_fields = file.readline().split()
+    box_lengths = [float(dimension_fields[i]) for i in (1, 2, 3)]
+    # Box angles are fixed at 90 degrees, i.e. an orthorhombic box is assumed.
+    universe.dimensions = [*box_lengths, 90, 90, 90]
+
+    # Shape [1, 1, 3], as expected by wrap_positions
+    lattices = np.array(box_lengths)[None][None]
+
+    # Choose solute atom
+    solu = universe.select_atoms(f"name {solute_atom}")
+
+    # NOTE(review): with a file-backed trajectory reader, these per-frame position
+    # edits may be discarded as soon as the next frame is loaded - confirm.
+    logging.info("Translating atoms to solute center of mass")
+    for ts in tqdm(universe.trajectory):
+        ts.dimensions = universe.dimensions
+        solu_center = solu.center_of_mass(wrap=True)
+        dim = ts.triclinic_dimensions
+        box_center = np.sum(dim, axis=0) / 2
+        universe.atoms.translate(box_center - solu_center)
+
+        universe.atoms.unwrap()
+
+    # Everything that is not the solute is treated as a single "solvent" species
+    solvent = universe.atoms - solu
+
+    solv_anal = Solute.from_atoms(solu, {"solvent": solvent}, solute_name=solute_atom)
+
+    # Identify the cutoff for the first solvation shell, based on the MD trajectory
+    logging.info("Running solvation analysis")
+    solv_anal.run()
+
+    # Plot the RDF
+    solv_anal.plot_solvation_radius("solute", "solvent")
+    plt.savefig(os.path.join(solute_dir, "solvation_rdf.png"))
+
+    # There's probably a much faster way to do this
+    # But for now, we're prototyping, so slow is okay
+    shells = dict()
+    for j in solv_anal.speciation.speciation_fraction["solvent"]:
+        shells[j] = solv_anal.speciation.get_shells({"solvent": j})
+
+    # Now let's try getting the most diverse structures for each particular coordination number
+    # This is also a bit slow, particularly for the more common and/or larger solvent shells
+    for coord_num in range(min_coord, max_coord + 1):
+        logging.info(f"Processing shells with coordination number {coord_num}")
+        if coord_num not in shells:
+            # A requested coordination number need not occur in the trajectory
+            logging.warning(
+                f"No shells with coordination number {coord_num} were found, skipping"
+            )
+            continue
+
+        coord_dir = os.path.join(solute_dir, f"coord={coord_num}")
+        os.makedirs(coord_dir, exist_ok=True)
+
+        shell_species = []
+        all_shell_positions = []
+        for index, _ in tqdm(shells[coord_num].iterrows()):
+            # index is (frame, solute index); indexing the trajectory is done for
+            # its side effect of loading that frame before positions are read
+            universe.trajectory[index[0]]
+            universe.atoms.unwrap()
+            shell = solv_anal.solvation_data.xs(
+                (index[0], index[1]), level=(FRAME, SOLUTE_IX)
+            )
+            shell = solv_anal._df_to_atom_group(shell, solute_index=index[1])
+            shell = shell.copy()
+            # Keep the element list of the largest shell seen so far
+            if len(shell.atoms.elements) > len(shell_species):
+                shell_species = shell.atoms.elements
+
+            all_shell_positions.append(
+                wrap_positions(shell.atoms.positions, lattices)
+            )
+
+        # filter_by_rmsd needs equally-shaped inputs, so group by number of atoms per shell
+        by_num_atoms = defaultdict(list)
+        for sps in all_shell_positions:
+            by_num_atoms[len(sps)].append(sps)
+
+        selections_by_num_atoms = {
+            num_atoms: filter_by_rmsd(shells_with_num_atoms, top_n)
+            for num_atoms, shells_with_num_atoms in by_num_atoms.items()
+        }
+
+        for shell_size, selected_positions in selections_by_num_atoms.items():
+            # shell_species only describes shells of the largest size, so the
+            # other sizes cannot be labelled with elements and are not written
+            if shell_size != len(shell_species):
+                continue
+            for idx, shell_pos in enumerate(selected_positions):
+                # Save shell as xyz file
+                # NOTE(review): charge is hard-coded to -1 for every solute, although
+                # run_extraction.sh calls this for both NA0 and CL0 - confirm.
+                mol = Molecule(shell_species, shell_pos, charge=-1)
+                mol.to(
+                    os.path.join(
+                        coord_dir, f"size{shell_size}_selection{idx}.xyz"
+                    ),
+                    "xyz",
+                )
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
+    # Fixed seed for the `random` module, which filter_by_rmsd uses to draw its seed point
+    random.seed(10)
+
+    # Every option except --top_n is mandatory; marking them required makes argparse
+    # report a missing option instead of passing None into the extraction code.
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--pdb_file_path", type=str, required=True, help="PDB trajectory file path"
+    )
+    parser.add_argument(
+        "--save_dir", type=str, required=True, help="Path to save xyz files"
+    )
+    parser.add_argument(
+        "--system_name",
+        type=str,
+        required=True,
+        help="Name of system used for directory naming",
+    )
+    parser.add_argument(
+        "--solute_atom",
+        type=str,
+        required=True,
+        help="Which solute atom to extract solvation shells for",
+    )
+    parser.add_argument(
+        "--min_coord",
+        type=int,
+        required=True,
+        help="Minimum shell coordination number to extract",
+    )
+    parser.add_argument(
+        "--max_coord",
+        type=int,
+        required=True,
+        help="Maximum shell coordination number to extract",
+    )
+    parser.add_argument(
+        "--top_n",
+        type=int,
+        default=20,
+        help="Number of most diverse shells to extract per coordination number",
+    )
+
+    args = parser.parse_args()
+
+    extract_solvation_shells(
+        args.pdb_file_path,
+        args.save_dir,
+        args.system_name,
+        args.solute_atom,
+        args.min_coord,
+        args.max_coord,
+        args.top_n,
+    )
diff --git a/electrolytes/solvation_shell_utils.py b/electrolytes/solvation_shell_utils.py
@@ -0,0 +1,53 @@
+import copy
+import itertools
+import numpy as np
+import random
+from rmsd import kabsch_rmsd
+
+
+def filter_by_rmsd(coords, n=20):
+    """
+    From a set of coordinates, determine the n most diverse, where "most diverse" means "most different, in terms of minimum RMSD".
+    We use the Kabsch Algorithm (https://en.wikipedia.org/wiki/Kabsch_algorithm) to align coordinates based on rotation/translation before computing the RMSD.
+    Note: The Max-Min Diversity Problem is in general NP-hard. This algorithm generates a candidate solution to MMDP for these coords
+    by assuming that the random seed point is actually in the MMDP set (which there's no reason a priori to assume). As a result, if we ran this function multiple times, we would get different results.
+
+    Starting from the random seed point, the candidate whose minimum RMSD to the already-selected
+    coordinates is largest is added repeatedly, until n are selected or no new candidate remains.
+
+    Args:
+        coords: list of np.ndarrays of atom coordinates. Must all have the same shape ([N_atoms, 3]), and must all reflect the same atom order!
+            Note that this latter requirement shouldn't be a problem, specifically when dealing with IonSolvR data.
+        n: number of most diverse coordinates to return
+
+    Returns:
+        List of at most n entries of coords, in the order in which they were selected (seed point first).
+        Empty if coords is empty or n is not positive.
+    """
+
+    # Nothing to select; also avoids random.randint(0, -1), which raises on empty input
+    if n <= 0 or len(coords) == 0:
+        return []
+
+    seed_point = random.randint(0, len(coords) - 1)
+    # A list rather than a set, so that the selection order is preserved
+    selected = [seed_point]
+    # min_rmsds[i] is the smallest RMSD between coords[i] and any selected coordinate
+    min_rmsds = np.array(
+        [kabsch_rmsd(coords[seed_point], coord, translate=True) for coord in coords]
+    )
+    for _ in range(n - 1):
+        best = int(np.argmax(min_rmsds))
+        if best in selected:
+            # Re-adding a selected point leaves min_rmsds unchanged, so every later
+            # iteration would pick the same point again: the selection is complete.
+            break
+        selected.append(best)
+        min_rmsds = np.minimum(
+            min_rmsds,
+            np.array(
+                [kabsch_rmsd(coords[best], coord, translate=True) for coord in coords]
+            ),
+        )
+
+    return [coords[i] for i in selected]
+
+
+def wrap_positions(positions, lattices):
+    """
+    Pull atoms that sit more than half a box length above another atom back by one box length.
+
+    For every ordered pair of atoms (i, j) and every axis, if atom i lies more than half the
+    box length above atom j along that axis, atom i is shifted down by one full box length
+    along that axis. Each (atom, axis) entry is shifted at most once.
+
+    Args:
+        positions: numpy array of positions, shape [N_atoms, 3]. Modified in place.
+        lattices: numpy array representing dimensions of simulation box, shape [1, 1, 3]
+    Returns:
+        The same positions array, after wrapping, shape [N_atoms, 3]
+    """
+    half_box = lattices / 2
+    # pairwise[i, j] = positions[i] - positions[j]
+    pairwise = positions[:, np.newaxis, :] - positions[np.newaxis, :, :]
+    atom_idx, _, axis_idx = np.nonzero(pairwise > half_box)
+    if atom_idx.size:
+        # Repeated (atom, axis) pairs are applied only once by in-place fancy indexing
+        positions[atom_idx, axis_idx] -= lattices[0, 0, axis_idx]
+    return positions
diff --git a/electrolytes/testfiles/water_nacl_example.pdb b/electrolytes/testfiles/water_nacl_example.pdb
diff --git a/setup.py b/setup.py
@@ -12,6 +12,15 @@
     description="Code for generating OMOL input configurations",
     url="http://github.com/Open-Catalyst-Project/om-data",
     packages=find_packages(),
-    install_requires=["ase", "quacc[sella]>=0.7.2"],
+    install_requires=[
+        "ase",
+        "quacc[sella]>=0.7.2",
+        "numpy",
+        "tqdm",
+        "rmsd",
+        "MDAnalysis",
+        "pymatgen",
+        "solvation-analysis",
+    ],
     include_package_data=True,
 )
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		.pdb filter=lfs diff=lfs merge=lfs -text
		*.pdb filter=lfs diff=lfs merge=lfs -text
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,6 +4,7 @@ checkpoints @@
     results
     logs
     *.traj
+    *.pdb
     experimental
     # Byte-compiled / optimized / DLL files
@@ Expand Down @@