From c8cea0517b5f0b7a7d40666796cc84a1397cfeff Mon Sep 17 00:00:00 2001 From: Alain Vaucher Date: Fri, 10 Feb 2023 15:48:03 +0100 Subject: [PATCH] Make canonicalize_any more flexible (#21) --- src/rxn/chemutils/miscellaneous.py | 57 +++++++++++++++++++++-- src/rxn/chemutils/scripts/canonicalize.py | 2 + src/rxn/chemutils/scripts/tokenize.py | 2 + tests/test_miscellaneous.py | 19 ++++++++ 4 files changed, 77 insertions(+), 3 deletions(-) diff --git a/src/rxn/chemutils/miscellaneous.py b/src/rxn/chemutils/miscellaneous.py index ad4d764..f2b1fd3 100644 --- a/src/rxn/chemutils/miscellaneous.py +++ b/src/rxn/chemutils/miscellaneous.py @@ -1,3 +1,4 @@ +import logging import re import typing from collections import Counter @@ -5,6 +6,12 @@ from typing import Callable, List from rdkit.Chem import AddHs, Atom, Mol +from rxn.utilities.files import ( + PathLike, + dump_list_to_file, + iterate_lines_from_file, + raise_if_paths_are_identical, +) from .conversion import canonicalize_smiles, smiles_to_mol from .exceptions import InvalidSmiles @@ -27,6 +34,9 @@ to_reaction_smiles, ) +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) + CHIRAL_CENTER_PATTERN = re.compile( r"\[([^],@]+)@+([^]]*)]" ) # Matches stereo centres, and groups what comes before and after "@" @@ -180,7 +190,12 @@ def apply_to_smiles_groups( return list_to_multicomponent_smiles(fn(compounds), fragment_bond="~") -def canonicalize_any(any_smiles: str, check_valence: bool = True) -> str: +def canonicalize_any( + any_smiles: str, + check_valence: bool = True, + sort_molecules: bool = False, + fallback_value: typing.Optional[str] = None, +) -> str: """ Canonicalize any SMILES string (molecule SMILES, multicomponent SMILES, reaction SMILES). @@ -189,6 +204,9 @@ def canonicalize_any(any_smiles: str, check_valence: bool = True) -> str: Args: any_smiles: any kind of SMILES string. check_valence: if False, will not do any valence check. 
+ sort_molecules: whether to sort the compounds alphabetically at the same time. + fallback_value: what value to return when the canonicalization is unsuccessful. + Default: no fallback, will propagate the exception. Raises: Exception: different kinds of exception may be raised during parsing. @@ -197,8 +215,41 @@ def canonicalize_any(any_smiles: str, check_valence: bool = True) -> str: Returns: the canonical (molecule, multicomponent, or reaction) SMILES string. """ - fn = partial(canonicalize_smiles, check_valence=check_valence) - return apply_to_any_smiles(any_smiles, fn) + try: + fn = partial(canonicalize_smiles, check_valence=check_valence) + canonical_smiles = apply_to_any_smiles(any_smiles, fn) + if sort_molecules: + canonical_smiles = sort_any(canonical_smiles) + return canonical_smiles + except Exception as e: + if fallback_value is not None: + logger.debug(f'Error when canonicalizing "{any_smiles}": {e}') + return fallback_value + raise + + +def canonicalize_file( + input_file: PathLike, + output_file: PathLike, + check_valence: bool = True, + fallback_value: str = "", + sort_molecules: bool = False, +) -> None: + raise_if_paths_are_identical(input_file, output_file) + logger.info(f'Canonicalizing file "{input_file}" -> "{output_file}".') + + # We formulate it as a generator, so that the file below is written directly + canonical = ( + canonicalize_any( + line, + check_valence=check_valence, + fallback_value=fallback_value, + sort_molecules=sort_molecules, + ) + for line in iterate_lines_from_file(input_file) + ) + + dump_list_to_file(canonical, output_file) def sort_any(any_smiles: str) -> str: diff --git a/src/rxn/chemutils/scripts/canonicalize.py b/src/rxn/chemutils/scripts/canonicalize.py index b5b2734..a84e4e2 100644 --- a/src/rxn/chemutils/scripts/canonicalize.py +++ b/src/rxn/chemutils/scripts/canonicalize.py @@ -2,6 +2,7 @@ from typing import Optional, TextIO import click +from rxn.utilities.logging import setup_console_logger from 
rxn.chemutils.miscellaneous import canonicalize_any @@ -27,6 +28,7 @@ def main( first argument, and write to stdout, or from a file given as the second argument. """ + setup_console_logger() for line in input_file: smiles = line.strip() diff --git a/src/rxn/chemutils/scripts/tokenize.py b/src/rxn/chemutils/scripts/tokenize.py index 2d0370a..6a0947e 100644 --- a/src/rxn/chemutils/scripts/tokenize.py +++ b/src/rxn/chemutils/scripts/tokenize.py @@ -2,6 +2,7 @@ from typing import Optional, TextIO import click +from rxn.utilities.logging import setup_console_logger from rxn.chemutils.tokenization import tokenize_smiles @@ -24,6 +25,7 @@ def main( first argument, and write to stdout, or from a file given as the second argument. """ + setup_console_logger() for line in input_file: smiles = line.strip() diff --git a/tests/test_miscellaneous.py b/tests/test_miscellaneous.py index e7cefd0..3a31795 100644 --- a/tests/test_miscellaneous.py +++ b/tests/test_miscellaneous.py @@ -183,6 +183,25 @@ def test_canonicalize_any_on_reaction_smiles() -> None: _ = canonicalize_any("CC>>CC>>C(O)") +def test_canonicalize_any_with_fallback_value() -> None: + fallback = "some_fallback_value" + + # No error -> fallback value not used + assert canonicalize_any("C(C)C", fallback_value=fallback) == "CCC" + assert canonicalize_any("CO>>C(C)C", fallback_value=fallback) == "CO>>CCC" + + # Error -> returns the fallback value + assert canonicalize_any("CoMo", fallback_value=fallback) == fallback + assert canonicalize_any("invalid>>C(C)C", fallback_value=fallback) == fallback + + +def test_canonicalize_any_with_sorting() -> None: + # Note: the sorting in `N~OO` comes from the RDKit canonicalization of that compound. 
+ assert canonicalize_any("C(C).OO~N", sort_molecules=True) == "CC.N~OO" + assert canonicalize_any("OO~N.C(C)", sort_molecules=True) == "CC.N~OO" + assert canonicalize_any("OO~N.C(C)>>N.C", sort_molecules=True) == "CC.N~OO>>C.N" + + def test_sort_any() -> None: # Single-component SMILES assert sort_any("A.C.C.B") == "A.B.C.C"