Skip to content

Commit

Permalink
Make canonicalize_any more flexible (#21)
Browse files Browse the repository at this point in the history
  • Loading branch information
avaucher authored Feb 10, 2023
1 parent 868014b commit c8cea05
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 3 deletions.
57 changes: 54 additions & 3 deletions src/rxn/chemutils/miscellaneous.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
import logging
import re
import typing
from collections import Counter
from functools import partial
from typing import Callable, List

from rdkit.Chem import AddHs, Atom, Mol
from rxn.utilities.files import (
PathLike,
dump_list_to_file,
iterate_lines_from_file,
raise_if_paths_are_identical,
)

from .conversion import canonicalize_smiles, smiles_to_mol
from .exceptions import InvalidSmiles
Expand All @@ -27,6 +34,9 @@
to_reaction_smiles,
)

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())

CHIRAL_CENTER_PATTERN = re.compile(
r"\[([^],@]+)@+([^]]*)]"
) # Matches stereo centres, and groups what comes before and after "@"
Expand Down Expand Up @@ -180,7 +190,12 @@ def apply_to_smiles_groups(
return list_to_multicomponent_smiles(fn(compounds), fragment_bond="~")


def canonicalize_any(any_smiles: str, check_valence: bool = True) -> str:
def canonicalize_any(
any_smiles: str,
check_valence: bool = True,
sort_molecules: bool = False,
fallback_value: typing.Optional[str] = None,
) -> str:
"""
Canonicalize any SMILES string (molecule SMILES, multicomponent SMILES, reaction SMILES).
Expand All @@ -189,6 +204,9 @@ def canonicalize_any(any_smiles: str, check_valence: bool = True) -> str:
Args:
any_smiles: any kind of SMILES string.
check_valence: if False, will not do any valence check.
sort_molecules: whether to sort the compounds alphabetically at the same time.
fallback_value: what value to return when the canonicalization is unsuccessful.
Default: no fallback, will propagate the exception.
Raises:
Exception: different kinds of exception may be raised during parsing.
Expand All @@ -197,8 +215,41 @@ def canonicalize_any(any_smiles: str, check_valence: bool = True) -> str:
Returns:
the canonical (molecule, multicomponent, or reaction) SMILES string.
"""
fn = partial(canonicalize_smiles, check_valence=check_valence)
return apply_to_any_smiles(any_smiles, fn)
try:
fn = partial(canonicalize_smiles, check_valence=check_valence)
canonical_smiles = apply_to_any_smiles(any_smiles, fn)
if sort_molecules:
canonical_smiles = sort_any(canonical_smiles)
return canonical_smiles
except Exception as e:
if fallback_value is not None:
logger.debug(f'Error when canonicalizing "{any_smiles}": {e}')
return fallback_value
raise


def canonicalize_file(
    input_file: PathLike,
    output_file: PathLike,
    check_valence: bool = True,
    fallback_value: str = "",
    sort_molecules: bool = False,
) -> None:
    """
    Canonicalize the lines of a file and write the results to another file.

    Every line obtained from ``input_file`` is passed through
    ``canonicalize_any`` and the results are written to ``output_file``.

    Args:
        input_file: file to read the SMILES strings from.
        output_file: file to write the canonicalized strings to. It is checked
            against ``input_file`` with ``raise_if_paths_are_identical``.
        check_valence: forwarded to ``canonicalize_any``.
        fallback_value: forwarded to ``canonicalize_any``; value written for
            lines that cannot be canonicalized. Defaults to an empty string.
        sort_molecules: forwarded to ``canonicalize_any``.
    """
    raise_if_paths_are_identical(input_file, output_file)
    logger.info(f'Canonicalizing file "{input_file}" -> "{output_file}".')

    # Bind the options once, so that each line only needs a one-argument call.
    canonicalize_line = partial(
        canonicalize_any,
        check_valence=check_valence,
        fallback_value=fallback_value,
        sort_molecules=sort_molecules,
    )

    # map() is lazy: lines are canonicalized one at a time while the output
    # is being written, instead of being collected in memory first.
    canonical = map(canonicalize_line, iterate_lines_from_file(input_file))

    dump_list_to_file(canonical, output_file)


def sort_any(any_smiles: str) -> str:
Expand Down
2 changes: 2 additions & 0 deletions src/rxn/chemutils/scripts/canonicalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Optional, TextIO

import click
from rxn.utilities.logging import setup_console_logger

from rxn.chemutils.miscellaneous import canonicalize_any

Expand All @@ -27,6 +28,7 @@ def main(
first argument, and write to stdout, or from a file given as the second
argument.
"""
setup_console_logger()

for line in input_file:
smiles = line.strip()
Expand Down
2 changes: 2 additions & 0 deletions src/rxn/chemutils/scripts/tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Optional, TextIO

import click
from rxn.utilities.logging import setup_console_logger

from rxn.chemutils.tokenization import tokenize_smiles

Expand All @@ -24,6 +25,7 @@ def main(
first argument, and write to stdout, or from a file given as the second
argument.
"""
setup_console_logger()

for line in input_file:
smiles = line.strip()
Expand Down
19 changes: 19 additions & 0 deletions tests/test_miscellaneous.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,25 @@ def test_canonicalize_any_on_reaction_smiles() -> None:
_ = canonicalize_any("CC>>CC>>C(O)")


def test_canonicalize_any_with_fallback_value() -> None:
    """Check when canonicalize_any returns the fallback value and when not."""
    fallback = "some_fallback_value"

    # Valid inputs are canonicalized as usual; the fallback value is ignored.
    valid_cases = [
        ("C(C)C", "CCC"),
        ("CO>>C(C)C", "CO>>CCC"),
    ]
    for smiles, expected in valid_cases:
        assert canonicalize_any(smiles, fallback_value=fallback) == expected

    # Inputs that fail to canonicalize give the fallback value.
    for smiles in ("CoMo", "invalid>>C(C)C"):
        assert canonicalize_any(smiles, fallback_value=fallback) == fallback


def test_canonicalize_any_with_sorting() -> None:
    """Check canonicalize_any with sort_molecules=True on several inputs."""
    # Note: the sorting in `N~OO` comes from the RDKit canonicalization of that compound.
    cases = [
        ("C(C).OO~N", "CC.N~OO"),
        ("OO~N.C(C)", "CC.N~OO"),
        ("OO~N.C(C)>>N.C", "CC.N~OO>>C.N"),
    ]
    for smiles, expected in cases:
        assert canonicalize_any(smiles, sort_molecules=True) == expected


def test_sort_any() -> None:
# Single-component SMILES
assert sort_any("A.C.C.B") == "A.B.C.C"
Expand Down

0 comments on commit c8cea05

Please sign in to comment.