From 0c6d4d3d0bfa0955d0328fd2fc18aa0071a3369c Mon Sep 17 00:00:00 2001
From: Alain Vaucher
Date: Thu, 5 Jan 2023 11:19:23 +0100
Subject: [PATCH] Add function to get list of individual compounds from any SMILES string (#16)

---
 src/rxn/chemutils/miscellaneous.py | 33 +++++++++++++++++++++++++++++-
 tests/test_miscellaneous.py        | 26 +++++++++++++++++++++++
 2 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/src/rxn/chemutils/miscellaneous.py b/src/rxn/chemutils/miscellaneous.py
index 2015a9a..b398f7f 100644
--- a/src/rxn/chemutils/miscellaneous.py
+++ b/src/rxn/chemutils/miscellaneous.py
@@ -19,7 +19,12 @@
     apply_to_compounds,
     sort_compounds,
 )
-from .reaction_smiles import determine_format, parse_reaction_smiles, to_reaction_smiles
+from .reaction_smiles import (
+    determine_format,
+    parse_any_reaction_smiles,
+    parse_reaction_smiles,
+    to_reaction_smiles,
+)
 
 CHIRAL_CENTER_PATTERN = re.compile(
     r"\[([^],@]+)@+([^]]*)]"
@@ -220,3 +225,29 @@ def sort_any(any_smiles: str) -> str:
     else:
         # we call the same function for single- and multi-component SMILES
         return sort_multicomponent_smiles(any_smiles)
+
+
+def get_individual_compounds(any_smiles: str) -> List[str]:
+    """
+    Get the individual compound SMILES strings starting from any SMILES string
+    (multicomponent SMILES, reaction SMILES).
+
+    Single-component SMILES with dots are interpreted as multicomponent SMILES strings.
+
+    Args:
+        any_smiles: any kind of SMILES string.
+
+    Raises:
+        Exception: different kinds of exception may be raised during parsing.
+
+    Returns:
+        List of individual compound SMILES.
+    """
+    if ">" in any_smiles:
+        # We have a reaction SMILES
+        reaction = parse_any_reaction_smiles(any_smiles)
+        return list(reaction.iter_all_smiles())
+    else:
+        # We interpret it as a multicomponent SMILES.
+        # We use "~" as a fragment bond even if it is not actually needed.
+        return multicomponent_smiles_to_list(any_smiles, fragment_bond="~")
diff --git a/tests/test_miscellaneous.py b/tests/test_miscellaneous.py
index 01706f7..2b3f60d 100644
--- a/tests/test_miscellaneous.py
+++ b/tests/test_miscellaneous.py
@@ -10,6 +10,7 @@
     atom_type_counter,
     canonicalize_any,
     equivalent_smiles,
+    get_individual_compounds,
     is_valid_smiles,
     remove_chiral_centers,
     remove_double_bond_stereochemistry,
@@ -192,3 +193,28 @@ def test_sort_any() -> None:
     # reaction SMILES
     assert sort_any("B.A.E~D.A>>C.B") == "A.A.B.E~D>>B.C"
     assert sort_any("B.A.E.D.A>>C.B |f:2.3|") == "A.A.B.E.D>>B.C |f:3.4|"
+
+
+def test_get_individual_compounds() -> None:
+    # Single-component SMILES and/or multi-component SMILES
+    assert get_individual_compounds("A.C.C.B") == ["A", "C", "C", "B"]
+    assert get_individual_compounds("A.D~C.B") == ["A", "D.C", "B"]
+    assert get_individual_compounds("CBA") == ["CBA"]
+
+    # reaction SMILES
+    assert get_individual_compounds("B.A.E~D.A>>C.B") == [
+        "B",
+        "A",
+        "E.D",
+        "A",
+        "C",
+        "B",
+    ]
+    assert get_individual_compounds("B.A.E.D.A>>C.B |f:2.3|") == [
+        "B",
+        "A",
+        "A",
+        "E.D",  # Note that it is located elsewhere, due to how extended SMILES are parsed
+        "C",
+        "B",
+    ]