From 1b4b8434c5906265b63719c03055d79fdb222838 Mon Sep 17 00:00:00 2001 From: "Weilin.Yuan" Date: Mon, 2 Oct 2023 15:04:51 +0900 Subject: [PATCH] Support for polymer SMILES added - Support encoding/decoding SMILES with wildcard symbol ``*`` - Run python test_polysf.py to test --- .DS_Store | Bin 0 -> 6148 bytes selfies/constants.py | 2 +- selfies/decoder.py | 15 +++++++++++++-- selfies/encoder.py | 3 ++- selfies/grammar_rules.py | 3 ++- selfies/mol_graph.py | 16 ++++++++++++++-- selfies/utils/smiles_utils.py | 21 +++++++++++++++++++-- test_polysf.py | 21 +++++++++++++++++++++ 8 files changed, 72 insertions(+), 9 deletions(-) create mode 100644 .DS_Store create mode 100644 test_polysf.py diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 str: >>> import selfies as sf >>> sf.encoder("C=CF") '[C][=C][F]' - +atom .. note:: This function does not currently support SMILES with: * The wildcard symbol ``*``. @@ -240,3 +240,4 @@ def _atom_to_selfies(bond, atom): assert not atom.is_aromatic bond_char = "" if (bond is None) else _bond_to_selfies(bond) return "[{}{}]".format(bond_char, atom_to_smiles(atom, brackets=False)) + diff --git a/selfies/grammar_rules.py b/selfies/grammar_rules.py index 9cd354e4..f837e7b9 100644 --- a/selfies/grammar_rules.py +++ b/selfies/grammar_rules.py @@ -107,7 +107,7 @@ def get_selfies_from_index(index: int) -> List[str]: r"^[\[]" # opening square bracket [ r"([=#/\\]?)" # bond char r"(\d*)" # isotope number (optional, e.g. 123, 26) - r"([A-Z][a-z]?)" # element symbol + r"([A-Z][a-z]?|\*)" # element symbol or wildcard r"([@]{0,2})" # chiral_tag (optional, only @ and @@ supported) r"((?:[H]\d)?)" # H count (optional, e.g. H1, H3) r"((?:[+-][1-9]+)?)" # charge (optional, e.g. +1) @@ -115,6 +115,7 @@ def get_selfies_from_index(index: int) -> List[str]: ) + def _process_atom_selfies_no_cache(symbol): m = SELFIES_ATOM_PATTERN.match(symbol) if m is None: diff --git a/selfies/mol_graph.py b/selfies/mol_graph.py index b4913336..6e70dd2d 100644 --- a/selfies/mol_graph.py +++ b/selfies/mol_graph.py @@ -42,7 +42,8 @@ def __init__( isotope: Optional[int] = None, chirality: Optional[str] = None, h_count: Optional[int] = None, - charge: int = 0 + charge: int = 0, + is_wildcard: bool = False ): self.index = None self.element = element @@ -51,7 +52,8 @@ def __init__( self.chirality = chirality self.h_count = h_count self.charge = charge - + self.is_wildcard = is_wildcard + @property @functools.lru_cache() def bonding_capacity(self): @@ -142,6 +144,8 @@ def get_out_dirbonds(self, src: int) -> List[DirectedBond]: def get_bond_count(self, idx: int) -> int: return self._bond_counts[idx] + + def add_atom(self, atom: Atom, mark_root: bool = False) -> Atom: atom.index = len(self) @@ -154,6 +158,14 @@ def add_atom(self, atom: Atom, mark_root: bool = False) -> Atom: if atom.is_aromatic: self._delocal_subgraph[atom.index] = list() return atom + + + def add_wildcard_atom(self, mark_root: bool = False) -> Atom: + wildcard_atom = Atom(element='*', is_aromatic=False, is_wildcard=True, h_count=0, charge=0) # add is_wildcard=True + added_atom = self.add_atom(wildcard_atom, mark_root) + return added_atom + + def add_attribution( self, diff --git a/selfies/utils/smiles_utils.py b/selfies/utils/smiles_utils.py index bd514c2e..e056376f 100644 --- a/selfies/utils/smiles_utils.py +++ b/selfies/utils/smiles_utils.py @@ -66,6 +66,11 @@ def tokenize_smiles(smiles: str) -> Iterator[SMILESToken]: i = 0 while i < len(smiles): + # 添加这部分来处理 * 符号 + if smiles[i] == "*" or smiles[i:i+3] == "[*]": + yield SMILESToken(None, i, i + 1, SMILESTokenTypes.ATOM, "*") + i += 1 + continue if smiles[i] == ".": yield SMILESToken(None, i, i + 1, SMILESTokenTypes.DOT, smiles[i]) @@ -127,12 +132,16 @@ def tokenize_smiles(smiles: str) -> Iterator[SMILESToken]: # ============================================================================= + + def smiles_to_atom(atom_symbol: str) -> Optional[Atom]: """Reads an atom from its SMILES representation. :param atom_symbol: a SMILES atom symbol. :return: the atom that the input symbol represents. """ + if atom_symbol == "*": + return Atom("*", False) if atom_symbol[0] == "[" and atom_symbol[-1] == "]": pass # continue below @@ -183,6 +192,7 @@ def smiles_to_atom(atom_symbol: str) -> Optional[Atom]: ) + def smiles_to_bond( bond_char: Optional[str] ) -> Tuple[Union[int, float], Optional[str]]: @@ -358,6 +368,8 @@ def atom_to_smiles(atom: Atom, brackets: bool = True) -> str: :return: a SMILES symbol representing the input atom. """ assert not atom.is_aromatic + if atom.element == '*': + return '*' specs = (atom.isotope, atom.chirality, atom.h_count, atom.charge) if specs == (None, None, None, 0): @@ -443,12 +455,17 @@ def _derive_smiles_from_fragment( ring_log, attribution_maps, attribution_index=0): curr_atom, curr = mol.get_atom(root), root - token = atom_to_smiles(curr_atom) + + if curr_atom.is_wildcard: # 使用新增属性进行判断 + token = "*" + else: + token = atom_to_smiles(curr_atom) + + # token = atom_to_smiles(curr_atom) derived.append(token) attribution_maps.append(AttributionMap( _strlen(derived) - 1 + attribution_index, token, mol.get_attribution(curr_atom))) - out_bonds = mol.get_out_dirbonds(curr) for i, bond in enumerate(out_bonds): if bond.ring_bond: diff --git a/test_polysf.py b/test_polysf.py new file mode 100644 index 00000000..22f1124e --- /dev/null +++ b/test_polysf.py @@ -0,0 +1,21 @@ +import selfies as sf +from rdkit import Chem + +polymer_smiles = ['*CC(*)(C)C', + 'C1=C(SC(=C1)[*])[*]', + 'CCCCC1=C(SC(=C1)[*])[*]', + 'CCCCCCC1=C(SC(=C1)[*])[*]', + 'CCCCCCCCC1=C(SC(=C1)[*])[*]', + 'C1(=CC(=C(C=C1C=C[*])OC)[*])OCC(CC)CCCC' + ] + +for i in polymer_smiles: + mol = Chem.MolFromSmiles(i) + ori_smi = Chem.MolToSmiles(mol) + selfies = sf.encoder(ori_smi) + de_smi = sf.decoder(selfies) + de_smi = Chem.MolToSmiles(Chem.MolFromSmiles(de_smi)) + print('polymer smiles:', ori_smi, 'selfies:', selfies, 'decode selfies:', de_smi, 'equal?:', ori_smi == de_smi) + + +