Support for polymer SMILES added

- Support encoding/decoding SMILES with the wildcard symbol ``*``.
- Run ``python test_polysf.py`` to test.
aspuru-guzik-group · Oct 2, 2023 · 1b4b843 · 1b4b843
1 parent 120b776
commit 1b4b843
Show file tree

Hide file tree

Showing 8 changed files with 72 additions and 9 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/selfies/constants.py b/selfies/constants.py
@@ -9,7 +9,7 @@
     "Ds", "Rg", "Cn", "Fl", "Lv", "La", "Ce", "Pr", "Nd", "Pm", "Sm",
     "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Ac", "Th",
     "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md",
-    "No", "Lr"
+    "No", "Lr", "*", "[*]"
 }
 
 ORGANIC_SUBSET = {"B", "C", "N", "O", "S", "P", "F", "Cl", "Br", "I"}

diff --git a/selfies/decoder.py b/selfies/decoder.py
@@ -154,9 +154,19 @@ def _derive_mol_from_symbols(
         elif "eps" in symbol:
             next_state = 0 if (state == 0) else None
 
-        # Case 4: regular symbol (e.g. [N], [=C], [F])
-        else:
+        # Case 4: [*]
+        elif symbol == "*":
+            atom = mol.add_wildcard_atom()  # add wildcard atom
+            mol.add_attribution(
+                atom, attribute_stack +
+                [Attribution(index + attribution_index, symbol)]
+                if attribute_stack is not None else None
+            )
+            prev_atom = atom
+            next_state = 0
 
+        # Case 5: regular symbol (e.g. [N], [=C], [F])
+        else:
             output = process_atom_symbol(symbol)
             if output is None:
                 _raise_decoder_error(selfies, symbol)
@@ -185,6 +195,7 @@ def _derive_mol_from_symbols(
                     [Attribution(index + attribution_index, symbol)]
                     if attribute_stack is not None else None)
             prev_atom = atom
+
 
         if next_state is None:
             break

diff --git a/selfies/encoder.py b/selfies/encoder.py
@@ -47,7 +47,7 @@ def encoder(smiles: str, strict: bool = True, attribute: bool = False) -> str:
     >>> import selfies as sf
     >>> sf.encoder("C=CF")
     '[C][=C][F]'
-
+
     .. note:: This function does not currently support SMILES with:
 
         *   The wildcard symbol ``*``.
@@ -240,3 +240,4 @@ def _atom_to_selfies(bond, atom):
     assert not atom.is_aromatic
     bond_char = "" if (bond is None) else _bond_to_selfies(bond)
     return "[{}{}]".format(bond_char, atom_to_smiles(atom, brackets=False))
+
diff --git a/selfies/grammar_rules.py b/selfies/grammar_rules.py
@@ -107,14 +107,15 @@ def get_selfies_from_index(index: int) -> List[str]:
     r"^[\[]"  # opening square bracket [
     r"([=#/\\]?)"  # bond char
     r"(\d*)"  # isotope number (optional, e.g. 123, 26)
-    r"([A-Z][a-z]?)"  # element symbol
+    r"([A-Z][a-z]?|\*)"  # element symbol or wildcard
     r"([@]{0,2})"  # chiral_tag (optional, only @ and @@ supported)
     r"((?:[H]\d)?)"  # H count (optional, e.g. H1, H3)
     r"((?:[+-][1-9]+)?)"  # charge (optional, e.g. +1)
     r"[]]$"  # closing square bracket ]
 )
 
 
+
 def _process_atom_selfies_no_cache(symbol):
     m = SELFIES_ATOM_PATTERN.match(symbol)
     if m is None:

diff --git a/selfies/mol_graph.py b/selfies/mol_graph.py
@@ -42,7 +42,8 @@ def __init__(
             isotope: Optional[int] = None,
             chirality: Optional[str] = None,
             h_count: Optional[int] = None,
-            charge: int = 0
+            charge: int = 0,
+            is_wildcard: bool = False
     ):
         self.index = None
         self.element = element
@@ -51,7 +52,8 @@ def __init__(
         self.chirality = chirality
         self.h_count = h_count
         self.charge = charge
-
+        self.is_wildcard = is_wildcard
+
     @property
     @functools.lru_cache()
     def bonding_capacity(self):
@@ -142,6 +144,8 @@ def get_out_dirbonds(self, src: int) -> List[DirectedBond]:
     def get_bond_count(self, idx: int) -> int:
         return self._bond_counts[idx]
 
+
+
     def add_atom(self, atom: Atom, mark_root: bool = False) -> Atom:
         atom.index = len(self)
 
@@ -154,6 +158,14 @@ def add_atom(self, atom: Atom, mark_root: bool = False) -> Atom:
         if atom.is_aromatic:
             self._delocal_subgraph[atom.index] = list()
         return atom
+
+
+    def add_wildcard_atom(self, mark_root: bool = False) -> Atom:
+        """Creates a wildcard (``*``) atom and adds it to this graph.
+
+        :param mark_root: forwarded unchanged to ``add_atom``.
+        :return: the atom returned by ``add_atom``.
+        """
+        # The wildcard is built as a non-aromatic atom with zero hydrogens
+        # and zero charge; is_wildcard=True marks it as a wildcard.
+        return self.add_atom(
+            Atom(element='*', is_aromatic=False, is_wildcard=True,
+                 h_count=0, charge=0),
+            mark_root
+        )
+
 
     def add_attribution(
             self,

diff --git a/selfies/utils/smiles_utils.py b/selfies/utils/smiles_utils.py
@@ -66,6 +66,11 @@ def tokenize_smiles(smiles: str) -> Iterator[SMILESToken]:
 
     i = 0
     while i < len(smiles):
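+        # Added to handle the wildcard symbol *.
+        # NOTE(review): the "[*]" case below still advances by one character only - confirm.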
+        if smiles[i] == "*" or smiles[i:i+3] == "[*]":
+            yield SMILESToken(None, i, i + 1, SMILESTokenTypes.ATOM, "*")
+            i += 1
+            continue
 
         if smiles[i] == ".":
             yield SMILESToken(None, i, i + 1, SMILESTokenTypes.DOT, smiles[i])
@@ -127,12 +132,16 @@ def tokenize_smiles(smiles: str) -> Iterator[SMILESToken]:
 # =============================================================================
 
 
+
+
 def smiles_to_atom(atom_symbol: str) -> Optional[Atom]:
     """Reads an atom from its SMILES representation.
 
     :param atom_symbol: a SMILES atom symbol.
     :return: the atom that the input symbol represents.
     """
+    if atom_symbol == "*":
+        return Atom("*", False)
 
     if atom_symbol[0] == "[" and atom_symbol[-1] == "]":
         pass  # continue below
@@ -183,6 +192,7 @@ def smiles_to_atom(atom_symbol: str) -> Optional[Atom]:
     )
 
 
+
 def smiles_to_bond(
         bond_char: Optional[str]
 ) -> Tuple[Union[int, float], Optional[str]]:
@@ -358,6 +368,8 @@ def atom_to_smiles(atom: Atom, brackets: bool = True) -> str:
     :return: a SMILES symbol representing the input atom.
     """
     assert not atom.is_aromatic
+    if atom.element == '*':
+        return '*'
 
     specs = (atom.isotope, atom.chirality, atom.h_count, atom.charge)
     if specs == (None, None, None, 0):
@@ -443,12 +455,17 @@ def _derive_smiles_from_fragment(
         ring_log,
         attribution_maps, attribution_index=0):
     curr_atom, curr = mol.get_atom(root), root
-    token = atom_to_smiles(curr_atom)
+
+    if curr_atom.is_wildcard:  # decide using the newly added is_wildcard attribute
+        token = "*"
+    else:
+        token = atom_to_smiles(curr_atom)
+
+    # token = atom_to_smiles(curr_atom)
     derived.append(token)
     attribution_maps.append(AttributionMap(
         _strlen(derived) - 1 + attribution_index,
         token, mol.get_attribution(curr_atom)))
-
     out_bonds = mol.get_out_dirbonds(curr)
     for i, bond in enumerate(out_bonds):
         if bond.ring_bond:

diff --git a/test_polysf.py b/test_polysf.py
@@ -0,0 +1,21 @@
+import selfies as sf
+from rdkit import Chem
+
+polymer_smiles = [
+    '*CC(*)(C)C',
+    'C1=C(SC(=C1)[*])[*]',
+    'CCCCC1=C(SC(=C1)[*])[*]',
+    'CCCCCCC1=C(SC(=C1)[*])[*]',
+    'CCCCCCCCC1=C(SC(=C1)[*])[*]',
+    'C1(=CC(=C(C=C1C=C[*])OC)[*])OCC(CC)CCCC',
+]
+
+# Round-trip every polymer SMILES through SELFIES. Both the input and the
+# decoded result are rewritten by RDKit before the strings are compared.
+for smiles in polymer_smiles:
+    ori_smi = Chem.MolToSmiles(Chem.MolFromSmiles(smiles))
+    encoded = sf.encoder(ori_smi)
+    # Pass the decoded SMILES through RDKit as well before comparing.
+    de_smi = Chem.MolToSmiles(Chem.MolFromSmiles(sf.decoder(encoded)))
+    print('polymer smiles:', ori_smi, 'selfies:', encoded,
+          'decode selfies:', de_smi, 'equal?:', ori_smi == de_smi)