Skip to content

Commit

Permalink
Support for polymer SMILES added
Browse files Browse the repository at this point in the history
- Support encoding/decoding SMILES with wildcard symbol ``*``
- Run python test_polysf.py to test
  • Loading branch information
onecoinbuybus committed Oct 2, 2023
1 parent 120b776 commit 1b4b843
Show file tree
Hide file tree
Showing 8 changed files with 72 additions and 9 deletions.
Binary file added .DS_Store
Binary file not shown.
2 changes: 1 addition & 1 deletion selfies/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"Ds", "Rg", "Cn", "Fl", "Lv", "La", "Ce", "Pr", "Nd", "Pm", "Sm",
"Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Ac", "Th",
"Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md",
"No", "Lr"
"No", "Lr", "*", "[*]"
}

ORGANIC_SUBSET = {"B", "C", "N", "O", "S", "P", "F", "Cl", "Br", "I"}
Expand Down
15 changes: 13 additions & 2 deletions selfies/decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,9 +154,19 @@ def _derive_mol_from_symbols(
elif "eps" in symbol:
next_state = 0 if (state == 0) else None

# Case 4: regular symbol (e.g. [N], [=C], [F])
else:
# Case 4: [*]
elif symbol == "*":
atom = mol.add_wildcard_atom() # add wildcard atom
mol.add_attribution(
atom, attribute_stack +
[Attribution(index + attribution_index, symbol)]
if attribute_stack is not None else None
)
prev_atom = atom
next_state = 0

# Case 5: regular symbol (e.g. [N], [=C], [F])
else:
output = process_atom_symbol(symbol)
if output is None:
_raise_decoder_error(selfies, symbol)
Expand Down Expand Up @@ -185,6 +195,7 @@ def _derive_mol_from_symbols(
[Attribution(index + attribution_index, symbol)]
if attribute_stack is not None else None)
prev_atom = atom


if next_state is None:
break
Expand Down
3 changes: 2 additions & 1 deletion selfies/encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def encoder(smiles: str, strict: bool = True, attribute: bool = False) -> str:
>>> import selfies as sf
>>> sf.encoder("C=CF")
'[C][=C][F]'
atom
.. note:: This function does not currently support SMILES with:
* The wildcard symbol ``*``.
Expand Down Expand Up @@ -240,3 +240,4 @@ def _atom_to_selfies(bond, atom):
assert not atom.is_aromatic
bond_char = "" if (bond is None) else _bond_to_selfies(bond)
return "[{}{}]".format(bond_char, atom_to_smiles(atom, brackets=False))

3 changes: 2 additions & 1 deletion selfies/grammar_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,14 +107,15 @@ def get_selfies_from_index(index: int) -> List[str]:
r"^[\[]" # opening square bracket [
r"([=#/\\]?)" # bond char
r"(\d*)" # isotope number (optional, e.g. 123, 26)
r"([A-Z][a-z]?)" # element symbol
r"([A-Z][a-z]?|\*)" # element symbol or wildcard
r"([@]{0,2})" # chiral_tag (optional, only @ and @@ supported)
r"((?:[H]\d)?)" # H count (optional, e.g. H1, H3)
r"((?:[+-][1-9]+)?)" # charge (optional, e.g. +1)
r"[]]$" # closing square bracket ]
)



def _process_atom_selfies_no_cache(symbol):
m = SELFIES_ATOM_PATTERN.match(symbol)
if m is None:
Expand Down
16 changes: 14 additions & 2 deletions selfies/mol_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ def __init__(
isotope: Optional[int] = None,
chirality: Optional[str] = None,
h_count: Optional[int] = None,
charge: int = 0
charge: int = 0,
is_wildcard: bool = False
):
self.index = None
self.element = element
Expand All @@ -51,7 +52,8 @@ def __init__(
self.chirality = chirality
self.h_count = h_count
self.charge = charge

self.is_wildcard = is_wildcard

@property
@functools.lru_cache()
def bonding_capacity(self):
Expand Down Expand Up @@ -142,6 +144,8 @@ def get_out_dirbonds(self, src: int) -> List[DirectedBond]:
def get_bond_count(self, idx: int) -> int:
return self._bond_counts[idx]



def add_atom(self, atom: Atom, mark_root: bool = False) -> Atom:
atom.index = len(self)

Expand All @@ -154,6 +158,14 @@ def add_atom(self, atom: Atom, mark_root: bool = False) -> Atom:
if atom.is_aromatic:
self._delocal_subgraph[atom.index] = list()
return atom


def add_wildcard_atom(self, mark_root: bool = False) -> Atom:
    """Builds a wildcard (``*``) atom and adds it to this graph.

    The atom is created with element ``'*'``, flagged as non-aromatic and
    as a wildcard, with an H count of 0 and a charge of 0.

    :param mark_root: passed through unchanged to ``add_atom``.
    :return: the value returned by ``add_atom`` for the new atom.
    """
    return self.add_atom(
        Atom(
            element='*',
            is_aromatic=False,
            is_wildcard=True,
            h_count=0,
            charge=0,
        ),
        mark_root,
    )



def add_attribution(
self,
Expand Down
21 changes: 19 additions & 2 deletions selfies/utils/smiles_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,11 @@ def tokenize_smiles(smiles: str) -> Iterator[SMILESToken]:

i = 0
while i < len(smiles):
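# Added to handle the wildcard symbol *.
# NOTE(review): for the "[*]" form the token below still spans one character and i advances by 1, not 3 - confirm this is intended.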
if smiles[i] == "*" or smiles[i:i+3] == "[*]":
yield SMILESToken(None, i, i + 1, SMILESTokenTypes.ATOM, "*")
i += 1
continue

if smiles[i] == ".":
yield SMILESToken(None, i, i + 1, SMILESTokenTypes.DOT, smiles[i])
Expand Down Expand Up @@ -127,12 +132,16 @@ def tokenize_smiles(smiles: str) -> Iterator[SMILESToken]:
# =============================================================================




def smiles_to_atom(atom_symbol: str) -> Optional[Atom]:
"""Reads an atom from its SMILES representation.
:param atom_symbol: a SMILES atom symbol.
:return: the atom that the input symbol represents.
"""
if atom_symbol == "*":
return Atom("*", False)

if atom_symbol[0] == "[" and atom_symbol[-1] == "]":
pass # continue below
Expand Down Expand Up @@ -183,6 +192,7 @@ def smiles_to_atom(atom_symbol: str) -> Optional[Atom]:
)



def smiles_to_bond(
bond_char: Optional[str]
) -> Tuple[Union[int, float], Optional[str]]:
Expand Down Expand Up @@ -358,6 +368,8 @@ def atom_to_smiles(atom: Atom, brackets: bool = True) -> str:
:return: a SMILES symbol representing the input atom.
"""
assert not atom.is_aromatic
if atom.element == '*':
return '*'

specs = (atom.isotope, atom.chirality, atom.h_count, atom.charge)
if specs == (None, None, None, 0):
Expand Down Expand Up @@ -443,12 +455,17 @@ def _derive_smiles_from_fragment(
ring_log,
attribution_maps, attribution_index=0):
curr_atom, curr = mol.get_atom(root), root
token = atom_to_smiles(curr_atom)

if curr_atom.is_wildcard: # decide using the newly added is_wildcard attribute
token = "*"
else:
token = atom_to_smiles(curr_atom)

# token = atom_to_smiles(curr_atom)
derived.append(token)
attribution_maps.append(AttributionMap(
_strlen(derived) - 1 + attribution_index,
token, mol.get_attribution(curr_atom)))

out_bonds = mol.get_out_dirbonds(curr)
for i, bond in enumerate(out_bonds):
if bond.ring_bond:
Expand Down
21 changes: 21 additions & 0 deletions test_polysf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import selfies as sf
from rdkit import Chem

# Polymer repeat units used for the round-trip check; the wildcard atoms
# (written either as ``*`` or ``[*]``) appear in every entry.
polymer_smiles = [
    '*CC(*)(C)C',
    'C1=C(SC(=C1)[*])[*]',
    'CCCCC1=C(SC(=C1)[*])[*]',
    'CCCCCCC1=C(SC(=C1)[*])[*]',
    'CCCCCCCCC1=C(SC(=C1)[*])[*]',
    'C1(=CC(=C(C=C1C=C[*])OC)[*])OCC(CC)CCCC',
]

# Round-trip every SMILES through SELFIES (encode, then decode) and print
# whether the RDKit-canonical form before and after is the same string.
for raw_smiles in polymer_smiles:
    mol = Chem.MolFromSmiles(raw_smiles)
    if mol is None:
        # RDKit reports an unparsable SMILES by returning None rather than
        # raising, so report the entry and move on instead of crashing.
        print('polymer smiles:', raw_smiles, 'could not be parsed by RDKit')
        continue

    ori_smi = Chem.MolToSmiles(mol)
    encoded = sf.encoder(ori_smi)
    decoded = sf.decoder(encoded)

    # Canonicalize the decoded SMILES so the comparison is not sensitive to
    # atom ordering. If RDKit cannot parse it, keep the raw decoder output so
    # the failing string is visible in the report.
    decoded_mol = Chem.MolFromSmiles(decoded)
    de_smi = decoded if decoded_mol is None else Chem.MolToSmiles(decoded_mol)

    print('polymer smiles:', ori_smi, 'selfies:', encoded, 'decode selfies:', de_smi, 'equal?:', ori_smi == de_smi)



0 comments on commit 1b4b843

Please sign in to comment.