-
Notifications
You must be signed in to change notification settings - Fork 2
/
smiles_randomization.py
107 lines (75 loc) · 3.28 KB
/
smiles_randomization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import random
from rdkit import Chem
from .conversion import mol_to_smiles, smiles_to_mol
# Largest value that may be handed to RDKit as a random seed
# (2**31 - 1 == 2147483647). Larger values cause problems.
_MAX_RDKIT_RANDOM_SEED = 2**31 - 1
def randomize_smiles_rotated(smiles: str, with_order_reversal: bool = True) -> str:
    """
    Randomize a SMILES string by doing a cyclic rotation of the atomic indices.

    Adapted from https://github.com/GLambard/SMILES-X/blob/758478663030580a363a9ee61c11f6d6448e18a1/SMILESX/augm.py#L19.

    The outputs of this function can be reproduced by setting the seed with
    random.seed(). One random integer is drawn for the rotation offset and,
    only when ``with_order_reversal`` is true, one random choice is drawn for
    the reversal.

    Raises:
        InvalidSmiles: for invalid molecules.

    Args:
        smiles: SMILES string to randomize.
        with_order_reversal: whether to reverse the atom order with 50% chance.

    Returns:
        Randomized SMILES string.
    """
    mol = smiles_to_mol(smiles, sanitize=False)
    n_atoms = mol.GetNumAtoms()

    # A molecule without atoms has nothing to rotate, and
    # random.randint(0, -1) would raise a ValueError. Write it out unchanged.
    if n_atoms == 0:
        return mol_to_smiles(mol, canonical=False)

    # Generate random values. The order of these two calls matters for
    # reproducibility under a fixed random.seed().
    rotation_index = random.randint(0, n_atoms - 1)
    reverse_order = with_order_reversal and random.choice([True, False])

    # Generate new atom indices order: rotation_index is always a valid
    # index here, so the two slices form a plain cyclic rotation.
    atoms = list(range(n_atoms))
    new_atoms_order = atoms[rotation_index:] + atoms[:rotation_index]
    if reverse_order:
        new_atoms_order.reverse()

    mol = Chem.RenumberAtoms(mol, new_atoms_order)
    return mol_to_smiles(mol, canonical=False)
def randomize_smiles_restricted(smiles: str) -> str:
    """
    Randomize a SMILES string in a restricted fashion.

    The atom indices of the parsed molecule are shuffled with
    random.shuffle(), the molecule is renumbered accordingly, and it is
    written back as a non-canonical SMILES string.

    The outputs of this function can be reproduced by setting the seed with
    random.seed().

    Raises:
        InvalidSmiles: for invalid molecules.

    Args:
        smiles: SMILES string to randomize.

    Returns:
        Randomized SMILES string.
    """
    molecule = smiles_to_mol(smiles, sanitize=False)

    # Random permutation of all atom indices.
    shuffled_indices = list(range(molecule.GetNumAtoms()))
    random.shuffle(shuffled_indices)

    renumbered = Chem.RenumberAtoms(molecule, newOrder=shuffled_indices)
    return mol_to_smiles(renumbered, canonical=False)
def randomize_smiles_unrestricted(smiles: str) -> str:
    """
    Randomize a SMILES string in an unrestricted fashion.

    The input is split on "." into fragments; each fragment is randomized
    separately by RDKit with the same sampled seed, and the randomized
    fragments are then shuffled and joined back with ".".

    The outputs of this function can be reproduced by setting the seed with
    random.seed().

    Raises:
        InvalidSmiles: for invalid molecules.

    Args:
        smiles: SMILES string to randomize.

    Returns:
        Randomized SMILES string.
    """
    # The seed passed to RDKit is drawn from Python's random module, so that
    # calling random.seed() beforehand makes the whole call reproducible.
    rdkit_seed = random.randint(1, _MAX_RDKIT_RANDOM_SEED)

    # Unrestricted randomization does not work on compounds with multiple
    # fragments (unlike the other randomizations), so every fragment is
    # parsed and randomized on its own.
    fragment_mols = []
    for fragment_smiles in smiles.split("."):
        fragment_mols.append(smiles_to_mol(fragment_smiles, sanitize=False))

    # Note: for reproducibility, an explicit seed is given here instead of
    # relying on Chem.MolToSmiles(mol, canonical=False, doRandom=True).
    # See https://www.rdkit.org/docs/Cookbook.html#enumerate-smiles
    randomized_fragments = []
    for fragment_mol in fragment_mols:
        candidates = Chem.MolToRandomSmilesVect(fragment_mol, 1, rdkit_seed)
        randomized_fragments.append(candidates[0])

    # Shuffle the fragment order before joining them back together.
    random.shuffle(randomized_fragments)
    return ".".join(randomized_fragments)