-
Notifications
You must be signed in to change notification settings - Fork 5
/
clean.py
279 lines (238 loc) · 9.83 KB
/
clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
import argparse
import os
import pickle
import numpy as np
import pandas as pd
import yaml
from rdkit import Chem
from rdkit.Chem import Descriptors
def sanitize_smiles(smiles):
    """
    Convert a SMILES string into the canonical SMILES produced by RDKit.
    input:
        smiles: SMILES string of a single molecule
    output:
        canonical SMILES string, or np.nan (after printing a message) when
        the input is not a string, is empty, cannot be parsed by RDKit or
        canonicalizes to an empty string
    """
    # Values that are not strings (e.g. None or NaN coming out of a DataFrame)
    # can never describe a molecule, so reject them before touching RDKit.
    if not isinstance(smiles, str):
        print(f"Invalid SMILES: {smiles}")
        return np.nan
    if smiles == "":
        print("Invalid SMILES: ''")
        return np.nan

    try:
        mol = Chem.MolFromSmiles(smiles)
        # RDKit reports an unparsable SMILES by returning None instead of raising.
        canonical = Chem.MolToSmiles(mol) if mol is not None else None
    except Exception:
        # `Exception` (not a bare except) so that Ctrl-C / SystemExit still
        # abort a long cleaning run.
        canonical = None

    if canonical is None:
        print(f"Invalid SMILES: {smiles}")
        return np.nan
    if canonical == "":
        print("Invalid SMILES: ''")
        return np.nan
    return canonical
def creating_single_molecule_directory(output_dir, name, years):
    """
    Load the per-year pickled dictionaries of one query and merge them.
    input:
        output_dir: parent directory that contains the query directory
        name: name of the query; used as directory name and inside the
              file names "subject_smiles_dictionary_<name>_<year>"
        years: iterable with the years whose files should be read
    output:
        dictionary in which every key found in any of the files maps to
        the set union of the values stored under that key in all files
    """
    merged = {}
    for year in years:
        try:
            yearly_path = os.path.join(
                output_dir, name, "subject_smiles_dictionary_" + name + "_" + str(year)
            )
            # NOTE: pickle.load can execute arbitrary code, so only files
            # produced by a trusted run should be placed in this directory.
            with open(yearly_path, "rb") as handle:
                yearly_entries = pickle.load(handle)
            print(str(year), "directory is", len(yearly_entries), "long.")
            for key, values in yearly_entries.items():
                bucket = merged.setdefault(key, set())
                bucket |= set(values)
        except Exception as e:
            # Best effort: a missing or unreadable year is reported and skipped.
            print("did not work", e)
            continue
    print("Length of", name, "directory is", len(merged))
    return merged
def cleaning_data_set(directory, charged_only=False, neutral_only=False):
    """
    Creates a pandas dataframe for easier overlooking of files and
    optionally keeps only the charged or only the neutral molecules.
    A molecule counts as charged when its SMILES string contains a "+"
    or a "-" character and as neutral when it contains neither.
    input:
        directory: python dictionary mapping SMILES strings to the
                   patents in which they occur
        charged_only: keep only molecules whose SMILES contains "+" or "-"
        neutral_only: keep only molecules whose SMILES contains neither;
                      ignored when charged_only is also set
    output:
        pandas DF with a "smiles" column followed by one column per
        patent slot; when a filter is active, patent columns that are
        empty for every remaining molecule are removed. Without any
        filter the complete dataframe is returned.
    """
    print("Size of directory before selecting charged molecules:", len(directory))
    smiles_df = pd.DataFrame.from_dict(directory, orient="index")
    smiles_df = smiles_df.reset_index().rename(columns={"index": "smiles"})

    if not (charged_only or neutral_only):
        # Nothing to filter: hand back every molecule instead of None.
        return smiles_df

    # str.find gives -1 when the character is absent and NaN for non-string
    # entries; NaN fails both comparisons below, so such rows are never kept.
    # NOTE(review): "-" is also the explicit single-bond symbol in SMILES, so
    # this textual test can label a neutral molecule as charged - confirm
    # that the input never spells out single bonds.
    plus_at = smiles_df["smiles"].str.find("+")
    minus_at = smiles_df["smiles"].str.find("-")
    if charged_only:
        keep = (plus_at >= 0) | (minus_at >= 0)
        label = "charged"
    else:
        keep = (plus_at == -1) & (minus_at == -1)
        label = "neutral"
    selected = smiles_df[keep]

    # Remove patent columns that became empty through the filtering. The
    # "smiles" column is deliberately excluded from this check so that an
    # empty selection still yields a frame with a "smiles" column.
    unused = [
        column
        for column in selected.columns
        if column != "smiles" and selected[column].isna().all()
    ]
    selected = selected.drop(columns=unused)
    print(f"Size of directory after selecting {label} molecules:", selected.shape[0])
    return selected
def cleaning_molecules_from_subst(df_molecules):
    """
    Cleaning the molecules from the different
    R (* in the dataset), Y substituents
    Changing R into an ethyl group (CC) and Y into oxygen
    Also, dropping molecules which stayed invalid after change
    input:
        Pandas dataframe whose first column is "smiles"; every further
        column is treated as a patent reference of that molecule
    output:
        Copy of the dataframe that holds only the molecules accepted by
        sanitize_smiles (in sanitized form) plus a "patents" column with
        the comma-joined, non-missing values of the patent columns
    """
    df_clean = df_molecules.copy()
    num_mols = df_clean.shape[0]

    # Statistics on the placeholder substituents present before substitution
    num_mols_w_Y = int((df_clean.smiles.str.count("Y") > 0).sum())
    num_mols_w_R = int((df_clean.smiles.str.count(r"\*") > 0).sum())
    total_num_special_mols = int((df_clean.smiles.str.count(r"Y|\*") > 0).sum())
    num_normal_mols = num_mols - total_num_special_mols
    # An empty dataframe would otherwise divide by zero
    special_fraction = total_num_special_mols / num_mols if num_mols else 0.0
    print(f"Found {num_mols_w_Y} molecules with 'Y' substituents.")
    print(f"Found {num_mols_w_R} molecules with '*' substituents.")
    print(f"Found {total_num_special_mols} molecules with 'Y' or '*' substituents.")
    print(f"Found {num_normal_mols} molecules without 'Y' or '*' substituents.")
    print(f"Percent of molecules with 'Y' or '*' substituents: {special_fraction}")

    print('Substituting "*" and "Y" substituents with ethyl and O groups.')
    # Applied in this order: the specific bracketed / charged forms have to be
    # rewritten before the final catch-all rule for a bare "*".
    substituent_rules = (
        (r"\[\d+\*\]", "CC"),
        (r"(\d\d\*\+)", "CC+"),
        (r"(\d\*\+)", "CC+"),
        (r"(\*\+)", "CC+"),
        (r"\[\d+\*+:\d\]", "CC"),
        (r"\[\*+:\d\]", "CC"),
        (r"\*", "CC"),
    )
    smiles = df_clean["smiles"]
    for pattern, replacement in substituent_rules:
        # regex=True must be explicit: pandas >= 2.0 defaults to literal
        # matching, which would silently turn these rules into no-ops.
        smiles = smiles.str.replace(pattern, replacement, regex=True)
    smiles = smiles.str.replace("Y", "O", regex=False)

    # Sanitizing molecules, checking for and dropping invalid molecules
    print("Sanitizing molecules, checking for invalid molecules...")
    df_clean["smiles"] = smiles.apply(lambda smi: sanitize_smiles(smi))
    print(f"Removing {df_clean.smiles.isna().sum()} invalid molecules...")
    df_clean = df_clean.dropna(subset=["smiles"])
    print("Done.")

    # Collect all the patents into a single column; missing entries (padding
    # of molecules with fewer patents) are skipped.
    patent_values = df_clean[df_clean.columns[1:]].to_numpy(dtype=object)
    df_clean["patents"] = [
        ",".join(str(value) for value in row if not pd.isna(value))
        for row in patent_values
    ]
    return df_clean
def select_molecules_by_mw(df_molecules, min_mw=0, max_mw=1e4):
    """
    Selects molecules by their molecular weight (MW) range.
    The dataframe passed in is left unchanged.
    input:
        Pandas dataframe with smiles molecules (column "smiles")
        min_mw: minimum molecular weight (inclusive)
        max_mw: maximum molecular weight (inclusive)
    output:
        New pandas dataframe with an additional "MW" column that holds
        only the molecules within the MW range
    """
    print(f"Selecting molecules with MW between {min_mw} and {max_mw}...")
    # Work on a copy so the caller's dataframe does not silently gain an
    # "MW" column.
    df_selected = df_molecules.copy()
    df_selected["MW"] = (
        df_selected["smiles"]
        .apply(lambda smi: Descriptors.MolWt(Chem.MolFromSmiles(smi)))
        .astype(float)
    )
    # Series.between is inclusive on both ends by default
    df_selected = df_selected[df_selected["MW"].between(min_mw, max_mw)]
    print(f"Done. {df_selected.shape[0]} molecules selected.")
    return df_selected
def get_parser():
    """
    Build the command line parser of this script.
    output:
        argparse.ArgumentParser with the options --years and --naming
        (both required), --output_dir, --charged_only, --neutral_only,
        --mw_min and --mw_max
    """
    # One (flag, keyword arguments) entry per option, registered in order.
    option_table = (
        (
            "--years",
            {
                "type": str,
                "nargs": "+",
                "required": True,
                "help": "Year(s) of patent files to download (or 'all')",
            },
        ),
        (
            "--naming",
            {
                "type": str,
                "required": True,
                "help": "Name of query (location where results will be stored)",
            },
        ),
        (
            "--output_dir",
            {
                "type": str,
                "default": ".",
                "help": "Parent directory of 'args.naming'",
            },
        ),
        (
            "--charged_only",
            {
                "default": False,
                "action": "store_true",
                "help": "Whether to include only charged molecules in the dataset",
            },
        ),
        (
            "--neutral_only",
            {
                "default": False,
                "action": "store_true",
                "help": "Whether to include only neutral molecules in the dataset",
            },
        ),
        (
            "--mw_min",
            {
                "type": float,
                "default": 0,
                "help": "Minimum molecular weight to accept",
            },
        ),
        (
            "--mw_max",
            {
                "type": float,
                "default": 1e4,
                "help": "Maximum molecular weight to accept",
            },
        ),
    )
    cli = argparse.ArgumentParser(description="Downloads patent files from USPTO website")
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli
def main(args):
    """
    Run the cleaning pipeline for one query and write the result files.
    Merges the per-year dictionaries, optionally keeps only charged or
    only neutral molecules, substitutes and sanitizes the molecules,
    optionally filters by molecular weight and finally writes into
    <output_dir>/<naming>/:
        subject_smiles_dictionary_<naming>.yml  (SMILES with their patents)
        smiles_list_<naming>.txt                (unique SMILES, one per line)
    input:
        args: parsed command line arguments as produced by get_parser()
    """
    if args.years == ["all"]:
        print("Preparing to clean chemistry patents from 2001 to 2023...")
        years = list(map(str, range(2001, 2024)))
    else:
        print("Preparing to clean chemistry patents from", ", ".join(args.years), "...")
        years = args.years

    subject_smiles_dictionary_abstract = creating_single_molecule_directory(
        args.output_dir, args.naming, years
    )

    if args.charged_only:
        df = cleaning_data_set(subject_smiles_dictionary_abstract, charged_only=True)
    elif args.neutral_only:
        df = cleaning_data_set(subject_smiles_dictionary_abstract, neutral_only=True)
    else:
        # The unfiltered table is only needed (and therefore only built) here.
        df = pd.DataFrame.from_dict(subject_smiles_dictionary_abstract, orient="index")
        df = df.reset_index().rename(columns={"index": "smiles"})

    df_cleaned = cleaning_molecules_from_subst(df)
    # Skip the MW computation entirely when both limits are at their defaults
    if args.mw_min > 0 or args.mw_max < 1e4:
        df_cleaned = select_molecules_by_mw(df_cleaned, args.mw_min, args.mw_max)

    query_dir = os.path.join(args.output_dir, args.naming)

    # Create yaml file with SMILES and patents; "reference" is a list that
    # holds the comma-joined patent string of the molecule.
    smiles_dict_filelist = [
        {"smiles": smi, "reference": [patents]}
        for smi, patents in zip(df_cleaned["smiles"], df_cleaned["patents"])
    ]
    yaml_file = os.path.join(query_dir, "subject_smiles_dictionary_" + args.naming + ".yml")
    with open(yaml_file, "w") as outfile:
        yaml.dump(smiles_dict_filelist, outfile, default_flow_style=False)

    # Write SMILES list to file: one SMILES per line, no trailing newline.
    # Sorted so that repeated runs produce identical files (plain set
    # iteration order changes from run to run).
    clean_smiles = sorted(set(df_cleaned["smiles"]))
    print(f"Writing {len(clean_smiles)} unique SMILES to file...")
    smiles_txt_file = os.path.join(query_dir, f"smiles_list_{args.naming}.txt")
    with open(smiles_txt_file, "w") as f:
        f.write("\n".join(clean_smiles))
    return
if __name__ == "__main__":
    # Script entry point: read the command line options, then run the
    # cleaning pipeline with them.
    args = get_parser().parse_args()
    main(args)