-
Notifications
You must be signed in to change notification settings - Fork 4
/
1_fragmentation.py
103 lines (88 loc) · 4.86 KB
/
1_fragmentation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import os
import subprocess
import itertools
from tqdm import tqdm
import pandas as pd
import rdkit.Chem.AllChem as Chem
from rdkit.Chem import AllChem
# data cleaning
df_info = pd.read_csv('data/hMOF_CO2_info.csv')
df_info = df_info.dropna() # drop entries containing 'NaN'
df_info = df_info[df_info.CO2_capacity_001>0] # only keep entries with positive CO2 working capacity
df_info = df_info[~df_info.MOFid.str.contains('ERROR')] # drop entries with error
df_info = df_info[~df_info.MOFid.str.contains('NA')] # drop entries with NA
# get node and linker information
metal_eles = ['Zn', 'Cu', 'Mn', 'Zr', 'Co', 'Ni', 'Fe', 'Cd', 'Pb', 'Al', 'Mg', 'V',
'Tb', 'Eu', 'Sm', 'Tm', 'Gd', 'Nd', 'Dy', 'La', 'Ba', 'Ga', 'In',
'Ti', 'Be', 'Ce', 'Li', 'Pd', 'Na', 'Er', 'Ho', 'Yb', 'Ag', 'Pr',
'Cs', 'Mo', 'Lu', 'Ca', 'Pt', 'Ge', 'Sc', 'Hf', 'Cr', 'Bi', 'Rh',
'Sn', 'Ir', 'Nb', 'Ru', 'Th', 'As', 'Sr']
# get a list of metal nodes & create a new column named "metal_nodes"
metal_nodes = []
organic_linkers = []
for i,mofid in tqdm(enumerate(df_info.MOFid)):
sbus = mofid.split()[0].split('.')
metal_nodes.append([c for c in sbus if any(e in c for e in metal_eles)][0])
organic_linkers.append([c for c in sbus if any(e in c for e in metal_eles)==False])
df_info['metal_node'] = metal_nodes
df_info['organic_linker'] = organic_linkers
# get most occuring nodes
unique_nodes = [n for n in list(df_info['metal_node'].unique()) if len(n)<=30] # node smiles should be shorter then 30 strings
df_info = df_info[df_info['metal_node'].isin(unique_nodes)] # filter df_info based on unique_nodes
freq = [df_info['metal_node'].value_counts()[value] for value in list(df_info.metal_node.unique())] # get frequency of unique nodes
df_freq = pd.DataFrame({'node':list(df_info.metal_node.unique()),'freq':freq})
print('node occurance:')
print(df_freq.iloc[:5,:])
unique_node_select = list(df_freq[df_freq.freq>=5000].node) # select occuring nodes
df_info_select = df_info[df_info['metal_node'].isin(unique_node_select)] # select df_info with node only in list(unique_node_select)
# create necessary folders
os.makedirs(f'data/conformers',exist_ok=True)
os.makedirs(f'data/data_by_node',exist_ok=True)
os.makedirs(f'data/fragments_smi',exist_ok=True)
# output each node to a separate csv files
for n in unique_node_select:
n_name = n.replace('[','').replace(']','').replace('(','').replace(')','')
df_info_select_node = df_info[df_info.metal_node == n]
df_info_select_node.to_csv(f'data/data_by_node/{n_name}.csv',index=False)
# load data
for node in ['CuCu']: # change to ['CuCu','ZnZn','ZnOZnZnZn'] to reproduce paper result
node_name = node.replace('[','').replace(']','').replace('(','').replace(')','')
print(f'Now on node {node_name} ... ')
input_data_path = f'data/data_by_node/{node_name}.csv'
df = pd.read_csv(input_data_path)
# MOFs with three linkers
len_linkers = [len(eval(df['organic_linker'].iloc[i])) for i in range(len(df['organic_linker']))]
df['len_linkers'] = len_linkers
df_three_linkers = df[df.len_linkers==3]
# MOFs with high working capactiy at (wc > 2mmol/g @ 0.1 bar)
df_high_wc = df[df['CO2_capacity_01'] >=2]
# MOFs with high working capacity and three linkers
len_linkers = [len(eval(df_high_wc['organic_linker'].iloc[i])) for i in range(len(df_high_wc['organic_linker']))]
df_high_wc['len_linkers'] = len_linkers
df_high_wc_3_linkers = df_high_wc[df_high_wc.len_linkers==3]
# get list of SMILES for all linkers
list_smiles = [eval(i) for i in df_high_wc['organic_linker']]
all_smiles = list(itertools.chain(*list_smiles))
print(f'number of smiles: {len(all_smiles)}')
all_smiles_unique = list(pd.Series(all_smiles).unique())
print(f'number of unique_smiles: {len(all_smiles_unique)}')
# remove the line below to reproduce paper results
all_smiles_unique = all_smiles_unique[:1000]
# output to sdf
print('Outputting conformers to sdf ... ')
conformer_sdf_path = f'data/conformers/conformers_{node_name}.sdf'
if not os.path.isfile(conformer_sdf_path):
writer = Chem.SDWriter(conformer_sdf_path)
for smile in tqdm(all_smiles_unique): # change to tqdm(all_smiles_unique) to reproduce paper result
try:
mol = Chem.AddHs(Chem.MolFromSmiles(smile))
conformers = AllChem.EmbedMultipleConfs(mol, numConfs=1)
conformer = mol.GetConformer(0)
for cid in range(mol.GetNumConformers()):
writer.write(mol, confId=cid)
except:
pass
if not os.path.isfile(f'data/fragments_smi/frag_{node_name}.txt'):
# generate fragment SMILES
print('Generating SMILES ... ')
subprocess.run(f'python utils/prepare_data_from_sdf.py --sdf_path data/conformers/conformers_{node_name}.sdf --output_path data/fragments_smi/frag_{node_name}.txt --verbose',shell=True)