From fbd3b45189cd51482442c30c904457796c9968d2 Mon Sep 17 00:00:00 2001
From: hyunp2 <42776897+hyunp2@users.noreply.github.com>
Date: Mon, 18 Sep 2023 10:38:33 -0500
Subject: [PATCH] Update 1_fragmentation.py

---
 1_fragmentation.py | 191 +++++++++++++++++++++++----------------------
 1 file changed, 96 insertions(+), 95 deletions(-)

diff --git a/1_fragmentation.py b/1_fragmentation.py
index 6d4008b8..9c260de6 100644
--- a/1_fragmentation.py
+++ b/1_fragmentation.py
@@ -6,98 +6,99 @@ import rdkit.Chem.AllChem as Chem
 from rdkit.Chem import AllChem
 
-# data cleaning
-df_info = pd.read_csv('data/hMOF_CO2_info.csv')
-df_info = df_info.dropna() # drop entries containing 'NaN'
-df_info = df_info[df_info.CO2_capacity_001>0] # only keep entries with positive CO2 working capacity
-df_info = df_info[~df_info.MOFid.str.contains('ERROR')] # drop entries with error
-df_info = df_info[~df_info.MOFid.str.contains('NA')] # drop entries with NA
-
-# get node and linker information
-metal_eles = ['Zn', 'Cu', 'Mn', 'Zr', 'Co', 'Ni', 'Fe', 'Cd', 'Pb', 'Al', 'Mg', 'V',
-              'Tb', 'Eu', 'Sm', 'Tm', 'Gd', 'Nd', 'Dy', 'La', 'Ba', 'Ga', 'In',
-              'Ti', 'Be', 'Ce', 'Li', 'Pd', 'Na', 'Er', 'Ho', 'Yb', 'Ag', 'Pr',
-              'Cs', 'Mo', 'Lu', 'Ca', 'Pt', 'Ge', 'Sc', 'Hf', 'Cr', 'Bi', 'Rh',
-              'Sn', 'Ir', 'Nb', 'Ru', 'Th', 'As', 'Sr']
-
-# get a list of metal nodes & create a new column named "metal_nodes"
-metal_nodes = []
-organic_linkers = []
-for i,mofid in tqdm(enumerate(df_info.MOFid)):
-    sbus = mofid.split()[0].split('.')
-    metal_nodes.append([c for c in sbus if any(e in c for e in metal_eles)][0])
-    organic_linkers.append([c for c in sbus if any(e in c for e in metal_eles)==False])
-
-df_info['metal_node'] = metal_nodes
-df_info['organic_linker'] = organic_linkers
-
-# get most occuring nodes
-unique_nodes = [n for n in list(df_info['metal_node'].unique()) if len(n)<=30] # node smiles should be shorter then 30 strings
-df_info = df_info[df_info['metal_node'].isin(unique_nodes)] # filter df_info based on unique_nodes
-freq = [df_info['metal_node'].value_counts()[value] for value in list(df_info.metal_node.unique())] # get frequency of unique nodes
-df_freq = pd.DataFrame({'node':list(df_info.metal_node.unique()),'freq':freq})
-print('node occurance:')
-print(df_freq.iloc[:5,:])
-unique_node_select = list(df_freq[df_freq.freq>=5000].node) # select occuring nodes
-df_info_select = df_info[df_info['metal_node'].isin(unique_node_select)] # select df_info with node only in list(unique_node_select)
-
-# create necessary folders
-os.makedirs(f'data/conformers',exist_ok=True)
-os.makedirs(f'data/data_by_node',exist_ok=True)
-os.makedirs(f'data/fragments_smi',exist_ok=True)
-
-# output each node to a separate csv files
-for n in unique_node_select:
-    n_name = n.replace('[','').replace(']','').replace('(','').replace(')','')
-    df_info_select_node = df_info[df_info.metal_node == n]
-    df_info_select_node.to_csv(f'data/data_by_node/{n_name}.csv',index=False)
-
-# load data
-for node in ['CuCu']: # change to ['CuCu','ZnZn','ZnOZnZnZn'] to reproduce paper result
-    node_name = node.replace('[','').replace(']','').replace('(','').replace(')','')
-    print(f'Now on node {node_name} ... ')
-    input_data_path = f'data/data_by_node/{node_name}.csv'
-
-    df = pd.read_csv(input_data_path)
-
-    # MOFs with three linkers
-    len_linkers = [len(eval(df['organic_linker'].iloc[i])) for i in range(len(df['organic_linker']))]
-    df['len_linkers'] = len_linkers
-    df_three_linkers = df[df.len_linkers==3]
-
-    # MOFs with high working capactiy at (wc > 2mmol/g @ 0.1 bar)
-    df_high_wc = df[df['CO2_capacity_01'] >=2]
-
-    # MOFs with high working capacity and three linkers
-    len_linkers = [len(eval(df_high_wc['organic_linker'].iloc[i])) for i in range(len(df_high_wc['organic_linker']))]
-    df_high_wc['len_linkers'] = len_linkers
-    df_high_wc_3_linkers = df_high_wc[df_high_wc.len_linkers==3]
-
-    # get list of SMILES for all linkers
-    list_smiles = [eval(i) for i in df_high_wc['organic_linker']]
-    all_smiles = list(itertools.chain(*list_smiles))
-    print(f'number of smiles: {len(all_smiles)}')
-    all_smiles_unique = list(pd.Series(all_smiles).unique())
-    print(f'number of unique_smiles: {len(all_smiles_unique)}')
-
-    # remove the line below to reproduce paper results
-    all_smiles_unique = all_smiles_unique[:1000]
-
-    # output to sdf
-    print('Outputting conformers to sdf ... ')
-    conformer_sdf_path = f'data/conformers/conformers_{node_name}.sdf'
-    if not os.path.isfile(conformer_sdf_path):
-        writer = Chem.SDWriter(conformer_sdf_path)
-        for smile in tqdm(all_smiles_unique): # change to tqdm(all_smiles_unique) to reproduce paper result
-            try:
-                mol = Chem.AddHs(Chem.MolFromSmiles(smile))
-                conformers = AllChem.EmbedMultipleConfs(mol, numConfs=1)
-                conformer = mol.GetConformer(0)
-                for cid in range(mol.GetNumConformers()):
-                    writer.write(mol, confId=cid)
-            except:
-                pass
-    if not os.path.isfile(f'data/fragments_smi/frag_{node_name}.txt'):
-        # generate fragment SMILES
-        print('Generating SMILES ... ')
-        subprocess.run(f'python utils/prepare_data_from_sdf.py --sdf_path data/conformers/conformers_{node_name}.sdf --output_path data/fragments_smi/frag_{node_name}.txt --verbose',shell=True)
+def fragmentation():
+    # data cleaning
+    df_info = pd.read_csv('data/hMOF_CO2_info.csv')
+    df_info = df_info.dropna() # drop entries containing NaN
+    df_info = df_info[df_info.CO2_capacity_001 > 0] # keep only entries with positive CO2 working capacity
+    df_info = df_info[~df_info.MOFid.str.contains('ERROR')] # drop entries with errors
+    df_info = df_info[~df_info.MOFid.str.contains('NA')] # drop entries with NA
+
+    # get node and linker information
+    metal_eles = ['Zn', 'Cu', 'Mn', 'Zr', 'Co', 'Ni', 'Fe', 'Cd', 'Pb', 'Al', 'Mg', 'V',
+                  'Tb', 'Eu', 'Sm', 'Tm', 'Gd', 'Nd', 'Dy', 'La', 'Ba', 'Ga', 'In',
+                  'Ti', 'Be', 'Ce', 'Li', 'Pd', 'Na', 'Er', 'Ho', 'Yb', 'Ag', 'Pr',
+                  'Cs', 'Mo', 'Lu', 'Ca', 'Pt', 'Ge', 'Sc', 'Hf', 'Cr', 'Bi', 'Rh',
+                  'Sn', 'Ir', 'Nb', 'Ru', 'Th', 'As', 'Sr']
+
+    # get a list of metal nodes & create a new column named "metal_node"
+    metal_nodes = []
+    organic_linkers = []
+    for i, mofid in tqdm(enumerate(df_info.MOFid)):
+        sbus = mofid.split()[0].split('.')
+        metal_nodes.append([c for c in sbus if any(e in c for e in metal_eles)][0])
+        organic_linkers.append([c for c in sbus if not any(e in c for e in metal_eles)])
+
+    df_info['metal_node'] = metal_nodes
+    df_info['organic_linker'] = organic_linkers
+
+    # get the most frequently occurring nodes
+    unique_nodes = [n for n in list(df_info['metal_node'].unique()) if len(n) <= 30] # node SMILES should be shorter than 30 characters
+    df_info = df_info[df_info['metal_node'].isin(unique_nodes)] # filter df_info based on unique_nodes
+    freq = [df_info['metal_node'].value_counts()[value] for value in list(df_info.metal_node.unique())] # get frequency of unique nodes
+    df_freq = pd.DataFrame({'node': list(df_info.metal_node.unique()), 'freq': freq})
+    print('node occurrence:')
+    print(df_freq.iloc[:5, :])
+    unique_node_select = list(df_freq[df_freq.freq >= 5000].node) # keep nodes that occur at least 5000 times
+    df_info_select = df_info[df_info['metal_node'].isin(unique_node_select)] # keep rows whose node is in unique_node_select
+
+    # create necessary folders
+    os.makedirs('data/conformers', exist_ok=True)
+    os.makedirs('data/data_by_node', exist_ok=True)
+    os.makedirs('data/fragments_smi', exist_ok=True)
+
+    # output each node to a separate CSV file
+    for n in unique_node_select:
+        n_name = n.replace('[','').replace(']','').replace('(','').replace(')','')
+        df_info_select_node = df_info[df_info.metal_node == n]
+        df_info_select_node.to_csv(f'data/data_by_node/{n_name}.csv', index=False)
+
+    # load data
+    for node in ['CuCu']: # change to ['CuCu','ZnZn','ZnOZnZnZn'] to reproduce paper result
+        node_name = node.replace('[','').replace(']','').replace('(','').replace(')','')
+        print(f'Now on node {node_name} ... ')
+        input_data_path = f'data/data_by_node/{node_name}.csv'
+
+        df = pd.read_csv(input_data_path)
+
+        # MOFs with three linkers
+        len_linkers = [len(eval(df['organic_linker'].iloc[i])) for i in range(len(df['organic_linker']))]
+        df['len_linkers'] = len_linkers
+        df_three_linkers = df[df.len_linkers == 3]
+
+        # MOFs with high working capacity (wc >= 2 mmol/g @ 0.1 bar)
+        df_high_wc = df[df['CO2_capacity_01'] >= 2].copy() # .copy() avoids a pandas SettingWithCopyWarning below
+
+        # MOFs with high working capacity and three linkers
+        len_linkers = [len(eval(df_high_wc['organic_linker'].iloc[i])) for i in range(len(df_high_wc['organic_linker']))]
+        df_high_wc['len_linkers'] = len_linkers
+        df_high_wc_3_linkers = df_high_wc[df_high_wc.len_linkers == 3]
+
+        # get list of SMILES for all linkers
+        list_smiles = [eval(i) for i in df_high_wc['organic_linker']]
+        all_smiles = list(itertools.chain(*list_smiles))
+        print(f'number of smiles: {len(all_smiles)}')
+        all_smiles_unique = list(pd.Series(all_smiles).unique())
+        print(f'number of unique smiles: {len(all_smiles_unique)}')
+
+        # remove the line below to reproduce paper results
+        all_smiles_unique = all_smiles_unique[:1000]
+
+        # output to sdf
+        print('Outputting conformers to sdf ... ')
+        conformer_sdf_path = f'data/conformers/conformers_{node_name}.sdf'
+        if not os.path.isfile(conformer_sdf_path):
+            writer = Chem.SDWriter(conformer_sdf_path)
+            for smile in tqdm(all_smiles_unique):
+                try:
+                    mol = Chem.AddHs(Chem.MolFromSmiles(smile))
+                    AllChem.EmbedMultipleConfs(mol, numConfs=1)
+                    for cid in range(mol.GetNumConformers()):
+                        writer.write(mol, confId=cid)
+                except Exception: # skip SMILES that RDKit cannot parse or embed
+                    pass
+        if not os.path.isfile(f'data/fragments_smi/frag_{node_name}.txt'):
+            # generate fragment SMILES
+            print('Generating SMILES ... ')
+            subprocess.run(f'python utils/prepare_data_from_sdf.py --sdf_path data/conformers/conformers_{node_name}.sdf --output_path data/fragments_smi/frag_{node_name}.txt --verbose', shell=True)
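
Note: the hunk wraps the whole script body into fragmentation() but shows no call site. Unless another part of the file invokes the function, running `python 1_fragmentation.py` after this change becomes a no-op. A minimal entry-point sketch, assuming the script is still meant to be run directly:

    if __name__ == '__main__':
        fragmentation()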
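
The try/except around conformer generation silently swallows every failure. Below is a sketch of a stricter helper that separates the two common failure points: Chem.MolFromSmiles() returning None for an unparseable SMILES, and AllChem.EmbedMultipleConfs() returning an empty ID list when 3D embedding fails. The helper name embed_and_write and the random_seed default are illustrative, not part of this patch:

    from rdkit import Chem
    from rdkit.Chem import AllChem

    def embed_and_write(smiles, writer, num_confs=1, random_seed=42):
        """Embed conformers for one SMILES and write them to an open SDWriter.
        Returns the number of conformers written (0 on failure)."""
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return 0  # RDKit could not parse this SMILES
        mol = Chem.AddHs(mol)  # add explicit hydrogens before embedding
        cids = AllChem.EmbedMultipleConfs(mol, numConfs=num_confs, randomSeed=random_seed)
        for cid in cids:  # empty when embedding fails
            writer.write(mol, confId=cid)
        return len(cids)

Counting the SMILES for which the helper returns 0 would show how many linkers are silently dropped before fragmentation.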