diff --git a/README.md b/README.md index 4c1c518d..b9a75f88 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,19 @@ If you want to use this project for development, we recommend going through [lib To reproduce the substructure benchmark [Prody](https://prody.csb.pitt.edu/) and [Rdkit](https://www.rdkit.org/) are also required. +### Installation Guide +#### Using with conda environment +We can easily create a new enviroment which contains necessary dependencies following the command below: +``` +conda env create -f environment.yml +``` + +or using: +``` +chmod +x install.sh +bash install.sh +``` + ## Dataset Preprocessing PDB files are first parsed to remove hetero atoms, then converted to "gninatypes" files and finally collected into a "molcache2" file for quicker input and model training with libmolgrid. "gninatypes" and "molcache2" files are binary files that store an efficient representation of the input protein to be used for gridding the molecule. They are prepared for faster input with libmolgrid for quicker training of the CNN models. @@ -71,9 +84,17 @@ I have written down steps below that pertain to preparing training data for a da Steps for preparing training data: 1) remove hetero atoms (clean_pdb.py) +- Example: `python clean_pdb.py 8BRA.pdb 8BRA_protein.pdb` 2) run fpocket through structures (fpocket -f *_protein.pdb) +- Example: `fpocket -f 8BRA_protein.pdb`. It outputs a folder named "8BRA_protein_out" including: +``` +8BRA_protein.pml 8BRA_protein_PYMOL.sh 8BRA_protein_info.txt 8BRA_protein_pockets.pqr +8BRA_protein.tcl 8BRA_protein_VMD.sh 8BRA_protein_out.pdb pockets +``` 3) get candidate pocket centers for all structures (get_centers.py) +- Example: `python get_centers.py 8BRA_protein_out/`. It outputs a file named "bary_centers.txt" in the folder. 4) create .gninatypes files for all structure (gninatype() in types_and_gninatyper.py) +- Example: `python types_and_gninatyper.py 8BRA_protein_out/8BRA_protein_out.pdb`. It outputs a file named "8BRA_protein_out.gninatypes". 5) make train and test types (make_types.py) 6) create molcache file for training (create_molcache2.py) diff --git a/clean_pdb.py b/clean_pdb.py index f3f366df..cbb1c212 100644 --- a/clean_pdb.py +++ b/clean_pdb.py @@ -1,6 +1,7 @@ ''' Takes a PDB file and removes hetero atoms from its structure. First argument is path to original file, second argument is path to generated file +Examples: python clean_pdb.py 8BRA.pdb 8BRA_cleaned.pdb ''' from Bio.PDB import PDBParser, PDBIO, Select import Bio diff --git a/create_molcache2.py b/create_molcache2.py index 6458e7a0..8f618fbe 100644 --- a/create_molcache2.py +++ b/create_molcache2.py @@ -97,24 +97,27 @@ def create_cache2(molfiles, data_root, outfile): out.seek(0, os.SEEK_END) out.close() - -parser = argparse.ArgumentParser() -parser.add_argument('-c', '--col', required=True, type=int, help='Column receptor starts on') -parser.add_argument('--recmolcache', default='rec.molcache2', type=str, help='Filename of receptor cache') -parser.add_argument('-d', '--data_root', type=str, required=False, - help="Root folder for relative paths in train/test files", default='') -parser.add_argument('fnames', nargs='+', type=str, help='types files to process') - -args = parser.parse_args() - -# load all file names into memory -seenrec = set() -for fname in args.fnames: - for line in open(fname): - vals = line.split() - rec = vals[args.col] - - if rec not in seenrec: - seenrec.add(rec) - -create_cache2(sorted(list(seenrec)), args.data_root, args.recmolcache) +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--col', required=True, type=int, help='Column receptor starts on') + parser.add_argument('--recmolcache', default='rec.molcache2', type=str, help='Filename of receptor cache') + parser.add_argument('-d', '--data_root', type=str, required=False, + help="Root folder for relative paths in train/test files", default='') + parser.add_argument('fnames', nargs='+', type=str, help='types files to process') + + args = parser.parse_args() + + # load all file names (.gninatypes) for each pdb file into memory + seenrec = set() + for fname in args.fnames: + for line in open(fname): + vals = line.split() + rec = vals[args.col] #get .gninatypes path in the 5th column of the .types file + + if rec not in seenrec: + seenrec.add(rec) + + create_cache2(sorted(list(seenrec)), args.data_root, args.recmolcache) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/environment.yml b/environment.yml new file mode 100644 index 00000000..9665d724 --- /dev/null +++ b/environment.yml @@ -0,0 +1,153 @@ +name: deeppocket +channels: + - rdkit + - pytorch + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - biopython=1.79=py37h5e8e339_1 + - blas=1.0=mkl + - bottleneck=1.3.5=py37h7deecbd_0 + - brotlipy=0.7.0=py37h540881e_1004 + - bzip2=1.0.8=h7b6447c_0 + - c-ares=1.18.1=h7f98852_0 + - ca-certificates=2023.01.10=h06a4308_0 + - cairo=1.16.0=hb05425b_4 + - certifi=2022.12.7=py37h06a4308_0 + - cffi=1.15.0=py37h036bc23_0 + - charset-normalizer=3.1.0=pyhd8ed1ab_0 + - cryptography=37.0.2=py37h38fbfac_0 + - curl=7.79.1=h2574ce0_1 + - cycler=0.11.0=pyhd8ed1ab_0 + - expat=2.4.9=h6a678d5_0 + - ffmpeg=4.3=hf484d3e_0 + - fftw=3.3.10=nompi_h77c792f_102 + - flit-core=3.6.0=pyhd3eb1b0_0 + - fontconfig=2.14.1=h4c34cd2_2 + - fpocket=4.0.2=hbb826e7_0 + - freetype=2.12.1=h4a9f257_0 + - giflib=5.2.1=h5eee18b_3 + - glib=2.69.1=he621ea3_2 + - gmp=6.2.1=h295c915_3 + - gnutls=3.6.15=he1e5248_0 + - hdf4=4.2.15=h10796ff_3 + - hdf5=1.12.1=nompi_h2750804_100 + - icu=58.2=he6710b0_3 + - idna=3.4=pyhd8ed1ab_0 + - intel-openmp=2021.4.0=h06a4308_3561 + - jpeg=9e=h5eee18b_1 + - keyutils=1.6.1=h166bdaf_0 + - kiwisolver=1.3.2=py37h2527ec5_1 + - krb5=1.19.3=h3790be6_0 + - lame=3.100=h7b6447c_0 + - lcms2=2.12=h3be6417_0 + - ld_impl_linux-64=2.38=h1181459_1 + - lerc=3.0=h295c915_0 + - libboost=1.73.0=h28710b8_12 + - libcurl=7.79.1=h2574ce0_1 + - libdeflate=1.17=h5eee18b_0 + - libedit=3.1.20191231=he28a2e2_2 + - libev=4.33=h516909a_1 + - libffi=3.4.2=h6a678d5_6 + - libgcc-ng=11.2.0=h1234567_1 + - libgfortran-ng=12.2.0=h69a702a_19 + - libgfortran5=12.2.0=h337968e_19 + - libgomp=11.2.0=h1234567_1 + - libiconv=1.16=h7f8727e_2 + - libidn2=2.3.2=h7f8727e_0 + - libnetcdf=4.8.1=nompi_hb3fd0d9_101 + - libnghttp2=1.43.0=h812cca2_1 + - libpng=1.6.39=h5eee18b_0 + - libssh2=1.10.0=ha56f1ee_2 + - libstdcxx-ng=11.2.0=h1234567_1 + - libtasn1=4.19.0=h5eee18b_0 + - libtiff=4.5.0=h6a678d5_2 + - libunistring=0.9.10=h27cfd23_0 + - libuuid=1.41.5=h5eee18b_0 + - libwebp=1.2.4=h11a3e52_1 + - libwebp-base=1.2.4=h5eee18b_1 + - libxcb=1.15=h7f8727e_0 + - libxml2=2.10.3=hcbfbd50_0 + - libzip=1.8.0=h4de3113_1 + - lz4-c=1.9.4=h6a678d5_0 + - matplotlib-base=3.4.3=py37h1058ff1_2 + - mkl=2021.4.0=h06a4308_640 + - mkl-service=2.4.0=py37h7f8727e_0 + - mkl_fft=1.3.1=py37hd3c417c_0 + - mkl_random=1.2.2=py37h51133e4_0 + - ncurses=6.4=h6a678d5_0 + - nettle=3.7.3=hbbd107a_1 + - numexpr=2.8.4=py37he184ba9_0 + - numpy=1.21.5=py37h6c91a56_3 + - numpy-base=1.21.5=py37ha15fc14_3 + - openh264=2.1.1=h4ff587b_0 + - openssl=1.1.1t=h7f8727e_0 + - packaging=22.0=py37h06a4308_0 + - pandas=1.3.5=py37h8c16a72_0 + - pcre=8.45=h295c915_0 + - pillow=9.4.0=py37h6a678d5_0 + - pip=22.3.1=py37h06a4308_0 + - pixman=0.40.0=h7f8727e_1 + - prody=2.1.0=py37hd23a5d3_0 + - py-boost=1.73.0=py37h51133e4_12 + - pycparser=2.21=pyhd8ed1ab_0 + - pyopenssl=22.0.0=pyhd8ed1ab_1 + - pyparsing=3.0.9=pyhd8ed1ab_0 + - pysocks=1.7.1=py37h89c1867_5 + - python=3.7.16=h7a1cb2a_0 + - python-dateutil=2.8.2=pyhd3eb1b0_0 + - python_abi=3.7=2_cp37m + - pytorch=1.13.1=py3.7_cpu_0 + - pytorch-mutex=1.0=cpu + - pytz=2022.7=py37h06a4308_0 + - rdkit=2020.09.1.0=py37hd50e099_1 + - readline=8.2=h5eee18b_0 + - requests=2.29.0=pyhd8ed1ab_0 + - scipy=1.7.3=py37h6c91a56_2 + - setuptools=65.6.3=py37h06a4308_0 + - six=1.16.0=pyhd3eb1b0_1 + - sqlite=3.41.2=h5eee18b_0 + - tk=8.6.12=h1ccaba5_0 + - torchaudio=0.13.1=py37_cpu + - torchvision=0.14.1=py37_cpu + - tornado=6.1=py37h540881e_3 + - typing_extensions=4.4.0=py37h06a4308_0 + - urllib3=1.26.15=pyhd8ed1ab_0 + - wheel=0.38.4=py37h06a4308_0 + - xz=5.2.10=h5eee18b_1 + - zlib=1.2.13=h5eee18b_0 + - zstd=1.5.5=hc292b87_0 + - pip: + - appdirs==1.4.4 + - click==8.1.3 + - docker-pycreds==0.4.0 + - exceptiongroup==1.1.1 + - gitdb==4.0.10 + - gitpython==3.1.31 + - imageio==2.28.0 + - importlib-metadata==6.6.0 + - iniconfig==2.0.0 + - joblib==1.2.0 + - molgrid==0.5.3 + - networkx==2.6.3 + - pathtools==0.1.2 + - pluggy==1.0.0 + - protobuf==4.22.3 + - psutil==5.9.5 + - pyquaternion==0.9.9 + - pytest==7.3.1 + - pywavelets==1.3.0 + - pyyaml==6.0 + - scikit-image==0.19.3 + - scikit-learn==1.0.2 + - sentry-sdk==1.21.0 + - setproctitle==1.3.2 + - smmap==5.0.0 + - threadpoolctl==3.1.0 + - tifffile==2021.11.2 + - tomli==2.0.1 + - wandb==0.15.0 + - zipp==3.15.0 +prefix: /home/ubuntu/.conda/envs/deeppocket \ No newline at end of file diff --git a/get_centers.py b/get_centers.py index d25f39ed..284bce51 100644 --- a/get_centers.py +++ b/get_centers.py @@ -1,5 +1,6 @@ ''' Takes the *_out/pockets directory from fpocket as input and outputs a file containining candidate pocket centers in that directory +Example: python get_centers.py 8BRA_protein_out/ ''' import os import numpy as np diff --git a/install.sh b/install.sh new file mode 100644 index 00000000..05c3f592 --- /dev/null +++ b/install.sh @@ -0,0 +1,10 @@ +conda create -c conda-forge -n deeppocket rdkit python=3.7 +conda activate deeppocket +conda install pytorch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 -c pytorch -y +conda install -c conda-forge fpocket -y +conda install -c conda-forge prody -y +pip install molgrid +conda install -c conda-forge biopython -y +pip install scikit-learn +pip install wandb +pip install scikit-image \ No newline at end of file diff --git a/make_types.py b/make_types.py index 34c0b78c..25187b92 100755 --- a/make_types.py +++ b/make_types.py @@ -19,7 +19,7 @@ def types_from_file(f,holo_f): c=mol.GetConformer() atom_np=c.GetPositions() atom_nps.append(atom_np) - centers = np.loadtxt(path + "/" + prot +"/"+prot+ "_protein_nowat_out/pockets/" + "bary" + "_centers.txt") + centers = np.loadtxt(path + "/" + prot +"/"+prot+ "_protein_nowat_out/pockets/bary_centers.txt") if centers.shape[0]==4 and len(centers.shape)==1: centers=np.expand_dims(centers,axis=0) limit = centers.shape[0] diff --git a/make_types_singly.py b/make_types_singly.py new file mode 100644 index 00000000..2bb0c856 --- /dev/null +++ b/make_types_singly.py @@ -0,0 +1,38 @@ +import os +import sys +import numpy as np +import time +import rdkit +from rdkit.Chem import AllChem as Chem +from rdkit.Chem import AllChem + + +def types_from_file(ligand_file, bary_centers_file, output_types_file): + distance = 4 + atom_nps=[] + mol = Chem.MolFromMolFile(ligand_file,sanitize=False) + c=mol.GetConformer() + atom_np=c.GetPositions() + atom_nps.append(atom_np) + centers = np.loadtxt(bary_centers_file) + if centers.shape[0]==4 and len(centers.shape)==1: + centers=np.expand_dims(centers,axis=0) + limit = centers.shape[0] + if not (centers.shape[0]==0 and len(centers.shape)==1): + sorted_centers = centers[:int(limit), 1:] + for i in range(int(limit)): + label=0 + for atom_np in atom_nps: + dist = np.linalg.norm((atom_np - sorted_centers[i,:]), axis=1) + rel_centers = np.where(dist <= float(distance), 1, 0) + if (np.count_nonzero(rel_centers) > 0): + label =1 + else: + label =0 + output_types_file.write(str(label)+ ' '+ str(sorted_centers[i][0])+ ' '+ str(sorted_centers[i][1])+ ' '+ str(sorted_centers[i][2])+ ' '+prot+'/'+prot+'_protein_nowat.gninatypes\n') + +def main(): + output_types_file = open('pdbbind_train.types','w') + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/predict.py b/predict.py index b6f106d3..f6dfc66f 100644 --- a/predict.py +++ b/predict.py @@ -96,7 +96,7 @@ def get_model_gmaker_eprovider(test_types,batch_size,model,checkpoint,dims=None) zipped_lists = zip(class_probs[:len(types_lines)], types_lines) sorted_zipped_lists = sorted(zipped_lists,reverse=True) ranked_types = [element for _, element in sorted_zipped_lists] - confidence_types = [element for element, _ in sorted_zipped_lists] + confidence_types = [str(element) for element, _ in sorted_zipped_lists] seg_types= class_types.replace('.types','_ranked.types') probs_types_file=class_types.replace('.types','_confidence.txt') fout=open(seg_types,'w') diff --git a/prepare_data_pipeline.py b/prepare_data_pipeline.py new file mode 100644 index 00000000..894cbe0c --- /dev/null +++ b/prepare_data_pipeline.py @@ -0,0 +1,68 @@ +import os +from clean_pdb import clean_pdb +from create_molcache2 import create_cache2 +from get_centers import get_centers +from make_types_singly import types_from_file +from prepare_data_pipeline_helper import initialize_parameters +from types_and_gninatyper import gninatype + +EXTENSION_CLEAN_STEP = '_protein' + +def clean_pdb_step(input_file, output_path): + print("Starting cleaning pdb...") + input_file_path = input_file.path + cleaned_pdb_file_name = str(input_file.name).replace('.pdb', EXTENSION_CLEAN_STEP +".pdb") + cleaned_pdb_file_path = os.path.join(output_path, cleaned_pdb_file_name) + clean_pdb(input_file_path,cleaned_pdb_file_path) + print("Finished cleaning pdb!") + return cleaned_pdb_file_name, cleaned_pdb_file_path + +def fpocket_step(cleaned_pdb_file_path): + os.system("fpocket -f {}".format(cleaned_pdb_file_path)) + fpocket_dir = cleaned_pdb_file_path.replace('.pdb', '_out') + fpocket_pdb_file_path = os.path.join(fpocket_dir, cleaned_pdb_file_path.split('/')[-1].split('.')[0]+'_out.pdb') + return fpocket_dir, fpocket_pdb_file_path + +def get_centers_step(fpocket_dir): + print("Starting getting centers...") + get_centers(fpocket_dir) + print("Finished getting centers!") + +def types_and_gninatyper_step(fpocket_pdb_file_path): + print("Starting typing and gninatyping...") + gninatype(fpocket_pdb_file_path) + gninatypes_file_path = fpocket_pdb_file_path.replace('.pdb', '.gninatypes') + print("Finished typing and gninatyping!") + return gninatypes_file_path + +def main(): + options = initialize_parameters() + output_types_filepath = os.path.join(options.output_path, options.output_types_filename) + output_types_file = open(output_types_filepath, 'w') + + output_recmolcache_path = os.path.join(options.output_path, options.output_recmolcache) + + gninatypes_file_path_list = [] + for input_file in os.scandir(options.input_path): + if (input_file.is_file() == False) or (str(input_file.name).endswith('.pdb') == False): + continue + print(input_file.path) + ligand_file = str(input_file.path).replace('.pdb', "_ligand.sdf") + if os.path.isfile(ligand_file) == False: + print("There is no ligand file for pdb file named {} with {}. Skip run".format(input_file.name, ligand_file)) + continue + cleaned_pdb_file_name, cleaned_pdb_file_path = clean_pdb_step(input_file, options.output_path) + fpocket_dir, fpocket_pdb_file_path = fpocket_step(cleaned_pdb_file_path) + get_centers_step(fpocket_dir) + bary_centers_file = os.path.join(fpocket_dir, 'bary_centers.txt') + gninatypes_file_path = types_and_gninatyper_step(fpocket_pdb_file_path) + gninatypes_file_path_list.append(gninatypes_file_path) + print("Starting typing from file...") + types_from_file(ligand_file, bary_centers_file, output_types_file) + print("Finished typing from file!") + output_types_file.close() + create_cache2(gninatypes_file_path_list, '', output_recmolcache_path) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/prepare_data_pipeline_helper.py b/prepare_data_pipeline_helper.py new file mode 100644 index 00000000..c6656e07 --- /dev/null +++ b/prepare_data_pipeline_helper.py @@ -0,0 +1,17 @@ +import argparse + + +def initialize_parameters(): + parser = argparse.ArgumentParser() + parser.add_argument('--input_path', type=str, + default='pdb_files/') + parser.add_argument('--output_path', type=str, + default='prepared_pdb_files/') + parser.add_argument('--output_types_filename', type=str, + default='custome_train.types') + parser.add_argument('--output_recmolcache', type=str, + default='rec.molcache2', + help='Filename of receptor cache') + + options = parser.parse_args() + return options \ No newline at end of file