devalab · mainguyenanhvu · Apr 21, 2023 · Apr 21, 2023 · Apr 27, 2023 · Apr 27, 2023
diff --git a/README.md b/README.md
@@ -12,6 +12,19 @@ If you want to use this project for development, we recommend going through [lib
 
 To reproduce the substructure benchmark [Prody](https://prody.csb.pitt.edu/) and [Rdkit](https://www.rdkit.org/) are also required.
 
+### Installation Guide
+#### Using with conda environment
+We can easily create a new enviroment which contains necessary dependencies following the command below:
+```
+conda env create -f environment.yml
+```
+
+or using:
+```
+chmod +x install.sh
+bash install.sh
+```
+
 ## Dataset Preprocessing
 
 PDB files are first parsed to remove hetero atoms, then converted to "gninatypes" files and finally collected into a "molcache2" file for quicker input and model training with libmolgrid. "gninatypes" and "molcache2" files are binary files that store an efficient representation of the input protein to be used for gridding the molecule. They are prepared for faster input with libmolgrid for quicker training of the CNN models.
@@ -71,9 +84,17 @@ I have written down steps below that pertain to preparing training data for a da
 Steps for preparing training data:
 
 1) remove hetero atoms (clean_pdb.py)
+- Example: `python clean_pdb.py 8BRA.pdb 8BRA_protein.pdb`
 2) run fpocket through structures (fpocket -f *_protein.pdb)
+- Example: `fpocket -f 8BRA_protein.pdb`. It outputs a folder named "8BRA_protein_out" including: 
+```
+8BRA_protein.pml  8BRA_protein_PYMOL.sh  8BRA_protein_info.txt  8BRA_protein_pockets.pqr
+8BRA_protein.tcl  8BRA_protein_VMD.sh    8BRA_protein_out.pdb   pockets
+```
 3) get candidate pocket centers for all structures (get_centers.py)
+- Example: `python get_centers.py 8BRA_protein_out/`. It outputs a file named "bary_centers.txt" in the folder.
 4) create .gninatypes files for all structure (gninatype() in types_and_gninatyper.py)
+- Example: `python types_and_gninatyper.py 8BRA_protein_out/8BRA_protein_out.pdb`. It outputs a file named "8BRA_protein_out.gninatypes".
 5) make train and test types (make_types.py)
 6) create molcache file for training (create_molcache2.py)
 

diff --git a/clean_pdb.py b/clean_pdb.py
@@ -1,6 +1,7 @@
 '''
 Takes a PDB file and removes hetero atoms from its structure.
 First argument is path to original file, second argument is path to generated file
+Examples: python clean_pdb.py 8BRA.pdb 8BRA_cleaned.pdb
 '''
 from Bio.PDB import PDBParser, PDBIO, Select
 import Bio

diff --git a/create_molcache2.py b/create_molcache2.py
@@ -97,24 +97,27 @@ def create_cache2(molfiles, data_root, outfile):
     out.seek(0, os.SEEK_END)
     out.close()
 
-
-parser = argparse.ArgumentParser()
-parser.add_argument('-c', '--col', required=True, type=int, help='Column receptor starts on')
-parser.add_argument('--recmolcache', default='rec.molcache2', type=str, help='Filename of receptor cache')
-parser.add_argument('-d', '--data_root', type=str, required=False,
-                    help="Root folder for relative paths in train/test files", default='')
-parser.add_argument('fnames', nargs='+', type=str, help='types files to process')
-
-args = parser.parse_args()
-
-# load all file names into memory
-seenrec = set()
-for fname in args.fnames:
-    for line in open(fname):
-        vals = line.split()
-        rec = vals[args.col]
-
-        if rec not in seenrec:
-            seenrec.add(rec)
-
-create_cache2(sorted(list(seenrec)), args.data_root, args.recmolcache)
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-c', '--col', required=True, type=int, help='Column receptor starts on')
+    parser.add_argument('--recmolcache', default='rec.molcache2', type=str, help='Filename of receptor cache')
+    parser.add_argument('-d', '--data_root', type=str, required=False,
+                        help="Root folder for relative paths in train/test files", default='')
+    parser.add_argument('fnames', nargs='+', type=str, help='types files to process')
+
+    args = parser.parse_args()
+
+    # load all file names (.gninatypes) for each pdb file into memory
+    seenrec = set()
+    for fname in args.fnames:
+        for line in open(fname):
+            vals = line.split()
+            rec = vals[args.col] #get .gninatypes path in the 5th column of the .types file
+
+            if rec not in seenrec:
+                seenrec.add(rec)
+
+    create_cache2(sorted(list(seenrec)), args.data_root, args.recmolcache)
+
+if __name__ == "__main__":
+    main()
diff --git a/environment.yml b/environment.yml
@@ -0,0 +1,153 @@
+name: deeppocket
+channels:
+  - rdkit
+  - pytorch
+  - conda-forge
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=5.1=1_gnu
+  - biopython=1.79=py37h5e8e339_1
+  - blas=1.0=mkl
+  - bottleneck=1.3.5=py37h7deecbd_0
+  - brotlipy=0.7.0=py37h540881e_1004
+  - bzip2=1.0.8=h7b6447c_0
+  - c-ares=1.18.1=h7f98852_0
+  - ca-certificates=2023.01.10=h06a4308_0
+  - cairo=1.16.0=hb05425b_4
+  - certifi=2022.12.7=py37h06a4308_0
+  - cffi=1.15.0=py37h036bc23_0
+  - charset-normalizer=3.1.0=pyhd8ed1ab_0
+  - cryptography=37.0.2=py37h38fbfac_0
+  - curl=7.79.1=h2574ce0_1
+  - cycler=0.11.0=pyhd8ed1ab_0
+  - expat=2.4.9=h6a678d5_0
+  - ffmpeg=4.3=hf484d3e_0
+  - fftw=3.3.10=nompi_h77c792f_102
+  - flit-core=3.6.0=pyhd3eb1b0_0
+  - fontconfig=2.14.1=h4c34cd2_2
+  - fpocket=4.0.2=hbb826e7_0
+  - freetype=2.12.1=h4a9f257_0
+  - giflib=5.2.1=h5eee18b_3
+  - glib=2.69.1=he621ea3_2
+  - gmp=6.2.1=h295c915_3
+  - gnutls=3.6.15=he1e5248_0
+  - hdf4=4.2.15=h10796ff_3
+  - hdf5=1.12.1=nompi_h2750804_100
+  - icu=58.2=he6710b0_3
+  - idna=3.4=pyhd8ed1ab_0
+  - intel-openmp=2021.4.0=h06a4308_3561
+  - jpeg=9e=h5eee18b_1
+  - keyutils=1.6.1=h166bdaf_0
+  - kiwisolver=1.3.2=py37h2527ec5_1
+  - krb5=1.19.3=h3790be6_0
+  - lame=3.100=h7b6447c_0
+  - lcms2=2.12=h3be6417_0
+  - ld_impl_linux-64=2.38=h1181459_1
+  - lerc=3.0=h295c915_0
+  - libboost=1.73.0=h28710b8_12
+  - libcurl=7.79.1=h2574ce0_1
+  - libdeflate=1.17=h5eee18b_0
+  - libedit=3.1.20191231=he28a2e2_2
+  - libev=4.33=h516909a_1
+  - libffi=3.4.2=h6a678d5_6
+  - libgcc-ng=11.2.0=h1234567_1
+  - libgfortran-ng=12.2.0=h69a702a_19
+  - libgfortran5=12.2.0=h337968e_19
+  - libgomp=11.2.0=h1234567_1
+  - libiconv=1.16=h7f8727e_2
+  - libidn2=2.3.2=h7f8727e_0
+  - libnetcdf=4.8.1=nompi_hb3fd0d9_101
+  - libnghttp2=1.43.0=h812cca2_1
+  - libpng=1.6.39=h5eee18b_0
+  - libssh2=1.10.0=ha56f1ee_2
+  - libstdcxx-ng=11.2.0=h1234567_1
+  - libtasn1=4.19.0=h5eee18b_0
+  - libtiff=4.5.0=h6a678d5_2
+  - libunistring=0.9.10=h27cfd23_0
+  - libuuid=1.41.5=h5eee18b_0
+  - libwebp=1.2.4=h11a3e52_1
+  - libwebp-base=1.2.4=h5eee18b_1
+  - libxcb=1.15=h7f8727e_0
+  - libxml2=2.10.3=hcbfbd50_0
+  - libzip=1.8.0=h4de3113_1
+  - lz4-c=1.9.4=h6a678d5_0
+  - matplotlib-base=3.4.3=py37h1058ff1_2
+  - mkl=2021.4.0=h06a4308_640
+  - mkl-service=2.4.0=py37h7f8727e_0
+  - mkl_fft=1.3.1=py37hd3c417c_0
+  - mkl_random=1.2.2=py37h51133e4_0
+  - ncurses=6.4=h6a678d5_0
+  - nettle=3.7.3=hbbd107a_1
+  - numexpr=2.8.4=py37he184ba9_0
+  - numpy=1.21.5=py37h6c91a56_3
+  - numpy-base=1.21.5=py37ha15fc14_3
+  - openh264=2.1.1=h4ff587b_0
+  - openssl=1.1.1t=h7f8727e_0
+  - packaging=22.0=py37h06a4308_0
+  - pandas=1.3.5=py37h8c16a72_0
+  - pcre=8.45=h295c915_0
+  - pillow=9.4.0=py37h6a678d5_0
+  - pip=22.3.1=py37h06a4308_0
+  - pixman=0.40.0=h7f8727e_1
+  - prody=2.1.0=py37hd23a5d3_0
+  - py-boost=1.73.0=py37h51133e4_12
+  - pycparser=2.21=pyhd8ed1ab_0
+  - pyopenssl=22.0.0=pyhd8ed1ab_1
+  - pyparsing=3.0.9=pyhd8ed1ab_0
+  - pysocks=1.7.1=py37h89c1867_5
+  - python=3.7.16=h7a1cb2a_0
+  - python-dateutil=2.8.2=pyhd3eb1b0_0
+  - python_abi=3.7=2_cp37m
+  - pytorch=1.13.1=py3.7_cpu_0
+  - pytorch-mutex=1.0=cpu
+  - pytz=2022.7=py37h06a4308_0
+  - rdkit=2020.09.1.0=py37hd50e099_1
+  - readline=8.2=h5eee18b_0
+  - requests=2.29.0=pyhd8ed1ab_0
+  - scipy=1.7.3=py37h6c91a56_2
+  - setuptools=65.6.3=py37h06a4308_0
+  - six=1.16.0=pyhd3eb1b0_1
+  - sqlite=3.41.2=h5eee18b_0
+  - tk=8.6.12=h1ccaba5_0
+  - torchaudio=0.13.1=py37_cpu
+  - torchvision=0.14.1=py37_cpu
+  - tornado=6.1=py37h540881e_3
+  - typing_extensions=4.4.0=py37h06a4308_0
+  - urllib3=1.26.15=pyhd8ed1ab_0
+  - wheel=0.38.4=py37h06a4308_0
+  - xz=5.2.10=h5eee18b_1
+  - zlib=1.2.13=h5eee18b_0
+  - zstd=1.5.5=hc292b87_0
+  - pip:
+      - appdirs==1.4.4
+      - click==8.1.3
+      - docker-pycreds==0.4.0
+      - exceptiongroup==1.1.1
+      - gitdb==4.0.10
+      - gitpython==3.1.31
+      - imageio==2.28.0
+      - importlib-metadata==6.6.0
+      - iniconfig==2.0.0
+      - joblib==1.2.0
+      - molgrid==0.5.3
+      - networkx==2.6.3
+      - pathtools==0.1.2
+      - pluggy==1.0.0
+      - protobuf==4.22.3
+      - psutil==5.9.5
+      - pyquaternion==0.9.9
+      - pytest==7.3.1
+      - pywavelets==1.3.0
+      - pyyaml==6.0
+      - scikit-image==0.19.3
+      - scikit-learn==1.0.2
+      - sentry-sdk==1.21.0
+      - setproctitle==1.3.2
+      - smmap==5.0.0
+      - threadpoolctl==3.1.0
+      - tifffile==2021.11.2
+      - tomli==2.0.1
+      - wandb==0.15.0
+      - zipp==3.15.0
+prefix: /home/ubuntu/.conda/envs/deeppocket
diff --git a/get_centers.py b/get_centers.py
@@ -1,5 +1,6 @@
 '''
 Takes the *_out/pockets directory from fpocket as input and outputs a file containining candidate pocket centers in that directory
+Example: python get_centers.py 8BRA_protein_out/
 '''
 import os
 import numpy as np

diff --git a/install.sh b/install.sh
@@ -0,0 +1,10 @@
+conda create -c conda-forge -n deeppocket rdkit python=3.7
+conda activate deeppocket
+conda install pytorch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 -c pytorch -y
+conda install -c conda-forge fpocket -y
+conda install -c conda-forge prody -y
+pip install molgrid
+conda install -c conda-forge biopython -y
+pip install scikit-learn
+pip install wandb
+pip install scikit-image
diff --git a/make_types.py b/make_types.py
@@ -19,7 +19,7 @@ def types_from_file(f,holo_f):
         c=mol.GetConformer()
         atom_np=c.GetPositions()
         atom_nps.append(atom_np)
-        centers = np.loadtxt(path + "/" + prot +"/"+prot+ "_protein_nowat_out/pockets/" + "bary" + "_centers.txt")
+        centers = np.loadtxt(path + "/" + prot +"/"+prot+ "_protein_nowat_out/pockets/bary_centers.txt")
         if centers.shape[0]==4 and len(centers.shape)==1:
             centers=np.expand_dims(centers,axis=0)
         limit = centers.shape[0]

diff --git a/make_types_singly.py b/make_types_singly.py
@@ -0,0 +1,38 @@
+import os
+import sys
+import numpy as np
+import time
+import rdkit
+from rdkit.Chem import AllChem as Chem
+from rdkit.Chem import AllChem
+
+
+def types_from_file(ligand_file, bary_centers_file, output_types_file):
+    distance = 4
+    atom_nps=[]
+    mol = Chem.MolFromMolFile(ligand_file,sanitize=False)
+    c=mol.GetConformer()
+    atom_np=c.GetPositions()
+    atom_nps.append(atom_np)
+    centers = np.loadtxt(bary_centers_file)
+    if centers.shape[0]==4 and len(centers.shape)==1:
+        centers=np.expand_dims(centers,axis=0)
+    limit = centers.shape[0]
+    if not (centers.shape[0]==0 and len(centers.shape)==1):
+        sorted_centers = centers[:int(limit), 1:]
+        for i in range(int(limit)):
+            label=0
+            for atom_np in atom_nps:
+                dist = np.linalg.norm((atom_np - sorted_centers[i,:]), axis=1)
+                rel_centers = np.where(dist <= float(distance), 1, 0)
+                if (np.count_nonzero(rel_centers) > 0):
+                    label =1
+                else:
+                    label =0
+                output_types_file.write(str(label)+ ' '+ str(sorted_centers[i][0])+ ' '+ str(sorted_centers[i][1])+ ' '+ str(sorted_centers[i][2])+ ' '+prot+'/'+prot+'_protein_nowat.gninatypes\n')
+
+def main():
+    output_types_file = open('pdbbind_train.types','w')
+
+if __name__ == "__main__":
+    main()
diff --git a/predict.py b/predict.py
@@ -96,7 +96,7 @@ def get_model_gmaker_eprovider(test_types,batch_size,model,checkpoint,dims=None)
     zipped_lists = zip(class_probs[:len(types_lines)], types_lines)
     sorted_zipped_lists = sorted(zipped_lists,reverse=True)
     ranked_types = [element for _, element in sorted_zipped_lists]
-    confidence_types = [element for element, _ in sorted_zipped_lists]
+    confidence_types = [str(element) for element, _ in sorted_zipped_lists]
     seg_types= class_types.replace('.types','_ranked.types')
     probs_types_file=class_types.replace('.types','_confidence.txt')
     fout=open(seg_types,'w')

diff --git a/prepare_data_pipeline.py b/prepare_data_pipeline.py
@@ -0,0 +1,68 @@
+import os
+from clean_pdb import clean_pdb
+from create_molcache2 import create_cache2
+from get_centers import get_centers
+from make_types_singly import types_from_file
+from prepare_data_pipeline_helper import initialize_parameters
+from types_and_gninatyper import gninatype
+
+EXTENSION_CLEAN_STEP = '_protein'
+
+def clean_pdb_step(input_file, output_path):
+    print("Starting cleaning pdb...")
+    input_file_path = input_file.path
+    cleaned_pdb_file_name = str(input_file.name).replace('.pdb', EXTENSION_CLEAN_STEP +".pdb")
+    cleaned_pdb_file_path = os.path.join(output_path, cleaned_pdb_file_name)
+    clean_pdb(input_file_path,cleaned_pdb_file_path)
+    print("Finished cleaning pdb!")
+    return cleaned_pdb_file_name, cleaned_pdb_file_path
+
+def fpocket_step(cleaned_pdb_file_path):
+    os.system("fpocket -f {}".format(cleaned_pdb_file_path))
+    fpocket_dir = cleaned_pdb_file_path.replace('.pdb', '_out')
+    fpocket_pdb_file_path = os.path.join(fpocket_dir, cleaned_pdb_file_path.split('/')[-1].split('.')[0]+'_out.pdb')
+    return fpocket_dir, fpocket_pdb_file_path
+
+def get_centers_step(fpocket_dir):
+    print("Starting getting centers...")
+    get_centers(fpocket_dir)
+    print("Finished getting centers!")
+
+def types_and_gninatyper_step(fpocket_pdb_file_path):
+    print("Starting typing and gninatyping...")
+    gninatype(fpocket_pdb_file_path)
+    gninatypes_file_path = fpocket_pdb_file_path.replace('.pdb', '.gninatypes')
+    print("Finished typing and gninatyping!")
+    return gninatypes_file_path
+
+def main():
+    options = initialize_parameters()
+    output_types_filepath = os.path.join(options.output_path, options.output_types_filename)
+    output_types_file = open(output_types_filepath, 'w')
+
+    output_recmolcache_path = os.path.join(options.output_path, options.output_recmolcache)
+
+    gninatypes_file_path_list = []
+    for input_file in os.scandir(options.input_path):
+        if (input_file.is_file() == False) or (str(input_file.name).endswith('.pdb') == False):
+            continue
+        print(input_file.path)
+        ligand_file = str(input_file.path).replace('.pdb', "_ligand.sdf")
+        if os.path.isfile(ligand_file) == False:
+            print("There is no ligand file for pdb file named {} with {}. Skip run".format(input_file.name, ligand_file))
+            continue
+        cleaned_pdb_file_name, cleaned_pdb_file_path = clean_pdb_step(input_file, options.output_path)
+        fpocket_dir, fpocket_pdb_file_path = fpocket_step(cleaned_pdb_file_path)
+        get_centers_step(fpocket_dir)
+        bary_centers_file = os.path.join(fpocket_dir, 'bary_centers.txt')
+        gninatypes_file_path = types_and_gninatyper_step(fpocket_pdb_file_path)
+        gninatypes_file_path_list.append(gninatypes_file_path)
+        print("Starting typing from file...")
+        types_from_file(ligand_file, bary_centers_file, output_types_file)
+        print("Finished typing from file!")
+    output_types_file.close()
+    create_cache2(gninatypes_file_path_list, '', output_recmolcache_path)
+
+
+if __name__ == "__main__":
+    main()