Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add conda environment file to fasten the installation process. #25

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
21 changes: 21 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,19 @@ If you want to use this project for development, we recommend going through [lib

To reproduce the substructure benchmark [Prody](https://prody.csb.pitt.edu/) and [Rdkit](https://www.rdkit.org/) are also required.

### Installation Guide
#### Using with conda environment
We can easily create a new enviroment which contains necessary dependencies following the command below:
```
conda env create -f environment.yml
```

or using:
```
chmod +x install.sh
bash install.sh
```

## Dataset Preprocessing

PDB files are first parsed to remove hetero atoms, then converted to "gninatypes" files and finally collected into a "molcache2" file for quicker input and model training with libmolgrid. "gninatypes" and "molcache2" files are binary files that store an efficient representation of the input protein to be used for gridding the molecule. They are prepared for faster input with libmolgrid for quicker training of the CNN models.
Expand Down Expand Up @@ -71,9 +84,17 @@ I have written down steps below that pertain to preparing training data for a da
Steps for preparing training data:

1) remove hetero atoms (clean_pdb.py)
- Example: `python clean_pdb.py 8BRA.pdb 8BRA_protein.pdb`
2) run fpocket through structures (fpocket -f *_protein.pdb)
- Example: `fpocket -f 8BRA_protein.pdb`. It outputs a folder named "8BRA_protein_out" including:
```
8BRA_protein.pml 8BRA_protein_PYMOL.sh 8BRA_protein_info.txt 8BRA_protein_pockets.pqr
8BRA_protein.tcl 8BRA_protein_VMD.sh 8BRA_protein_out.pdb pockets
```
3) get candidate pocket centers for all structures (get_centers.py)
- Example: `python get_centers.py 8BRA_protein_out/`. It outputs a file named "bary_centers.txt" in the folder.
4) create .gninatypes files for all structure (gninatype() in types_and_gninatyper.py)
- Example: `python types_and_gninatyper.py 8BRA_protein_out/8BRA_protein_out.pdb`. It outputs a file named "8BRA_protein_out.gninatypes".
5) make train and test types (make_types.py)
6) create molcache file for training (create_molcache2.py)

Expand Down
1 change: 1 addition & 0 deletions clean_pdb.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
'''
Takes a PDB file and removes hetero atoms from its structure.
First argument is path to original file, second argument is path to generated file
Examples: python clean_pdb.py 8BRA.pdb 8BRA_cleaned.pdb
'''
from Bio.PDB import PDBParser, PDBIO, Select
import Bio
Expand Down
45 changes: 24 additions & 21 deletions create_molcache2.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,24 +97,27 @@ def create_cache2(molfiles, data_root, outfile):
out.seek(0, os.SEEK_END)
out.close()


parser = argparse.ArgumentParser()
parser.add_argument('-c', '--col', required=True, type=int, help='Column receptor starts on')
parser.add_argument('--recmolcache', default='rec.molcache2', type=str, help='Filename of receptor cache')
parser.add_argument('-d', '--data_root', type=str, required=False,
help="Root folder for relative paths in train/test files", default='')
parser.add_argument('fnames', nargs='+', type=str, help='types files to process')

args = parser.parse_args()

# load all file names into memory
seenrec = set()
for fname in args.fnames:
for line in open(fname):
vals = line.split()
rec = vals[args.col]

if rec not in seenrec:
seenrec.add(rec)

create_cache2(sorted(list(seenrec)), args.data_root, args.recmolcache)
def main():
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--col', required=True, type=int, help='Column receptor starts on')
parser.add_argument('--recmolcache', default='rec.molcache2', type=str, help='Filename of receptor cache')
parser.add_argument('-d', '--data_root', type=str, required=False,
help="Root folder for relative paths in train/test files", default='')
parser.add_argument('fnames', nargs='+', type=str, help='types files to process')

args = parser.parse_args()

# load all file names (.gninatypes) for each pdb file into memory
seenrec = set()
for fname in args.fnames:
for line in open(fname):
vals = line.split()
rec = vals[args.col] #get .gninatypes path in the 5th column of the .types file

if rec not in seenrec:
seenrec.add(rec)

create_cache2(sorted(list(seenrec)), args.data_root, args.recmolcache)

if __name__ == "__main__":
main()
153 changes: 153 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
name: deeppocket
channels:
- rdkit
- pytorch
- conda-forge
- defaults
dependencies:
- _libgcc_mutex=0.1=main
- _openmp_mutex=5.1=1_gnu
- biopython=1.79=py37h5e8e339_1
- blas=1.0=mkl
- bottleneck=1.3.5=py37h7deecbd_0
- brotlipy=0.7.0=py37h540881e_1004
- bzip2=1.0.8=h7b6447c_0
- c-ares=1.18.1=h7f98852_0
- ca-certificates=2023.01.10=h06a4308_0
- cairo=1.16.0=hb05425b_4
- certifi=2022.12.7=py37h06a4308_0
- cffi=1.15.0=py37h036bc23_0
- charset-normalizer=3.1.0=pyhd8ed1ab_0
- cryptography=37.0.2=py37h38fbfac_0
- curl=7.79.1=h2574ce0_1
- cycler=0.11.0=pyhd8ed1ab_0
- expat=2.4.9=h6a678d5_0
- ffmpeg=4.3=hf484d3e_0
- fftw=3.3.10=nompi_h77c792f_102
- flit-core=3.6.0=pyhd3eb1b0_0
- fontconfig=2.14.1=h4c34cd2_2
- fpocket=4.0.2=hbb826e7_0
- freetype=2.12.1=h4a9f257_0
- giflib=5.2.1=h5eee18b_3
- glib=2.69.1=he621ea3_2
- gmp=6.2.1=h295c915_3
- gnutls=3.6.15=he1e5248_0
- hdf4=4.2.15=h10796ff_3
- hdf5=1.12.1=nompi_h2750804_100
- icu=58.2=he6710b0_3
- idna=3.4=pyhd8ed1ab_0
- intel-openmp=2021.4.0=h06a4308_3561
- jpeg=9e=h5eee18b_1
- keyutils=1.6.1=h166bdaf_0
- kiwisolver=1.3.2=py37h2527ec5_1
- krb5=1.19.3=h3790be6_0
- lame=3.100=h7b6447c_0
- lcms2=2.12=h3be6417_0
- ld_impl_linux-64=2.38=h1181459_1
- lerc=3.0=h295c915_0
- libboost=1.73.0=h28710b8_12
- libcurl=7.79.1=h2574ce0_1
- libdeflate=1.17=h5eee18b_0
- libedit=3.1.20191231=he28a2e2_2
- libev=4.33=h516909a_1
- libffi=3.4.2=h6a678d5_6
- libgcc-ng=11.2.0=h1234567_1
- libgfortran-ng=12.2.0=h69a702a_19
- libgfortran5=12.2.0=h337968e_19
- libgomp=11.2.0=h1234567_1
- libiconv=1.16=h7f8727e_2
- libidn2=2.3.2=h7f8727e_0
- libnetcdf=4.8.1=nompi_hb3fd0d9_101
- libnghttp2=1.43.0=h812cca2_1
- libpng=1.6.39=h5eee18b_0
- libssh2=1.10.0=ha56f1ee_2
- libstdcxx-ng=11.2.0=h1234567_1
- libtasn1=4.19.0=h5eee18b_0
- libtiff=4.5.0=h6a678d5_2
- libunistring=0.9.10=h27cfd23_0
- libuuid=1.41.5=h5eee18b_0
- libwebp=1.2.4=h11a3e52_1
- libwebp-base=1.2.4=h5eee18b_1
- libxcb=1.15=h7f8727e_0
- libxml2=2.10.3=hcbfbd50_0
- libzip=1.8.0=h4de3113_1
- lz4-c=1.9.4=h6a678d5_0
- matplotlib-base=3.4.3=py37h1058ff1_2
- mkl=2021.4.0=h06a4308_640
- mkl-service=2.4.0=py37h7f8727e_0
- mkl_fft=1.3.1=py37hd3c417c_0
- mkl_random=1.2.2=py37h51133e4_0
- ncurses=6.4=h6a678d5_0
- nettle=3.7.3=hbbd107a_1
- numexpr=2.8.4=py37he184ba9_0
- numpy=1.21.5=py37h6c91a56_3
- numpy-base=1.21.5=py37ha15fc14_3
- openh264=2.1.1=h4ff587b_0
- openssl=1.1.1t=h7f8727e_0
- packaging=22.0=py37h06a4308_0
- pandas=1.3.5=py37h8c16a72_0
- pcre=8.45=h295c915_0
- pillow=9.4.0=py37h6a678d5_0
- pip=22.3.1=py37h06a4308_0
- pixman=0.40.0=h7f8727e_1
- prody=2.1.0=py37hd23a5d3_0
- py-boost=1.73.0=py37h51133e4_12
- pycparser=2.21=pyhd8ed1ab_0
- pyopenssl=22.0.0=pyhd8ed1ab_1
- pyparsing=3.0.9=pyhd8ed1ab_0
- pysocks=1.7.1=py37h89c1867_5
- python=3.7.16=h7a1cb2a_0
- python-dateutil=2.8.2=pyhd3eb1b0_0
- python_abi=3.7=2_cp37m
- pytorch=1.13.1=py3.7_cpu_0
- pytorch-mutex=1.0=cpu
- pytz=2022.7=py37h06a4308_0
- rdkit=2020.09.1.0=py37hd50e099_1
- readline=8.2=h5eee18b_0
- requests=2.29.0=pyhd8ed1ab_0
- scipy=1.7.3=py37h6c91a56_2
- setuptools=65.6.3=py37h06a4308_0
- six=1.16.0=pyhd3eb1b0_1
- sqlite=3.41.2=h5eee18b_0
- tk=8.6.12=h1ccaba5_0
- torchaudio=0.13.1=py37_cpu
- torchvision=0.14.1=py37_cpu
- tornado=6.1=py37h540881e_3
- typing_extensions=4.4.0=py37h06a4308_0
- urllib3=1.26.15=pyhd8ed1ab_0
- wheel=0.38.4=py37h06a4308_0
- xz=5.2.10=h5eee18b_1
- zlib=1.2.13=h5eee18b_0
- zstd=1.5.5=hc292b87_0
- pip:
- appdirs==1.4.4
- click==8.1.3
- docker-pycreds==0.4.0
- exceptiongroup==1.1.1
- gitdb==4.0.10
- gitpython==3.1.31
- imageio==2.28.0
- importlib-metadata==6.6.0
- iniconfig==2.0.0
- joblib==1.2.0
- molgrid==0.5.3
- networkx==2.6.3
- pathtools==0.1.2
- pluggy==1.0.0
- protobuf==4.22.3
- psutil==5.9.5
- pyquaternion==0.9.9
- pytest==7.3.1
- pywavelets==1.3.0
- pyyaml==6.0
- scikit-image==0.19.3
- scikit-learn==1.0.2
- sentry-sdk==1.21.0
- setproctitle==1.3.2
- smmap==5.0.0
- threadpoolctl==3.1.0
- tifffile==2021.11.2
- tomli==2.0.1
- wandb==0.15.0
- zipp==3.15.0
prefix: /home/ubuntu/.conda/envs/deeppocket
1 change: 1 addition & 0 deletions get_centers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
'''
Takes the *_out/pockets directory from fpocket as input and outputs a file containining candidate pocket centers in that directory
Example: python get_centers.py 8BRA_protein_out/
'''
import os
import numpy as np
Expand Down
10 changes: 10 additions & 0 deletions install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
conda create -c conda-forge -n deeppocket rdkit python=3.7
conda activate deeppocket
conda install pytorch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 -c pytorch -y
conda install -c conda-forge fpocket -y
conda install -c conda-forge prody -y
pip install molgrid
conda install -c conda-forge biopython -y
pip install scikit-learn
pip install wandb
pip install scikit-image
2 changes: 1 addition & 1 deletion make_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def types_from_file(f,holo_f):
c=mol.GetConformer()
atom_np=c.GetPositions()
atom_nps.append(atom_np)
centers = np.loadtxt(path + "/" + prot +"/"+prot+ "_protein_nowat_out/pockets/" + "bary" + "_centers.txt")
centers = np.loadtxt(path + "/" + prot +"/"+prot+ "_protein_nowat_out/pockets/bary_centers.txt")
if centers.shape[0]==4 and len(centers.shape)==1:
centers=np.expand_dims(centers,axis=0)
limit = centers.shape[0]
Expand Down
38 changes: 38 additions & 0 deletions make_types_singly.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import os
import sys
import numpy as np
import time
import rdkit
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import AllChem


def types_from_file(ligand_file, bary_centers_file, output_types_file):
distance = 4
atom_nps=[]
mol = Chem.MolFromMolFile(ligand_file,sanitize=False)
c=mol.GetConformer()
atom_np=c.GetPositions()
atom_nps.append(atom_np)
centers = np.loadtxt(bary_centers_file)
if centers.shape[0]==4 and len(centers.shape)==1:
centers=np.expand_dims(centers,axis=0)
limit = centers.shape[0]
if not (centers.shape[0]==0 and len(centers.shape)==1):
sorted_centers = centers[:int(limit), 1:]
for i in range(int(limit)):
label=0
for atom_np in atom_nps:
dist = np.linalg.norm((atom_np - sorted_centers[i,:]), axis=1)
rel_centers = np.where(dist <= float(distance), 1, 0)
if (np.count_nonzero(rel_centers) > 0):
label =1
else:
label =0
output_types_file.write(str(label)+ ' '+ str(sorted_centers[i][0])+ ' '+ str(sorted_centers[i][1])+ ' '+ str(sorted_centers[i][2])+ ' '+prot+'/'+prot+'_protein_nowat.gninatypes\n')

def main():
output_types_file = open('pdbbind_train.types','w')

if __name__ == "__main__":
main()
2 changes: 1 addition & 1 deletion predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def get_model_gmaker_eprovider(test_types,batch_size,model,checkpoint,dims=None)
zipped_lists = zip(class_probs[:len(types_lines)], types_lines)
sorted_zipped_lists = sorted(zipped_lists,reverse=True)
ranked_types = [element for _, element in sorted_zipped_lists]
confidence_types = [element for element, _ in sorted_zipped_lists]
confidence_types = [str(element) for element, _ in sorted_zipped_lists]
seg_types= class_types.replace('.types','_ranked.types')
probs_types_file=class_types.replace('.types','_confidence.txt')
fout=open(seg_types,'w')
Expand Down
68 changes: 68 additions & 0 deletions prepare_data_pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import os
from clean_pdb import clean_pdb
from create_molcache2 import create_cache2
from get_centers import get_centers
from make_types_singly import types_from_file
from prepare_data_pipeline_helper import initialize_parameters
from types_and_gninatyper import gninatype

EXTENSION_CLEAN_STEP = '_protein'

def clean_pdb_step(input_file, output_path):
print("Starting cleaning pdb...")
input_file_path = input_file.path
cleaned_pdb_file_name = str(input_file.name).replace('.pdb', EXTENSION_CLEAN_STEP +".pdb")
cleaned_pdb_file_path = os.path.join(output_path, cleaned_pdb_file_name)
clean_pdb(input_file_path,cleaned_pdb_file_path)
print("Finished cleaning pdb!")
return cleaned_pdb_file_name, cleaned_pdb_file_path

def fpocket_step(cleaned_pdb_file_path):
os.system("fpocket -f {}".format(cleaned_pdb_file_path))
fpocket_dir = cleaned_pdb_file_path.replace('.pdb', '_out')
fpocket_pdb_file_path = os.path.join(fpocket_dir, cleaned_pdb_file_path.split('/')[-1].split('.')[0]+'_out.pdb')
return fpocket_dir, fpocket_pdb_file_path

def get_centers_step(fpocket_dir):
print("Starting getting centers...")
get_centers(fpocket_dir)
print("Finished getting centers!")

def types_and_gninatyper_step(fpocket_pdb_file_path):
print("Starting typing and gninatyping...")
gninatype(fpocket_pdb_file_path)
gninatypes_file_path = fpocket_pdb_file_path.replace('.pdb', '.gninatypes')
print("Finished typing and gninatyping!")
return gninatypes_file_path

def main():
options = initialize_parameters()
output_types_filepath = os.path.join(options.output_path, options.output_types_filename)
output_types_file = open(output_types_filepath, 'w')

output_recmolcache_path = os.path.join(options.output_path, options.output_recmolcache)

gninatypes_file_path_list = []
for input_file in os.scandir(options.input_path):
if (input_file.is_file() == False) or (str(input_file.name).endswith('.pdb') == False):
continue
print(input_file.path)
ligand_file = str(input_file.path).replace('.pdb', "_ligand.sdf")
if os.path.isfile(ligand_file) == False:
print("There is no ligand file for pdb file named {} with {}. Skip run".format(input_file.name, ligand_file))
continue
cleaned_pdb_file_name, cleaned_pdb_file_path = clean_pdb_step(input_file, options.output_path)
fpocket_dir, fpocket_pdb_file_path = fpocket_step(cleaned_pdb_file_path)
get_centers_step(fpocket_dir)
bary_centers_file = os.path.join(fpocket_dir, 'bary_centers.txt')
gninatypes_file_path = types_and_gninatyper_step(fpocket_pdb_file_path)
gninatypes_file_path_list.append(gninatypes_file_path)
print("Starting typing from file...")
types_from_file(ligand_file, bary_centers_file, output_types_file)
print("Finished typing from file!")
output_types_file.close()
create_cache2(gninatypes_file_path_list, '', output_recmolcache_path)


if __name__ == "__main__":
main()
Loading