create_molcache2.py
'''Takes a bunch of types training files. The --col argument gives the column index
where the receptor file name appears on each line.
Reads in the gninatypes files specified in these types files and writes out a monolithic receptor cache file.
Version 2 is optimized for memory-mapped storage of caches: molecular data (coordinates
and types) is stored densely, followed by the keys (file names) with offsets into that data.
Thanks to David Koes for the original script (https://github.com/gnina/scripts/blob/master/create_caches2.py)
'''
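# On-disk layout, as written by create_cache2 below (all fields are native-endian;
# 'L' is the platform's unsigned long, so caches are not portable across ABIs):
#   int32  -1                                   version marker
#   ulong  start                                offset of the key section, backfilled at the end
#   per molecule: int32 natoms, then natoms*16 bytes of atom data
#   key section (at offset start), per molecule:
#     uint8 len, len bytes of UTF-8 name, ulong offset of that molecule's data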
import os, sys
import struct, argparse, traceback
import multiprocessing
mols_to_read = multiprocessing.Queue()   # file names waiting to be read
mols_to_write = multiprocessing.Queue()  # (name, raw bytes) results waiting to be written
N = multiprocessing.cpu_count() * 2  # number of reader worker processes
def read_data(data_root):
    '''Reader worker: take gninatypes file names from mols_to_read and put their contents in mols_to_write.'''
    while True:
        sys.stdout.flush()
        mol = mols_to_read.get()
        if mol is None:
            break
        fname = mol
        if len(data_root):
            fname = data_root + '/' + mol
        try:
            with open(fname, 'rb') as gninatype:
                data = gninatype.read()
                assert len(data) % 16 == 0  # 16 bytes per atom: x,y,z floats and an int type
                if len(data) == 0:
                    print(fname, "EMPTY")
                else:
                    mols_to_write.put((mol, data))
        except Exception as e:
            print(fname)
            print(e)
    mols_to_write.put(None)  # sentinel: this reader is done
def fill_queue(molfiles):
    '''Filler process: feed mols_to_read, ending with one sentinel per reader.'''
    for mol in molfiles:
        mols_to_read.put(mol)
    for _ in range(N):
        mols_to_read.put(None)
def create_cache2(molfiles, data_root, outfile):
    '''Create an outfile molcache2 file from the list molfiles stored at data_root.'''
    out = open(outfile, 'wb')
    # first four bytes are a version marker
    out.write(struct.pack('i', -1))
    out.write(struct.pack('L', 0))  # placeholder for offset to keys
    filler = multiprocessing.Process(target=fill_queue, args=(molfiles,))
    filler.start()
    readers = multiprocessing.Pool(N)
    for _ in range(N):
        readers.apply_async(read_data, (data_root,))

    offsets = dict()  # indexed by mol, location of data
    # start writing molecular data
    endcnt = 0
    while True:
        moldata = mols_to_write.get()
        if moldata is None:  # one sentinel arrives per reader
            endcnt += 1
            if endcnt == N:
                break
            else:
                continue
        (mol, data) = moldata
        offsets[mol] = out.tell()
        natoms = len(data) // 16
        out.write(struct.pack('i', natoms))
        out.write(data)

    start = out.tell()  # where the names start
    for mol in molfiles:
        if len(mol) > 255:
            print("Skipping", mol, "since filename is too long")
            continue
        if mol not in offsets:
            print("SKIPPING", mol, "since failed to read it in")
            continue
        s = bytes(mol, encoding='UTF-8')
        out.write(struct.pack('B', len(s)))
        out.write(s)
        out.write(struct.pack('L', offsets[mol]))

    # backfill the key-section offset in the header
    out.seek(4)
    out.write(struct.pack('L', start))
    out.seek(0, os.SEEK_END)
    out.close()
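# A minimal read-back sketch (not part of the original script) to illustrate the
# layout documented above; it assumes the cache was written on a machine with the
# same native struct sizes, and returns a dict of name -> raw atom bytes.
def read_cache2(cachefile):
    mols = dict()
    longsize = struct.calcsize('L')
    with open(cachefile, 'rb') as f:
        assert struct.unpack('i', f.read(4))[0] == -1  # version marker
        start = struct.unpack('L', f.read(longsize))[0]
        f.seek(start)
        keys = []
        while True:
            lenbyte = f.read(1)
            if not lenbyte:  # key section runs to end of file
                break
            n = struct.unpack('B', lenbyte)[0]
            name = f.read(n).decode('UTF-8')
            offset = struct.unpack('L', f.read(longsize))[0]
            keys.append((name, offset))
        for name, offset in keys:
            f.seek(offset)
            natoms = struct.unpack('i', f.read(4))[0]
            mols[name] = f.read(natoms * 16)
    return mols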
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--col', required=True, type=int, help='Column receptor starts on')
parser.add_argument('--recmolcache', default='rec.molcache2', type=str, help='Filename of receptor cache')
parser.add_argument('-d', '--data_root', type=str, required=False,
                    help="Root folder for relative paths in train/test files", default='')
parser.add_argument('fnames', nargs='+', type=str, help='types files to process')
args = parser.parse_args()

# collect the unique receptor names from all types files
seenrec = set()
for fname in args.fnames:
    for line in open(fname):
        vals = line.split()
        rec = vals[args.col]
        if rec not in seenrec:
            seenrec.add(rec)

create_cache2(sorted(seenrec), args.data_root, args.recmolcache)
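# Example invocation (illustrative; the column index and file names are hypothetical):
#   python create_molcache2.py -c 2 -d /path/to/data --recmolcache rec.molcache2 train.types test.types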