from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import array
import numpy as np
import logging

logger = logging.getLogger(__file__)
logging.basicConfig(
    format="[%(asctime)s - %(filename)s:line %(lineno)s] %(message)s",
    datefmt='%d %b %H:%M:%S')
logger.setLevel(logging.INFO)
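
# Expected layout of a feature directory (inferred from the readers below):
#   shape.txt    - one line: "<nr_of_images> <ndims>"
#   id.txt       - one line with nr_of_images whitespace-separated image ids
#   feature.bin  - nr_of_images * ndims float32 values, stored row by row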

class BigFile:

    def __init__(self, datadir):
        self.nr_of_images, self.ndims = map(int, open(os.path.join(datadir, 'shape.txt')).readline().split())
        id_file = os.path.join(datadir, "id.txt")
        self.names = open(id_file).readline().strip().split()
        assert len(self.names) == self.nr_of_images
        self.name2index = dict(zip(self.names, range(self.nr_of_images)))
        self.binary_file = os.path.join(datadir, "feature.bin")
        logger.info("%dx%d instances loaded from %s", self.nr_of_images, self.ndims, datadir)

    def read(self, requested, isname=True):
        requested = set(requested)
        if isname:
            index_name_array = [(self.name2index[x], x) for x in requested if x in self.name2index]
        else:
            assert min(requested) >= 0
            assert max(requested) < len(self.names)
            index_name_array = [(x, self.names[x]) for x in requested]
        if len(index_name_array) == 0:
            return [], []

        index_name_array.sort(key=lambda v: v[0])
        sorted_index = [x[0] for x in index_name_array]
        nr_of_images = len(index_name_array)

        # byte size of one feature vector (ndims float32 values)
        offset = np.float32(1).nbytes * self.ndims

        res = array.array('f')
        fr = open(self.binary_file, 'rb')
        fr.seek(index_name_array[0][0] * offset)
        res.fromfile(fr, self.ndims)
        previous = index_name_array[0][0]

        for next_idx in sorted_index[1:]:
            # skip the vectors lying between the previous and the next requested index
            move = (next_idx - 1 - previous) * offset
            fr.seek(move, 1)
            res.fromfile(fr, self.ndims)
            previous = next_idx
        fr.close()

        return [x[1] for x in index_name_array], [res[i * self.ndims:(i + 1) * self.ndims].tolist() for i in range(nr_of_images)]

    def read_one(self, name):
        renamed, vectors = self.read([name])
        return vectors[0]

    def shape(self):
        return [self.nr_of_images, self.ndims]

class StreamFile:

    def __init__(self, datadir):
        self.feat_dir = datadir
        self.nr_of_images, self.ndims = map(int, open(os.path.join(datadir, 'shape.txt')).readline().split())
        id_file = os.path.join(datadir, "id.txt")
        self.names = open(id_file).readline().strip().split()
        assert len(self.names) == self.nr_of_images
        self.name2index = dict(zip(self.names, range(self.nr_of_images)))
        self.binary_file = os.path.join(datadir, "feature.bin")
        print("[%s] %dx%d instances loaded from %s" % (self.__class__.__name__, self.nr_of_images, self.ndims, datadir))
        self.fr = None
        self.current = 0

    def open(self):
        self.fr = open(self.binary_file, 'rb')
        self.current = 0

    def close(self):
        if self.fr:
            self.fr.close()
            self.fr = None

    def __iter__(self):
        return self

    def __next__(self):
        if self.current >= self.nr_of_images:
            self.close()
            raise StopIteration
        res = array.array('f')
        res.fromfile(self.fr, self.ndims)
        _id = self.names[self.current]
        self.current += 1
        return _id, res.tolist()

    # keep the Python 2 iterator protocol working as well
    next = __next__

if __name__ == '__main__':
    bigfile = BigFile('toydata/FeatureData/f1')

    imset = 'b z a a b c'.split()
    renamed, vectors = bigfile.read(imset)

    for name, vec in zip(renamed, vectors):
        print(name, vec)
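
    # A minimal sketch of sequentially iterating all stored vectors with
    # StreamFile; it assumes the same toydata/FeatureData/f1 directory used
    # above is present on disk.
    stream = StreamFile('toydata/FeatureData/f1')
    stream.open()
    for _id, vec in stream:
        print(_id, vec[:3])
    stream.close()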