-
Notifications
You must be signed in to change notification settings - Fork 2
/
reader.py
54 lines (47 loc) · 1.45 KB
/
reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
"""
File Reader API for external clustering benchmark.
Reads the zlib-compressed binary files and converts them into data and label NumPy arrays.
"""
import numpy as np
import zlib
import json
import os
def read_dataset(name):
    """
    Load the dataset called ``name`` from the ``./compressed/`` directory.

    Reads ``./compressed/<name>/data.bin`` and ``./compressed/<name>/label.bin``,
    zlib-decompresses each one, parses the UTF-8 JSON payload and wraps the
    result in a NumPy array.

    Returns a ``(data, labels)`` tuple of NumPy arrays.
    """
    base = "./compressed/" + name + "/"

    # Read both raw payloads first, data before labels, so any I/O error
    # surfaces before decoding starts.
    blobs = []
    for filename in ("data.bin", "label.bin"):
        with open(base + filename, 'rb') as handle:
            blobs.append(handle.read())

    # Decode in the same order: decompress -> UTF-8 text -> JSON -> array.
    data, labels = [
        np.array(json.loads(zlib.decompress(blob).decode('utf8')))
        for blob in blobs
    ]
    return data, labels
def read_dataset_by_path(path):
    """
    Load a dataset from the directory ``path``.

    The directory must contain ``data.bin`` and ``label.bin``; each file is
    zlib-decompressed, decoded as UTF-8, parsed as JSON and wrapped in a
    NumPy array.

    ``path`` may be given with or without a trailing separator, and may be a
    string or any path-like object.

    Returns a ``(data, labels)`` tuple of NumPy arrays.
    Raises ``OSError`` (e.g. ``FileNotFoundError``) if either file cannot be
    opened.
    """
    # os.path.join inserts the separator only when it is missing, so both
    # "dir" and "dir/" resolve to "dir/data.bin".
    path_data = os.path.join(path, "data.bin")
    path_labels = os.path.join(path, "label.bin")

    # Read both compressed payloads before decoding either of them.
    with open(path_data, 'rb') as f:
        data_comp = f.read()
    with open(path_labels, 'rb') as f:
        labels_comp = f.read()

    # Decompress -> UTF-8 text -> JSON -> NumPy array.
    data = np.array(json.loads(zlib.decompress(data_comp).decode('utf8')))
    labels = np.array(json.loads(zlib.decompress(labels_comp).decode('utf8')))
    return data, labels
def read_multiple_datasets(names):
    """
    Load every dataset listed in ``names`` via ``read_dataset``.

    Returns a ``(data, labels)`` tuple of dictionaries, each keyed by dataset
    name and holding whatever ``read_dataset`` returned for that name.
    """
    data_by_name, labels_by_name = {}, {}
    for dataset_name in names:
        loaded_data, loaded_labels = read_dataset(dataset_name)
        data_by_name[dataset_name] = loaded_data
        labels_by_name[dataset_name] = loaded_labels
    return data_by_name, labels_by_name
def read_all_datasets():
    """
    Load every dataset found under ``./compressed/``.

    Each sub-directory of ``./compressed/`` is treated as one dataset name.
    Non-directory entries (stray files such as ``.DS_Store`` or ``.gitkeep``)
    are skipped, because passing them on would fail when their ``data.bin``
    is opened.

    Returns the ``(data, labels)`` pair of dictionaries produced by
    ``read_multiple_datasets``.
    """
    root = "./compressed/"
    # os.listdir returns entries in arbitrary order and includes plain files;
    # keep only directories and sort them so the load order is deterministic.
    names = sorted(
        entry
        for entry in os.listdir(root)
        if os.path.isdir(os.path.join(root, entry))
    )
    return read_multiple_datasets(names)