embedding_2_h5.py
# encoding: utf-8
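"""Convert pre-trained word embeddings to HDF5.

glove_export() reads a zipped GloVe text archive and gensim_export()
reads a word2vec-format binary model; both write the vocabulary and
the embedding matrix to a .h5 file via export_data_h5().
"""
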
import zipfile
import numpy as np
import h5py
import gensim

def export_data_h5(vocabulary, embedding_matrix, output='embedding.h5'):
    # Write the vocabulary and the embedding matrix into one HDF5 file.
    f = h5py.File(output, "w")
    compress_option = dict(compression="gzip", compression_opts=9, shuffle=True)
    # The vocabulary is stored as a single newline-joined string in a
    # variable-length string dataset of shape (1,).
    words_flatten = '\n'.join(vocabulary)
    f.attrs['vocab_len'] = len(vocabulary)
    dt = h5py.special_dtype(vlen=str)
    _dset_vocab = f.create_dataset('words_flatten', (1,), dtype=dt, **compress_option)
    _dset_vocab[...] = [words_flatten]
    # The embedding matrix keeps its original shape and dtype.
    _dset = f.create_dataset('embedding', embedding_matrix.shape,
                             dtype=embedding_matrix.dtype, **compress_option)
    _dset[...] = embedding_matrix
    f.flush()
    f.close()

def glove_export(embedding_file):
    # Convert every text file inside a GloVe zip archive
    # (one "word v1 v2 ..." entry per line) into a matching .h5 file.
    with zipfile.ZipFile(embedding_file) as zf:
        for name in zf.namelist():
            vocabulary = []
            embeddings = []
            with zf.open(name) as f:
                for line in f:
                    # zf.open() yields bytes, so decode before splitting.
                    vals = line.decode('utf-8').rstrip().split(' ')
                    vocabulary.append(vals[0])
                    embeddings.append([float(x) for x in vals[1:]])
            export_data_h5(vocabulary, np.array(embeddings, dtype=np.float32),
                           output=name + ".h5")

def gensim_export(embedding_file):
    # Convert a word2vec-format binary model into a .h5 file.
    name = embedding_file
    # Word2Vec.load_word2vec_format was removed in gensim 1.0;
    # KeyedVectors.load_word2vec_format is its replacement.
    model = gensim.models.KeyedVectors.load_word2vec_format(embedding_file, binary=True)
    vocabulary, embeddings = [], []
    for word in model.key_to_index:  # model.vocab in gensim < 4.0
        vocabulary.append(word)
        embeddings.append(model[word])
    export_data_h5(vocabulary, np.array(embeddings, dtype=np.float32),
                   output=name + ".h5")

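# Example sketch of reading the exported file back: load_embedding_h5 is a
# hypothetical helper (not part of the conversion above); it only assumes the
# dataset and attribute names written by export_data_h5.
def load_embedding_h5(path='embedding.h5'):
    with h5py.File(path, 'r') as f:
        flat = f['words_flatten'][0]
        if isinstance(flat, bytes):  # h5py may return vlen strings as bytes
            flat = flat.decode('utf-8')
        vocabulary = flat.split('\n')
        embedding_matrix = f['embedding'][...]
        assert len(vocabulary) == f.attrs['vocab_len']
    return vocabulary, embedding_matrix
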
if __name__ == '__main__':
    gensim_export('GoogleNews-vectors-negative300.bin.gz')