"""Python wrappers around Brain.
This file is MACHINE GENERATED! Do not edit.
"""
import collections
from google.protobuf import text_format
from tensorflow.core.framework import op_def_pb2
# Needed to trigger the call to _set_call_cpp_shape_fn.
from tensorflow.python.framework import common_shapes
from tensorflow.python.framework import op_def_registry
from tensorflow.python.framework import ops
from tensorflow.python.framework import op_def_library
_neg_train_outputs = [""]
def neg_train(w_in, w_out, examples, labels, lr, vocab_count,
              num_negative_samples, name=None):
  r"""Training via negative sampling.

  Args:
    w_in: A `Tensor` of type mutable `float32`. The input word embedding.
    w_out: A `Tensor` of type mutable `float32`. The output word embedding.
    examples: A `Tensor` of type `int32`. A vector of word ids.
    labels: A `Tensor` of type `int32`. A vector of word ids.
    lr: A `Tensor` of type `float32`. The learning rate.
    vocab_count: A list of `ints`. Count of words in the vocabulary.
    num_negative_samples: An `int`. Number of negative samples per example.
    name: A name for the operation (optional).

  Returns:
    The created Operation.
  """
  result = _op_def_lib.apply_op("NegTrain", w_in=w_in, w_out=w_out,
                                examples=examples, labels=labels, lr=lr,
                                vocab_count=vocab_count,
                                num_negative_samples=num_negative_samples,
                                name=name)
  return result


ops.RegisterShape("NegTrain")(None)
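

# --- Illustrative sketch (not part of the machine-generated code). ---
# A minimal, hypothetical use of neg_train, assuming TensorFlow and the
# compiled word2vec kernels are available. All names, shapes, and
# hyperparameters below are made up for illustration; in practice the
# (examples, labels) batches come from the skipgram op defined below.
def _example_neg_train():
  import tensorflow as tf  # assumed importable alongside this module
  vocab_count = [100, 90, 80, 70]  # hypothetical per-word frequencies
  vocab_size, emb_dim = len(vocab_count), 8
  # w_in/w_out must be mutable (ref) tensors; Variables satisfy that.
  w_in = tf.Variable(tf.random_uniform([vocab_size, emb_dim], -0.5, 0.5))
  w_out = tf.Variable(tf.zeros([vocab_size, emb_dim]))
  examples = tf.constant([0, 1, 2], dtype=tf.int32)  # center word ids
  labels = tf.constant([1, 2, 3], dtype=tf.int32)    # context word ids
  lr = tf.constant(0.025)  # learning rate
  train = neg_train(w_in, w_out, examples, labels, lr,
                    vocab_count=vocab_count, num_negative_samples=2)
  with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    sess.run(train)  # one in-place update of w_in and w_out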


_skipgram_outputs = ["vocab_word", "vocab_freq", "words_per_epoch",
                     "current_epoch", "total_words_processed", "examples",
                     "labels"]

_SkipgramOutput = collections.namedtuple("Skipgram", _skipgram_outputs)
def skipgram(filename, batch_size, window_size=None, min_count=None,
             subsample=None, name=None):
  r"""Parses a text file and creates a batch of examples.

  Args:
    filename: A `string`. The corpus's text file name.
    batch_size: An `int`. The size of the produced batch.
    window_size: An optional `int`. Defaults to `5`.
      The number of words to predict to the left and right of the target.
    min_count: An optional `int`. Defaults to `5`.
      The minimum number of word occurrences for it to be included in the
      vocabulary.
    subsample: An optional `float`. Defaults to `0.001`.
      Threshold for word occurrence. Words that appear with higher
      frequency will be randomly down-sampled. Set to 0 to disable.
    name: A name for the operation (optional).

  Returns:
    A tuple of `Tensor` objects (vocab_word, vocab_freq, words_per_epoch,
    current_epoch, total_words_processed, examples, labels).

    vocab_word: A `Tensor` of type `string`. A vector of words in the corpus.
    vocab_freq: A `Tensor` of type `int32`. Frequencies of words. Sorted in
      non-ascending order.
    words_per_epoch: A `Tensor` of type `int64`. Number of words per epoch in
      the data file.
    current_epoch: A `Tensor` of type `int32`. The current epoch number.
    total_words_processed: A `Tensor` of type `int64`. The total number of
      words processed so far.
    examples: A `Tensor` of type `int32`. A vector of word ids.
    labels: A `Tensor` of type `int32`. A vector of word ids.
  """
  result = _op_def_lib.apply_op("Skipgram", filename=filename,
                                batch_size=batch_size,
                                window_size=window_size, min_count=min_count,
                                subsample=subsample, name=name)
  return _SkipgramOutput._make(result)


ops.RegisterShape("Skipgram")(None)
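

# --- Illustrative sketch (not part of the machine-generated code). ---
# A minimal, hypothetical use of skipgram; the corpus path is made up.
# The op is stateful: it builds the vocabulary from the whole file first,
# then emits successive (examples, labels) batches on each run.
def _example_skipgram():
  import tensorflow as tf  # assumed importable alongside this module
  (vocab_word, vocab_freq, words_per_epoch, current_epoch,
   total_words_processed, examples, labels) = skipgram(
       filename="corpus.txt",  # hypothetical text file
       batch_size=64, window_size=5, min_count=5, subsample=1e-3)
  with tf.Session() as sess:
    words, freqs = sess.run([vocab_word, vocab_freq])
    batch_examples, batch_labels = sess.run([examples, labels])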


def _InitOpDefLibrary():
  op_list = op_def_pb2.OpList()
  text_format.Merge(_InitOpDefLibrary.op_list_ascii, op_list)
  op_def_registry.register_op_list(op_list)
  op_def_lib = op_def_library.OpDefLibrary()
  op_def_lib.add_op_list(op_list)
  return op_def_lib


_InitOpDefLibrary.op_list_ascii = """op {
  name: "NegTrain"
  input_arg {
    name: "w_in"
    type: DT_FLOAT
    is_ref: true
  }
  input_arg {
    name: "w_out"
    type: DT_FLOAT
    is_ref: true
  }
  input_arg {
    name: "examples"
    type: DT_INT32
  }
  input_arg {
    name: "labels"
    type: DT_INT32
  }
  input_arg {
    name: "lr"
    type: DT_FLOAT
  }
  attr {
    name: "vocab_count"
    type: "list(int)"
  }
  attr {
    name: "num_negative_samples"
    type: "int"
  }
  is_stateful: true
}
op {
  name: "Skipgram"
  output_arg {
    name: "vocab_word"
    type: DT_STRING
  }
  output_arg {
    name: "vocab_freq"
    type: DT_INT32
  }
  output_arg {
    name: "words_per_epoch"
    type: DT_INT64
  }
  output_arg {
    name: "current_epoch"
    type: DT_INT32
  }
  output_arg {
    name: "total_words_processed"
    type: DT_INT64
  }
  output_arg {
    name: "examples"
    type: DT_INT32
  }
  output_arg {
    name: "labels"
    type: DT_INT32
  }
  attr {
    name: "filename"
    type: "string"
  }
  attr {
    name: "batch_size"
    type: "int"
  }
  attr {
    name: "window_size"
    type: "int"
    default_value {
      i: 5
    }
  }
  attr {
    name: "min_count"
    type: "int"
    default_value {
      i: 5
    }
  }
  attr {
    name: "subsample"
    type: "float"
    default_value {
      f: 0.001
    }
  }
  is_stateful: true
}
"""

_op_def_lib = _InitOpDefLibrary()
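

# --- Illustrative end-to-end sketch (not part of the generated code). ---
# Roughly how the word2vec tutorial wires the two ops together: fetch the
# vocabulary counts from skipgram, then feed its batches into neg_train.
# Corpus path, sizes, and hyperparameters are assumptions.
def _example_word2vec_training(num_steps=1000):
  import tensorflow as tf  # assumed importable alongside this module
  (_, vocab_freq, _, _, _, examples, labels) = skipgram(
      filename="corpus.txt", batch_size=64)  # hypothetical corpus
  sess = tf.Session()
  # vocab_count must be a Python list at graph-construction time,
  # so run the vocabulary outputs once before building NegTrain.
  vocab_count = sess.run(vocab_freq).tolist()
  vocab_size, emb_dim = len(vocab_count), 128
  w_in = tf.Variable(tf.random_uniform([vocab_size, emb_dim], -0.5, 0.5))
  w_out = tf.Variable(tf.zeros([vocab_size, emb_dim]))
  train = neg_train(w_in, w_out, examples, labels, lr=tf.constant(0.025),
                    vocab_count=vocab_count, num_negative_samples=25)
  sess.run(tf.initialize_all_variables())
  for _ in range(num_steps):
    sess.run(train)  # each run consumes one skipgram batch
  sess.close()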