Skip to content

Commit

Permalink
adaptive hash table sizes
Browse files: browse the repository at this point in the history
Summary: adaptive hash table sizes

Reviewed By: EdouardGrave

Differential Revision: D6165454

fbshipit-source-id: 01009e50178d56cd3403d61903febe405f3fc2b1
  • Loading branch information
kahne authored and facebook-github-bot committed Dec 21, 2017
1 parent ae2cdc0 commit add7db5
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 6 deletions.
18 changes: 14 additions & 4 deletions src/dictionary.cc
Original file line number | Diff line number | Diff line change
Expand Up @@ -28,14 +28,20 @@ Dictionary::Dictionary(std::shared_ptr<Args> args) : args_(args),
word2int_(MAX_VOCAB_SIZE, -1), size_(0), nwords_(0), nlabels_(0),
ntokens_(0), pruneidx_size_(-1) {}

// Constructs a dictionary directly from a serialized stream.
//
// The counters start at their "empty" values and are then overwritten by
// load(in). word2int_ is deliberately absent from the initializer list: it is
// default-constructed here and load() assigns it a size derived from the
// number of entries it reads.
Dictionary::Dictionary(std::shared_ptr<Args> args, std::istream& in)
    : args_(args),
      size_(0),
      nwords_(0),
      nlabels_(0),
      ntokens_(0),
      pruneidx_size_(-1) {
  load(in);
}

int32_t Dictionary::find(const std::string& w) const {
return find(w, hash(w));
}

// Looks up `w` in the word2int_ table using linear probing.
//
// Starting from the slot `h % word2int_.size()`, advances one slot at a time
// (wrapping around at the end of the table) until it reaches either
//   - an empty slot (word2int_[id] == -1), or
//   - a slot whose entry in words_ has the same string as `w`.
//
// @param w  word to look up.
// @param h  hash of `w`, used to pick the first slot to probe.
// @return   the index into word2int_ of the slot found; the caller inspects
//           word2int_[id] to tell "present" from "free slot".
//
// Preconditions (not checked here):
//   - word2int_ must be non-empty, otherwise the modulo below divides by zero.
//   - if `w` is absent, the table must hold at least one -1 slot, otherwise
//     the probe loop never terminates.
int32_t Dictionary::find(const std::string& w, uint32_t h) const {
  // Wrap at the table's actual size, not MAX_VOCAB_SIZE: load() sizes
  // word2int_ from the number of stored entries, so a fixed modulus could
  // produce indices past the end of the table.
  const int32_t word2intsize = static_cast<int32_t>(word2int_.size());
  int32_t id = h % word2intsize;
  while (word2int_[id] != -1 && words_[word2int_[id]].word != w) {
    id = (id + 1) % word2intsize;
  }
  return id;
}
Expand Down Expand Up @@ -413,7 +419,6 @@ void Dictionary::save(std::ostream& out) const {

void Dictionary::load(std::istream& in) {
words_.clear();
std::fill(word2int_.begin(), word2int_.end(), -1);
in.read((char*) &size_, sizeof(int32_t));
in.read((char*) &nwords_, sizeof(int32_t));
in.read((char*) &nlabels_, sizeof(int32_t));
Expand All @@ -428,7 +433,6 @@ void Dictionary::load(std::istream& in) {
in.read((char*) &e.count, sizeof(int64_t));
in.read((char*) &e.type, sizeof(entry_type));
words_.push_back(e);
word2int_[find(e.word)] = i;
}
pruneidx_.clear();
for (int32_t i = 0; i < pruneidx_size_; i++) {
Expand All @@ -440,6 +444,12 @@ void Dictionary::load(std::istream& in) {
}
initTableDiscard();
initNgrams();

int32_t word2intsize = std::ceil(size_ / 0.7);
word2int_.assign(word2intsize, -1);
for (int32_t i = 0; i < size_; i++) {
word2int_[find(words_[i].word)] = i;
}
}

void Dictionary::prune(std::vector<int32_t>& idx) {
Expand Down
1 change: 1 addition & 0 deletions src/dictionary.h
Original file line number | Diff line number | Diff line change
Expand Up @@ -69,6 +69,7 @@ class Dictionary {
static const std::string EOW;

explicit Dictionary(std::shared_ptr<Args>);
explicit Dictionary(std::shared_ptr<Args>, std::istream&);
int32_t nwords() const;
int32_t nlabels() const;
int64_t ntokens() const;
Expand Down
3 changes: 1 addition & 2 deletions src/fasttext.cc
Original file line number | Diff line number | Diff line change
Expand Up @@ -194,7 +194,6 @@ void FastText::loadModel(const std::string& filename) {

void FastText::loadModel(std::istream& in) {
args_ = std::make_shared<Args>();
dict_ = std::make_shared<Dictionary>(args_);
input_ = std::make_shared<Matrix>();
output_ = std::make_shared<Matrix>();
qinput_ = std::make_shared<QMatrix>();
Expand All @@ -204,7 +203,7 @@ void FastText::loadModel(std::istream& in) {
// backward compatibility: old supervised models do not use char ngrams.
args_->maxn = 0;
}
dict_->load(in);
dict_ = std::make_shared<Dictionary>(args_, in);

bool quant_input;
in.read((char*) &quant_input, sizeof(bool));
Expand Down

0 comments on commit add7db5

Please sign in to comment.