From bcd5250fd441283c134d6c5972d8e16a7151fbc7 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch
Date: Fri, 8 Dec 2017 07:24:43 -0800
Subject: [PATCH] move tests to OSS / util / update package name

Summary: See title.

Differential Revision: D6477912

fbshipit-source-id: 6f9f0f4d6e1c8a4b20f117f2ad2d12211d09ac5a
---
 CONTRIBUTING.md                              |   3 +
 python/fastText/FastText.py                  |   2 -
 python/fastText/test/README.md               |   7 -
 python/fastText/test/test_script.py          | 607 ------
 python/fastText/tests/__init__.py            |  15 +
 python/fastText/tests/test_configurations.py | 104 ++++
 python/fastText/tests/test_script.py         | 125 ++++
 python/fastText/util/__init__.py             |  14 +
 python/fastText/util/util.py                 |  60 ++
 runtests.py                                  |  31 +
 setup.py                                     |  20 +-
 tests/fetch_test_data.sh                     | 129 ++++
 12 files changed, 494 insertions(+), 623 deletions(-)
 delete mode 100644 python/fastText/test/README.md
 delete mode 100644 python/fastText/test/test_script.py
 create mode 100644 python/fastText/tests/__init__.py
 create mode 100644 python/fastText/tests/test_configurations.py
 create mode 100644 python/fastText/tests/test_script.py
 create mode 100644 python/fastText/util/__init__.py
 create mode 100644 python/fastText/util/util.py
 create mode 100644 runtests.py
 create mode 100644 tests/fetch_test_data.sh

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 418c70d1c..354ff88b0 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -19,6 +19,9 @@ To create a pull request:
 5. Make sure your code lints.
 6. If you haven't already, complete the Contributor License Agreement ("CLA").
 
+## Tests
+First, you will need to make sure you have the required data. For that, please have a look at the fetch_test_data.sh script under tests. Next, run the tests using the runtests.py script, passing the path to the directory containing the datasets.
+
 ## Contributor License Agreement ("CLA")
 In order to accept your pull request, we need you to submit a CLA. You only need
 to do this once to work on any of Facebook's open source projects.
diff --git a/python/fastText/FastText.py b/python/fastText/FastText.py
index 0afadbe57..e82f94a26 100644
--- a/python/fastText/FastText.py
+++ b/python/fastText/FastText.py
@@ -280,7 +280,6 @@ def train_supervised(
     label="__label__",
     verbose=2,
     pretrainedVectors="",
-    saveOutput=0
 ):
     """
     Train a supervised model and return a model object.
@@ -322,7 +321,6 @@ def train_unsupervised(
     label="__label__",
     verbose=2,
     pretrainedVectors="",
-    saveOutput=0
 ):
     """
     Train an unsupervised model and return a model object.
diff --git a/python/fastText/test/README.md b/python/fastText/test/README.md
deleted file mode 100644
index 131d2d07b..000000000
--- a/python/fastText/test/README.md
+++ /dev/null
@@ -1,7 +0,0 @@
-To run this test script you need to provide a path to the fasttext binary build in debug mode and a folder with the datsets downloaded by classification-results.sh and word-vector-example.sh.
-
-Example run:
-
-```
-FASTTEXT_BIN=fasttext_bin FASTTEXT_DATA=data python test_script.py
-```
diff --git a/python/fastText/test/test_script.py b/python/fastText/test/test_script.py
deleted file mode 100644
index 276e5c3b3..000000000
--- a/python/fastText/test/test_script.py
+++ /dev/null
@@ -1,607 +0,0 @@
-# Copyright (c) 2017-present, Facebook, Inc.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree. An additional grant
-# of patent rights can be found in the PATENTS file in the same directory.
- -from __future__ import absolute_import -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -from fastText import train_supervised -from fastText import train_unsupervised -from fastText import load_model -from fastText import tokenize -import random -import sys -import os -import subprocess -import multiprocessing -import numpy as np -import unittest -import tempfile -import math -from scipy import stats - - -def compat_splitting(line): - return line.decode('utf8').split() - - -def similarity(v1, v2): - n1 = np.linalg.norm(v1) - n2 = np.linalg.norm(v2) - return np.dot(v1, v2) / n1 / n2 - - -def read_vectors(model_path): - vectors = {} - with open(model_path, 'rb') as fin: - for _, line in enumerate(fin): - try: - tab = compat_splitting(line) - vec = np.array(tab[1:], dtype=float) - word = tab[0] - if np.linalg.norm(vec) == 0: - continue - if word not in vectors: - vectors[word] = vec - except ValueError: - continue - except UnicodeDecodeError: - continue - return vectors - - -def compute_similarity(model_path, data_path, vectors=None): - if not vectors: - vectors = read_vectors(model_path) - - mysim = [] - gold = [] - drop = 0.0 - nwords = 0.0 - - with open(data_path, 'rb') as fin: - for line in fin: - tline = compat_splitting(line) - word1 = tline[0].lower() - word2 = tline[1].lower() - nwords = nwords + 1.0 - - if (word1 in vectors) and (word2 in vectors): - v1 = vectors[word1] - v2 = vectors[word2] - d = similarity(v1, v2) - mysim.append(d) - gold.append(float(tline[2])) - else: - drop = drop + 1.0 - - corr = stats.spearmanr(mysim, gold) - dataset = os.path.basename(data_path) - correlation = corr[0] * 100 - oov = math.ceil(drop / nwords * 100.0) - return dataset, correlation, oov - - -def get_random_unicode(length): - # See: https://stackoverflow.com/questions/1477294/generate-random-utf-8-string-in-python - - try: - get_char = unichr - except NameError: - get_char = chr - - # Update this to include code point ranges to be sampled - include_ranges = [ - (0x0021, 0x0021), - (0x0023, 0x0026), - (0x0028, 0x007E), - (0x00A1, 0x00AC), - (0x00AE, 0x00FF), - (0x0100, 0x017F), - (0x0180, 0x024F), - (0x2C60, 0x2C7F), - (0x16A0, 0x16F0), - (0x0370, 0x0377), - (0x037A, 0x037E), - (0x0384, 0x038A), - (0x038C, 0x038C), - ] - - alphabet = [ - get_char(code_point) - for current_range in include_ranges - for code_point in range(current_range[0], current_range[1] + 1) - ] - return ''.join(random.choice(alphabet) for i in range(length)) - - -def get_random_words(N, a, b): - words = [] - for _ in range(N): - length = random.randint(a, b) - words.append(get_random_unicode(length)) - return words - - -class TestFastTextPy(unittest.TestCase): - @classmethod - def eprint(cls, *args, **kwargs): - print(*args, file=sys.stderr, **kwargs) - - @classmethod - def num_thread(cls): - return multiprocessing.cpu_count() - 1 - - @classmethod - def build_paths(cls, train, test, output): - train = os.path.join(cls.data_dir, train) - test = os.path.join(cls.data_dir, test) - output = os.path.join(cls.result_dir, output) - return train, test, output - - @classmethod - def build_train_args(cls, params, mode, train, output): - args = [cls.bin, mode, "-input", train, "-output", output] - return args + params.split(' ') - - @classmethod - def get_train_output(cls, train_args): - cls.eprint("Executing: " + ' '.join(train_args)) - return subprocess.check_output(train_args).decode('utf-8') - - @classmethod - def 
get_path_size(cls, path): - path_size = subprocess.check_output(["stat", "-c", "%s", - path]).decode('utf-8') - path_size = int(path_size) - return path_size - - @classmethod - def default_test_args(cls, model, test, quantize=False): - return [cls.bin, "test", model, test] - - @classmethod - def get_test_output(cls, test_args): - cls.eprint("Executing: " + ' '.join(test_args)) - test_output = subprocess.check_output(test_args) - test_output = test_output.decode('utf-8') - cls.eprint("Test output:\n" + test_output) - return list( - map(lambda x: x.split('\t')[1], test_output.split('\n')[:-1]) - ) - - @classmethod - def train_generic_classifier(cls, train, output): - thread = cls.num_thread() - cls.eprint("Using {} threads".format(thread)) - sup_params = ( - "-dim 10 -lr 0.1 -wordNgrams 2 -minCount 1 -bucket 10000000 " - "-epoch 5 -thread {}".format(thread) - ) - mode = 'supervised' - cls.get_train_output( - cls.build_train_args(sup_params, mode, train, output) - ) - - @classmethod - def train_generic_embeddings(cls, train, output): - thread = cls.num_thread() - cls.eprint("Using {} threads".format(thread)) - unsup_params = ( - "-thread {} -lr 0.025 -dim 100 -ws 5 -epoch 1 -minCount 5 " - "-neg 5 -loss ns -bucket 2000000 -minn 3 -maxn 6 -t 1e-4 " - "-lrUpdateRate 100".format(thread) - ) - mode = 'cbow' - cls.get_train_output( - cls.build_train_args(unsup_params, mode, train, output) - ) - - def get_predictions_from_list(self, output, words, k): - args = [self.bin, "predict-prob", output + '.bin', '-', str(k)] - self.eprint("Executing: " + ' '.join(args)) - p = subprocess.Popen( - args, stdin=subprocess.PIPE, stdout=subprocess.PIPE - ) - test_text = "" - if words: - test_text = '\n'.join(words) + '\n' - test_text = test_text.encode('utf-8') - stdout, stderr = p.communicate(test_text) - stdout = stdout.decode('utf-8') - return stdout, stderr, p.returncode - - def get_word_vectors_from_list(self, output, words): - args = [self.bin, "print-word-vectors", output + '.bin'] - self.eprint("Executing: " + ' '.join(args)) - p = subprocess.Popen( - args, stdin=subprocess.PIPE, stdout=subprocess.PIPE - ) - test_text = '\n'.join(words).encode('utf-8') - stdout, stderr = p.communicate(test_text) - return stdout - - -class TestFastTextPyUnit(TestFastTextPy): - @classmethod - def setUpClass(cls): - cls.bin = os.environ['FASTTEXT_BIN'] - cls.data_dir = os.environ['FASTTEXT_DATA'] - cls.result_dir = tempfile.mkdtemp() - train, _, output = cls.build_paths("fil9", "rw/rw.txt", "fil9") - cls.train_generic_embeddings(train, output) - cls.output = output - train, _, output_sup = cls.build_paths( - "dbpedia.train", "dbpedia.test", "dbpedia" - ) - cls.train_generic_classifier(train, output_sup) - cls.output_sup = output_sup - - @classmethod - def tearDownClass(cls): - pass - # shutil.rmtree(cls.result_dir) - - # Check if get_word_vector aligns with vectors from stdin - def test_getvector(self): - f = load_model(self.output + '.bin') - words, _ = f.get_words(include_freq=True) - words += get_random_words(100, 1, 100) - ftbin_vectors = self.get_word_vectors_from_list(self.output, words) - ftbin_vectors = ftbin_vectors.decode('utf-8').split('\n')[:-1] - for v in ftbin_vectors: - word = v.split(' ')[0] - vector = v.split(' ')[1:-1] - vector = np.array(list(map(float, vector))) - pvec = f.get_word_vector(word) - # The fasttext cli returns floats with 5 digits, - # but we use the full 6 digits. 
- self.assertTrue(np.allclose(vector, pvec, rtol=1e-04)) - - def test_predict(self): - # TODO: I went a little crazy here as an exercise for - # a rigorous test case. This could be turned into - # a few utility functions. - f = load_model(self.output_sup + '.bin') - - def _test(N, min_length, max_length, k, add_vocab=0): - words = get_random_words(N, min_length, max_length) - if add_vocab > 0: - vocab, _ = f.get_words(include_freq=True) - for _ in range(add_vocab): - ind = random.randint(0, len(vocab)) - words += [vocab[ind]] - all_labels = [] - all_probs = [] - ii = 0 - gotError = False - for w in words: - try: - labels, probs = f.predict(w, k) - except ValueError: - gotError = True - continue - all_labels.append(labels) - all_probs.append(probs) - ii += 1 - preds, _, retcode = self.get_predictions_from_list( - self.output_sup, words, k - ) - if gotError and retcode == 0: - self.eprint( - "Didn't get error. Make sure your compiled " - "binary kept the assert statements" - ) - self.assertTrue(False) - else: - return - preds = preds.split('\n')[:-1] - self.assertEqual(len(preds), len(all_labels)) - for i in range(len(preds)): - labels = preds[i].split() - probs = np.array(list(map(float, labels[1::2]))) - labels = np.array(labels[::2]) - self.assertTrue(np.allclose(probs, all_probs[i], rtol=1e-04)) - self.assertTrue(np.array_equal(labels, all_labels[i])) - - _test(0, 0, 0, 0) - _test(1, 0, 0, 0) - _test(10, 0, 0, 0) - _test(1, 1, 1, 0) - _test(1, 1, 1, 1) - _test(1, 2, 3, 0) - _test(1, 2, 3, 1) - _test(10, 1, 1, 1) - _test(1, 1, 1, 0, add_vocab=10) - _test(1, 1, 1, 1, add_vocab=10) - _test(1, 2, 3, 0, add_vocab=10) - _test(1, 2, 3, 1, add_vocab=10) - reach = 10 - for _ in range(10): - N = random.randint(0, reach) - init = random.randint(0, reach) - offset = random.randint(0, reach) - k = random.randint(0, reach) - _test(N, init, init + offset, k) - - def test_vocab(self): - f = load_model(self.output + '.bin') - words, freq = f.get_words(include_freq=True) - self.eprint( - "There is no way to access words from the cli yet. " - "Therefore there can be no rigorous test." - ) - - def test_subwords(self): - f = load_model(self.output + '.bin') - words, _ = f.get_words(include_freq=True) - words += get_random_words(10, 1, 10) - for w in words: - f.get_subwords(w) - self.eprint( - "There is no way to access words from the cli yet. " - "Therefore there can be no test." 
- ) - - def test_tokenize(self): - train, _, _ = self.build_paths("fil9", "rw/rw.txt", "fil9") - with open(train, 'r') as f: - _ = tokenize(f.read()) - - def test_dimension(self): - f = load_model(self.output + '.bin') - f.get_dimension() - - def test_subword_vector(self): - f = load_model(self.output + '.bin') - words, _ = f.get_words(include_freq=True) - words += get_random_words(10000, 1, 200) - input_matrix = f.get_input_matrix() - for word in words: - - # Universal api to get word vector - vec1 = f.get_word_vector(word) - - # Build word vector from subwords - subwords, subinds = f.get_subwords(word) - subvectors = list(map(lambda x: f.get_input_vector(x), subinds)) - subvectors = np.stack(subvectors) - vec2 = np.sum((subvectors / len(subwords)), 0) - - # Build word vector from subinds - vec3 = np.sum(input_matrix[subinds] / len(subinds), 0) - - # Build word vectors from word and subword ids - wid = f.get_word_id(word) - if wid >= 0: - swids = list(map(lambda x: f.get_subword_id(x), subwords[1:])) - swids.append(wid) - else: - swids = list(map(lambda x: f.get_subword_id(x), subwords)) - swids = np.array(swids) - vec4 = np.sum(input_matrix[swids] / len(swids), 0) - - self.assertTrue(np.isclose(vec1, vec2, atol=1e-5, rtol=0).all()) - self.assertTrue(np.isclose(vec2, vec3, atol=1e-5, rtol=0).all()) - self.assertTrue(np.isclose(vec3, vec4, atol=1e-5, rtol=0).all()) - self.assertTrue(np.isclose(vec4, vec1, atol=1e-5, rtol=0).all()) - - # TODO: Compare with .vec file - def test_get_words(self): - f = load_model(self.output + '.bin') - words1, freq1 = f.get_words(include_freq=True) - words2 = f.get_words(include_freq=False) - self.assertEqual(len(words1), len(words2)) - self.assertEqual(len(words1), len(freq1)) - f = load_model(self.output_sup + '.bin') - words1, freq1 = f.get_words(include_freq=True) - words2 = f.get_words(include_freq=False) - self.assertEqual(len(words1), len(words2)) - self.assertEqual(len(words1), len(freq1)) - - # TODO: Compare with .vec file for unsup - def test_get_labels(self): - f = load_model(self.output + '.bin') - labels1, freq1 = f.get_labels(include_freq=True) - labels2 = f.get_labels(include_freq=False) - words2 = f.get_words(include_freq=False) - self.assertEqual(len(labels1), len(labels2)) - self.assertEqual(len(labels1), len(freq1)) - self.assertEqual(len(labels1), len(words2)) - for w1, w2 in zip(labels2, words2): - self.assertEqual(w1, w2) - f = load_model(self.output_sup + '.bin') - labels1, freq1 = f.get_labels(include_freq=True) - labels2 = f.get_labels(include_freq=False) - self.assertEqual(len(labels1), len(labels2)) - self.assertEqual(len(labels1), len(freq1)) - - def test_exercise_is_quant(self): - f = load_model(self.output + '.bin') - gotError = False - try: - f.quantize() - except ValueError: - gotError = True - self.assertTrue(gotError) - f = load_model(self.output_sup + '.bin') - self.assertTrue(not f.is_quantized()) - f.quantize() - self.assertTrue(f.is_quantized()) - - def test_newline_predict_sentence(self): - f = load_model(self.output_sup + '.bin') - sentence = get_random_words(1, 1000, 2000)[0] - f.predict(sentence, k=5) - sentence += "\n" - gotError = False - try: - f.predict(sentence, k=5) - except ValueError: - gotError = True - self.assertTrue(gotError) - - f = load_model(self.output + '.bin') - sentence = get_random_words(1, 1000, 2000)[0] - f.get_sentence_vector(sentence) - sentence += "\n" - gotError = False - try: - f.get_sentence_vector(sentence) - except ValueError: - gotError = True - self.assertTrue(gotError) - - -class 
TestFastTextPyIntegration(TestFastTextPy): - @classmethod - def setUpClass(cls): - cls.bin = os.environ['FASTTEXT_BIN'] - cls.data_dir = os.environ['FASTTEXT_DATA'] - cls.result_dir = tempfile.mkdtemp() - - def test_unsup1(self): - train, test, output = self.build_paths("fil9", "rw/rw.txt", "fil9") - - model = train_unsupervised( - input=train, - model="skipgram", - lr=0.025, - dim=100, - ws=5, - epoch=1, - minCount=5, - neg=5, - loss="ns", - bucket=2000000, - minn=3, - maxn=6, - t=1e-4, - lrUpdateRate=100, - thread=self.num_thread(), - ) - model.save_model(output) - - path_size = self.get_path_size(output) - vectors = {} - with open(test, 'r') as test_f: - for line in test_f: - query0 = line.split()[0].strip() - query1 = line.split()[1].strip() - vector0 = model.get_word_vector(query0) - vector1 = model.get_word_vector(query1) - vectors[query0] = vector0 - vectors[query1] = vector1 - dataset, correlation, oov = compute_similarity(None, test, vectors) - correlation = np.around(correlation) - - self.assertTrue( - correlation >= 41, "Correlation: Want: 41 Is: " + str(correlation) - ) - self.assertEqual(oov, 0.0, "Oov: Want: 0 Is: " + str(oov)) - self.assertEqual( - path_size, 978480868, "Size: Want: 978480868 Is: " + str(path_size) - ) - - -def gen_sup_test(lr, dataset, n, p1, r1, p1_q, r1_q, size, quant_size): - def sup_test(self): - def check( - output_local, test_local, n_local, p1_local, r1_local, size_local, - lessthan - ): - test_args = self.default_test_args(output_local, test_local) - test_output = self.get_test_output(test_args) - self.assertEqual( - str(test_output[0]), - str(n_local), - "N: Want: " + str(n_local) + " Is: " + str(test_output[0]) - ) - self.assertTrue( - float(test_output[1]) >= float(p1_local), - "p1: Want: " + str(p1_local) + " Is: " + str(test_output[1]) - ) - self.assertTrue( - float(test_output[2]) >= float(r1_local), - "r1: Want: " + str(r1_local) + " Is: " + str(test_output[2]) - ) - path_size = self.get_path_size(output_local) - if lessthan: - self.assertTrue( - path_size <= size_local, "Size: Want at most: " + - str(size_local) + " Is: " + str(path_size) - ) - else: - self.assertTrue( - path_size == size_local, - "Size: Want: " + str(size_local) + " Is: " + str(path_size) - ) - - train, test, output = self.build_paths( - dataset + ".train", dataset + ".test", dataset - ) - model = train_supervised( - input=train, - dim=10, - lr=lr, - wordNgrams=2, - minCount=1, - bucket=10000000, - epoch=5, - thread=self.num_thread() - ) - model.save_model(output) - check(output, test, n, p1, r1, size, False) - # Exercising - model.predict("hello world") - model.quantize(input=train, retrain=True, cutoff=100000, qnorm=True) - model.save_model(output + ".ftz") - # Exercising - model.predict("hello world") - check(output + ".ftz", test, n, p1_q, r1_q, quant_size, True) - - return sup_test - - -if __name__ == "__main__": - sup_job_lr = [0.25, 0.5, 0.5, 0.1, 0.1, 0.1, 0.05, 0.05] - sup_job_n = [7600, 60000, 70000, 38000, 50000, 60000, 650000, 400000] - sup_job_p1 = [0.921, 0.968, 0.984, 0.956, 0.638, 0.723, 0.603, 0.946] - sup_job_r1 = [0.921, 0.968, 0.984, 0.956, 0.638, 0.723, 0.603, 0.946] - sup_job_quant_p1 = [0.918, 0.965, 0.984, 0.953, 0.629, 0.707, 0.58, 0.940] - sup_job_quant_r1 = [0.918, 0.965, 0.984, 0.953, 0.629, 0.707, 0.58, 0.940] - sup_job_size = [ - 405607193, 421445471, 447481878, 427867393, 431292576, 517549567, - 483742593, 493604598 - ] - sup_job_quant_size = [ - 405607193, 421445471, 447481878, 427867393, 431292576, 517549567, - 483742593, 493604598 - ] - 
sup_job_quant_size = [ - 1600000, 1457000, 1690000, 1550000, 1567896, 1655000, 1600000, 1575010 - ] - # Yelp_review_full can be a bit flaky - sup_job_dataset = [ - "ag_news", "sogou_news", "dbpedia", "yelp_review_polarity", - "yelp_review_full", "yahoo_answers", "amazon_review_full", - "amazon_review_polarity" - ] - sup_job_args = [ - sup_job_lr, sup_job_dataset, sup_job_n, sup_job_p1, sup_job_r1, - sup_job_quant_p1, sup_job_quant_r1, sup_job_size, sup_job_quant_size - ] - for lr, dataset, n, p1, r1, p1_q, r1_q, size, quant_size in zip( - *sup_job_args - ): - setattr( - TestFastTextPyIntegration, "test_" + dataset, - gen_sup_test(lr, dataset, n, p1, r1, p1_q, r1_q, size, quant_size) - ) - unittest.main() diff --git a/python/fastText/tests/__init__.py b/python/fastText/tests/__init__.py new file mode 100644 index 000000000..f248756e5 --- /dev/null +++ b/python/fastText/tests/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from .test_configurations import get_supervised_models +from .test_script import gen_tests +from .test_script import gen_small_tests diff --git a/python/fastText/tests/test_configurations.py b/python/fastText/tests/test_configurations.py new file mode 100644 index 000000000..d72adf375 --- /dev/null +++ b/python/fastText/tests/test_configurations.py @@ -0,0 +1,104 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import multiprocessing +import os + +# This script represents a collection of integration tests +# Each integration test comes with a full set of parameters, +# a dataset, and expected metrics. +# These configurations can be used by various fastText apis +# to confirm some level of correctness. 
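+#
+# For illustration only (values taken from the dbpedia entry in the lists
+# below; <data_dir> is a placeholder), each configuration returned by
+# get_supervised_models() has roughly this shape:
+#
+#   {
+#       "dataset": "dbpedia",
+#       "train_args": {"input": "<data_dir>/dbpedia.train", "lr": 0.5, "dim": 10, ...},
+#       "quant_args": {"input": "<data_dir>/dbpedia.train", "retrain": True, "cutoff": 100000, ...},
+#       "test": {"data": "<data_dir>/dbpedia.test", "n": 70000, "p1": 0.984, "r1": 0.984, "size": 447481878},
+#       "quant_test": {"data": "<data_dir>/dbpedia.test", "n": 70000, "p1": 0.984, "r1": 0.984, "size": 1690000},
+#   }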
+ +# Supervised models +# See https://fasttext.cc/docs/en/supervised-models.html + + +def max_thread(): + return multiprocessing.cpu_count() - 1 + + +def get_supervised_models(data_dir=""): + sup_job_dataset = [ + "ag_news", "sogou_news", "dbpedia", "yelp_review_polarity", + "yelp_review_full", "yahoo_answers", "amazon_review_full", + "amazon_review_polarity" + ] + + sup_params = { + "dim": 10, + "wordNgrams": 2, + "minCount": 1, + "bucket": 10000000, + "epoch": 5, + "thread": max_thread(), + "verbose": 1, + } + quant_params = { + "retrain": True, + "cutoff": 100000, + "qnorm": True, + "verbose": 1, + } + sup_job_lr = [0.25, 0.5, 0.5, 0.1, 0.1, 0.1, 0.05, 0.05] + + sup_job_n = [7600, 60000, 70000, 38000, 50000, 60000, 650000, 400000] + + sup_job_p1 = [0.921, 0.968, 0.984, 0.956, 0.638, 0.723, 0.603, 0.946] + sup_job_r1 = [0.921, 0.968, 0.984, 0.956, 0.638, 0.723, 0.603, 0.946] + sup_job_size = [ + 405607193, 421445471, 447481878, 427867393, 431292576, 517549567, + 483742593, 493604598 + ] + + sup_job_quant_p1 = [0.918, 0.965, 0.984, 0.950, 0.625, 0.707, 0.58, 0.940] + sup_job_quant_r1 = [0.918, 0.965, 0.984, 0.950, 0.625, 0.707, 0.58, 0.940] + sup_job_quant_size = [ + 1600000, 1457000, 1690000, 1550000, 1567896, 1655000, 1600000, 1575000 + ] + + configurations = [] + for i in range(len(sup_job_dataset)): + configuration = {} + configuration["dataset"] = sup_job_dataset[i] + args = sup_params.copy() + quant_args = quant_params.copy() + args["lr"] = sup_job_lr[i] + args["input"] = sup_job_dataset[i] + ".train" + quant_args["lr"] = sup_job_lr[i] + quant_args["input"] = sup_job_dataset[i] + ".train" + if data_dir: + args["input"] = os.path.join(data_dir, args["input"]) + quant_args["input"] = os.path.join(data_dir, quant_args["input"]) + configuration["train_args"] = args + configuration["quant_args"] = quant_args + test = { + "n": sup_job_n[i], + "p1": sup_job_p1[i], + "r1": sup_job_r1[i], + "size": sup_job_size[i], + "data": sup_job_dataset[i] + ".test", + } + quant_test = { + "n": sup_job_n[i], + "p1": sup_job_quant_p1[i], + "r1": sup_job_quant_r1[i], + "size": sup_job_quant_size[i], + "data": sup_job_dataset[i] + ".test", + } + if data_dir: + test["data"] = os.path.join(data_dir, test["data"]) + quant_test["data"] = os.path.join(data_dir, quant_test["data"]) + configuration["test"] = test + configuration["quant_test"] = quant_test + configurations.append(configuration) + return configurations diff --git a/python/fastText/tests/test_script.py b/python/fastText/tests/test_script.py new file mode 100644 index 000000000..b6241efb6 --- /dev/null +++ b/python/fastText/tests/test_script.py @@ -0,0 +1,125 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. 
+ +from __future__ import absolute_import +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from fastText import train_supervised +from fastText import util +import os +import subprocess +import unittest +import tempfile +try: + import unicode +except ImportError: + pass +from fastText.tests.test_configurations import get_supervised_models + + +def read_labels(data_file): + labels = [] + lines = [] + with open(data_file, 'r') as f: + for line in f: + labels_line = [] + words_line = [] + try: + line = unicode(line, "UTF-8").split() + except NameError: + line = line.split() + for word in line: + if word.startswith("__label__"): + labels_line.append(word) + else: + words_line.append(word) + labels.append(labels_line) + lines.append(" ".join(words_line)) + return lines, labels + + +# Generate a supervised test case +# The returned function will be set as an attribute to a test class +def gen_sup_test(configuration): + def sup_test(self): + def get_path_size(path): + path_size = subprocess.check_output(["stat", "-c", "%s", + path]).decode('utf-8') + path_size = int(path_size) + return path_size + + def check(model, model_filename, test, lessthan, msg_prefix=""): + lines, labels = read_labels(test["data"]) + predictions = [] + for line in lines: + pred_label, _ = model.predict(line) + predictions.append(pred_label) + p1_local_out, r1_local_out = util.test(predictions, labels) + self.assertEqual( + len(predictions), test["n"], msg_prefix + "N: Want: " + + str(test["n"]) + " Is: " + str(len(predictions)) + ) + self.assertTrue( + p1_local_out >= test["p1"], msg_prefix + "p1: Want: " + + str(test["p1"]) + " Is: " + str(p1_local_out) + ) + self.assertTrue( + r1_local_out >= test["r1"], msg_prefix + "r1: Want: " + + str(test["r1"]) + " Is: " + str(r1_local_out) + ) + path_size = get_path_size(model_filename) + size_msg = str(test["size"]) + " Is: " + str(path_size) + if lessthan: + self.assertTrue( + path_size <= test["size"], + msg_prefix + "Size: Want at most: " + size_msg + ) + else: + self.assertTrue( + path_size == test["size"], + msg_prefix + "Size: Want: " + size_msg + ) + + output = os.path.join(tempfile.mkdtemp(), configuration["dataset"]) + model = train_supervised(**configuration["train_args"]) + model.save_model(output + ".bin") + check(model, output + ".bin", configuration["test"], False) + model.quantize(**configuration["quant_args"]) + model.save_model(output + ".ftz") + check( + model, output + ".ftz", configuration["quant_test"], True, "Quant: " + ) + + return sup_test + + +def gen_small_tests(data_dir): + class TestFastTextSmallPy(unittest.TestCase): + pass + + for configuration in get_supervised_models(data_dir=data_dir): + if configuration["dataset"] == "dbpedia": + setattr( + TestFastTextSmallPy, "test_small_" + configuration["dataset"], + gen_sup_test(configuration) + ) + return TestFastTextSmallPy + + +def gen_tests(data_dir): + class TestFastTextPy(unittest.TestCase): + pass + + for configuration in get_supervised_models(data_dir=data_dir): + setattr( + TestFastTextPy, "test_" + configuration["dataset"], + gen_sup_test(configuration) + ) + return TestFastTextPy diff --git a/python/fastText/util/__init__.py b/python/fastText/util/__init__.py new file mode 100644 index 000000000..5116e5f2e --- /dev/null +++ b/python/fastText/util/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. 
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree. An additional grant
+# of patent rights can be found in the PATENTS file in the same directory.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from .util import test
+from .util import find_nearest_neighbor
diff --git a/python/fastText/util/util.py b/python/fastText/util/util.py
new file mode 100644
index 000000000..801146a43
--- /dev/null
+++ b/python/fastText/util/util.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree. An additional grant
+# of patent rights can be found in the PATENTS file in the same directory.
+
+# NOTE: The purpose of this file is not to accumulate all useful utility
+# functions. This file should contain very commonly used and requested functions
+# (such as test). If you think you have a function at that level, please create
+# an issue and we will happily review your suggestion. This file is also not supposed
+# to pull in dependencies outside of numpy/scipy without very good reasons. For
+# example, this file should not use sklearn and matplotlib to produce a t-sne
+# plot of word embeddings or such.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import numpy as np
+
+
+def test(predictions, labels, k=1):
+    """
+    Return precision and recall modeled after fasttext's test
+    """
+    precision = 0.0
+    nexamples = 0
+    nlabels = 0
+    for prediction, labels in zip(predictions, labels):
+        for p in prediction:
+            if p in labels:
+                precision += 1
+        nexamples += 1
+        nlabels += len(labels)
+    return (precision / (k * nexamples), precision / nlabels)
+
+
+def find_nearest_neighbor(query, vectors, ban_set, cossims=None):
+    """
+    query is a 1d numpy array corresponding to the vector to which you want to
+    find the closest vector
+    vectors is a 2d numpy array corresponding to the vectors you want to consider
+    ban_set is a set of indices within vectors you want to ignore for nearest match
+    cossims is a 1d numpy array of size len(vectors), which can be passed for efficiency
+
+    returns the index of the closest match to query within vectors
+
+    """
+    if cossims is None:
+        cossims = np.matmul(vectors, query, out=cossims)
+    else:
+        np.matmul(vectors, query, out=cossims)
+    rank = len(cossims) - 1
+    result_i = np.argpartition(cossims, rank)[rank]
+    while result_i in ban_set:
+        rank -= 1
+        result_i = np.argpartition(cossims, rank)[rank]
+    return result_i
diff --git a/runtests.py b/runtests.py
new file mode 100644
index 000000000..4e19c344c
--- /dev/null
+++ b/runtests.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2016-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree. An additional grant
+# of patent rights can be found in the PATENTS file in the same directory.
+#
+
+# To run these tests you must first fetch all the required test data.
+# Have a look at tests/fetch_test_data.sh +# You will then need to point this script to the corresponding folder + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import unittest +import argparse +from fastText.tests import gen_tests + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("data_dir", help="Full path to data directory") + args = parser.parse_args() + tests = gen_tests(args.data_dir) + suite = unittest.TestLoader().loadTestsFromTestCase(tests) + unittest.TextTestRunner(verbosity=3).run(suite) diff --git a/setup.py b/setup.py index 58464cfca..21dfe5cc4 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ import setuptools import os -__version__ = '0.0.3' +__version__ = '0.0.6' FASTTEXT_SRC = "src" # Based on https://github.com/pybind/python_example @@ -81,16 +81,18 @@ def has_flag(compiler, flagname): def cpp_flag(compiler): - """Return the -std=c++[11/14] compiler flag. - The c++14 is preferred over c++11 (when it is available). + """Return the -std=c++[0x/11/14] compiler flag. + The c++14 is preferred over c++0x/11 (when it is available). """ if has_flag(compiler, '-std=c++14'): return '-std=c++14' elif has_flag(compiler, '-std=c++11'): return '-std=c++11' + elif has_flag(compiler, '-std=c++0x'): + return '-std=c++0x' else: raise RuntimeError( - 'Unsupported compiler -- at least C++11 support ' + 'Unsupported compiler -- at least C++0x support ' 'is needed!' ) @@ -124,7 +126,7 @@ def build_extensions(self): setup( - name='fastTextpy', + name='fasttext', version=__version__, author='Christian Puhrsch', author_email='cpuhrsch@fb.com', @@ -135,7 +137,11 @@ def build_extensions(self): license='BSD', install_requires=['pybind11>=2.2', "setuptools >= 0.7.0"], cmdclass={'build_ext': BuildExt}, - packages=[str('fastText')], + packages=[ + str('fastText'), + str('fastText.util'), + str('fastText.tests'), + ], package_dir={str(''): str('python')}, - zip_safe=False + zip_safe=False, ) diff --git a/tests/fetch_test_data.sh b/tests/fetch_test_data.sh new file mode 100644 index 000000000..b1e5cacc0 --- /dev/null +++ b/tests/fetch_test_data.sh @@ -0,0 +1,129 @@ +#!/usr/bin/env bash +# +# Copyright (c) 2016-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. +# + +DATADIR=data + +report_error() { + echo "Error on line $1 of $0" +} + +myshuf() { + perl -MList::Util=shuffle -e 'print shuffle(<>);' "$@"; +} + +normalize_text() { + tr '[:upper:]' '[:lower:]' | sed -e 's/^/__label__/g' | \ + sed -e "s/'/ ' /g" -e 's/"//g' -e 's/\./ \. /g' -e 's/
/ /g' \ + -e 's/,/ , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \ + -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' | tr -s " " | myshuf +} + +set -e +trap 'report_error $LINENO' ERR + +mkdir "${DATADIR}" + +data_result="${DATADIR}/dbpedia_csv.tar.gz" +if [ ! -f "$data_result" ] || \ + [ $(md5sum "$data_result" | cut -f 1 -d ' ') != "8139d58cf075c7f70d085358e73af9b3" ] +then + wget -c "https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz" -O "$data_result" + tar -xzvf "$data_result" -C "${DATADIR}" +fi + +data_result="${DATADIR}/dbpedia.train" +if [ ! -f "$data_result" ] +then + cat "${DATADIR}/dbpedia_csv/train.csv" | normalize_text > "$data_result" || rm -f "$data_result" +fi + +data_result="${DATADIR}/dbpedia.test" +if [ ! -f "$data_result" ] +then + cat "${DATADIR}/dbpedia_csv/test.csv" | normalize_text > "$data_result" || rm -f "$data_result" +fi + +data_result="${DATADIR}/rw_queries.txt" +if [ ! -f "$data_result" ] +then + cut -f 1,2 "${DATADIR}"/rw/rw.txt | awk '{print tolower($0)}' | tr '\t' '\n' > "$data_result" || rm -f "$data_result" +fi + +data_result="${DATADIR}/enwik9.zip" +if [ ! -f "$data_result" ] || \ + [ $(md5sum "$data_result" | cut -f 1 -d ' ') != "3e773f8a1577fda2e27f871ca17f31fd" ] +then + wget -c http://mattmahoney.net/dc/enwik9.zip -P "${DATADIR}" || rm -f "$data_result" + unzip "$data_result" -d "${DATADIR}" || rm -f "$data_result" +fi + +data_result="${DATADIR}/fil9" +if [ ! -f "$data_result" ] +then + perl wikifil.pl "${DATADIR}/enwik9" > "$data_result" || rm -f "$data_result" +fi + +data_result="${DATADIR}/rw/rw.txt" +if [ ! -f "$data_result" ] +then + wget -c https://nlp.stanford.edu/~lmthang/morphoNLM/rw.zip -P "${DATADIR}" + unzip "${DATADIR}/rw.zip" -d "${DATADIR}" || rm -f "$data_result" +fi + +DATASET=( + ag_news + sogou_news + dbpedia + yelp_review_polarity + yelp_review_full + yahoo_answers + amazon_review_full + amazon_review_polarity +) + +ID=( + 0Bz8a_Dbh9QhbUDNpeUdjb0wxRms # ag_news + 0Bz8a_Dbh9QhbUkVqNEszd0pHaFE # sogou_news + 0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k # dbpedia + 0Bz8a_Dbh9QhbNUpYQ2N3SGlFaDg # yelp_review_polarity + 0Bz8a_Dbh9QhbZlU4dXhHTFhZQU0 # yelp_review_full + 0Bz8a_Dbh9Qhbd2JNdDBsQUdocVU # yahoo_answers + 0Bz8a_Dbh9QhbZVhsUnRWRDhETzA # amazon_review_full + 0Bz8a_Dbh9QhbaW12WVVZS2drcnM # amazon_review_polarity +) + +# Small datasets first + +for i in {0..0} +do + echo "Downloading dataset ${DATASET[i]}" + if [ ! -f "${DATADIR}/${DATASET[i]}.train" ] + then + wget -c "https://drive.google.com/uc?export=download&id=${ID[i]}" -O "${DATADIR}/${DATASET[i]}_csv.tar.gz" + tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}" + cat "${DATADIR}/${DATASET[i]}_csv/train.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.train" + cat "${DATADIR}/${DATASET[i]}_csv/test.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.test" + fi +done + +# Large datasets require a bit more work due to the extra request page + +for i in {1..7} +do + echo "Downloading dataset ${DATASET[i]}" + if [ ! 
-f "${DATADIR}/${DATASET[i]}.train" ] + then + curl -c /tmp/cookies "https://drive.google.com/uc?export=download&id=${ID[i]}" > /tmp/intermezzo.html + curl -L -b /tmp/cookies "https://drive.google.com$(cat /tmp/intermezzo.html | grep -Po 'uc-download-link" [^>]* href="\K[^"]*' | sed 's/\&/\&/g')" > "${DATADIR}/${DATASET[i]}_csv.tar.gz" + tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}" + cat "${DATADIR}/${DATASET[i]}_csv/train.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.train" + cat "${DATADIR}/${DATASET[i]}_csv/test.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.test" + fi +done