more integration tests
Summary: See title.

Reviewed By: kahne

Differential Revision: D6606277

fbshipit-source-id: 37045b67beee16c6b616345e2af0108f329cf769
cpuhrsch authored and facebook-github-bot committed Dec 20, 2017
1 parent 8b10430 commit eb9703a
Showing 4 changed files with 183 additions and 20 deletions.
154 changes: 139 additions & 15 deletions python/fastText/tests/test_configurations.py
@@ -11,23 +11,145 @@
 from __future__ import unicode_literals
 
 import multiprocessing
-import os
 
 # This script represents a collection of integration tests
 # Each integration test comes with a full set of parameters,
 # a dataset, and expected metrics.
 # These configurations can be used by various fastText apis
 # to confirm some level of correctness.
 
-# Supervised models
-# See https://fasttext.cc/docs/en/supervised-models.html
 
 
 def max_thread():
     return multiprocessing.cpu_count() - 1
 
 
-def get_supervised_models(data_dir=""):
+def check_supervised_configuration(configuration, verbose=1):
+    configuration["args"]["verbose"] = verbose
+    configuration["quant_args"]["verbose"] = verbose
+    return configuration
+
+
+def check_supervised_configurations(configurations, verbose=1):
+    for i in range(len(configurations)):
+        configurations[i] = check_supervised_configuration(
+            configurations[i], verbose=verbose
+        )
+    return configurations
+
+
+def flickr_job(thread=max_thread()):
+    config = {}
+    config["dataset"] = "YFCC100M"
+    config["args"] = {
+        "dim": 256,
+        "wordNgrams": 2,
+        "minCount": 10,
+        "bucket": 10000000,
+        "epoch": 20,
+        "loss": "hs",
+        "minCountLabel": 100,
+        "thread": thread
+    }
+    config["args"]["input"] = "YFCC100M/train"
+    config["quant_args"] = {
+        "dsub": 2,
+        "lr": "0.1",
+        "epoch": 5,
+        "cutoff": 100000,
+        "qnorm": True,
+        "retrain": True,
+        "qout": True
+    }
+    config["quant_args"]["input"] = config["args"]["input"]
+    config["test"] = {
+        "n": 647224,
+        "p1": 0.471,
+        "r1": 0.0722,
+        "size": 12060039727,
+        "data": "YFCC100M/test",
+    }
+    # One quant example (to illustrate slack): 0.344, 0.0528, 64506972
+    config["quant_test"] = {
+        "n": 647224,
+        "p1": 0.300,
+        "r1": 0.0450,
+        "size": 70000000,
+        "data": "YFCC100M/test",
+    }
+    return config
+
+
+def langid_job1(thread=max_thread()):
+    config = {}
+    config["dataset"] = "langid"
+    config["args"] = {"dim": 16, "minn": 2, "maxn": 4, "thread": thread}
+    config["args"]["input"] = "langid.train"
+    config["quant_args"] = {"qnorm": True, "cutoff": 50000, "retrain": True}
+    config["quant_args"]["input"] = config["args"]["input"]
+    config["test"] = {
+        "n": 10000,
+        "p1": 0.985,
+        "r1": 0.985,
+        "size": 368369579,
+        "data": "langid.valid",
+    }
+    # One quant example (to illustrate slack): 0.984 0.984 932793
+    config["quant_test"] = {
+        "p1": 0.97,
+        "r1": 0.97,
+        "size": 1000000,
+    }
+    config["quant_test"]["n"] = config["test"]["n"]
+    config["quant_test"]["data"] = config["test"]["data"]
+    return config
+
+
+def langid_job2(thread=max_thread()):
+    config = langid_job1(thread).copy()
+    config["args"]["loss"] = "hs"
+    return config
+
+
+def cooking_job1(thread=max_thread()):
+    config = {}
+    config["dataset"] = "cooking"
+    config["args"] = {
+        "epoch": 25,
+        "lr": 1.0,
+        "wordNgrams": 2,
+        "minCount": 1,
+        "thread": thread,
+    }
+    config["args"]["input"] = "cooking.train"
+    config["quant_args"] = {"qnorm": True, "cutoff": 50000, "retrain": True}
+    config["quant_args"]["input"] = config["args"]["input"]
+    config["test"] = {
+        "n": 3000,
+        "p1": 0.59,
+        "r1": 0.25,
+        "size": 804047585,
+        "data": "cooking.valid",
+    }
+    # One quant example (to illustrate slack): 0.602 0.26 3439172
+    config["quant_test"] = {
+        "p1": 0.55,
+        "r1": 0.20,
+        "size": 4000000,
+    }
+    config["quant_test"]["n"] = config["test"]["n"]
+    config["quant_test"]["data"] = config["test"]["data"]
+    return config
+
+
+def cooking_job2(thread=max_thread()):
+    config = cooking_job1(thread).copy()
+    config["args"]["loss"] = "hs"
+    return config
+
+
+# Supervised models
+# See https://fasttext.cc/docs/en/supervised-models.html
+def get_supervised_models(thread=max_thread(), verbose=1):
     sup_job_dataset = [
         "ag_news", "sogou_news", "dbpedia", "yelp_review_polarity",
         "yelp_review_full", "yahoo_answers", "amazon_review_full",
@@ -40,7 +162,7 @@ def get_supervised_models(data_dir=""):
"minCount": 1,
"bucket": 10000000,
"epoch": 5,
"thread": max_thread(),
"thread": thread,
"verbose": 1,
}
quant_params = {
@@ -53,8 +175,8 @@

     sup_job_n = [7600, 60000, 70000, 38000, 50000, 60000, 650000, 400000]
 
-    sup_job_p1 = [0.921, 0.968, 0.984, 0.956, 0.638, 0.723, 0.603, 0.946]
-    sup_job_r1 = [0.921, 0.968, 0.984, 0.956, 0.638, 0.723, 0.603, 0.946]
+    sup_job_p1 = [0.915, 0.968, 0.984, 0.956, 0.638, 0.723, 0.603, 0.946]
+    sup_job_r1 = [0.915, 0.968, 0.984, 0.956, 0.638, 0.723, 0.603, 0.946]
     sup_job_size = [
         405607193, 421445471, 447481878, 427867393, 431292576, 517549567,
         483742593, 493604598
@@ -76,10 +198,7 @@ def get_supervised_models(data_dir=""):
args["input"] = sup_job_dataset[i] + ".train"
quant_args["lr"] = sup_job_lr[i]
quant_args["input"] = sup_job_dataset[i] + ".train"
if data_dir:
args["input"] = os.path.join(data_dir, args["input"])
quant_args["input"] = os.path.join(data_dir, quant_args["input"])
configuration["train_args"] = args
configuration["args"] = args
configuration["quant_args"] = quant_args
test = {
"n": sup_job_n[i],
@@ -95,10 +214,15 @@
"size": sup_job_quant_size[i],
"data": sup_job_dataset[i] + ".test",
}
if data_dir:
test["data"] = os.path.join(data_dir, test["data"])
quant_test["data"] = os.path.join(data_dir, quant_test["data"])
configuration["test"] = test
configuration["quant_test"] = quant_test
configurations.append(configuration)
configurations.append(flickr_job())
configurations.append(langid_job1())
configurations.append(langid_job2())
configurations.append(cooking_job1())
configurations.append(cooking_job2())
configurations = check_supervised_configurations(
configurations, verbose=verbose
)
return configurations
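
Note: each configuration bundles training args, quantization args, and expected metrics; the "to illustrate slack" comments above show that the expected p1/r1 values act as lower bounds and the expected sizes as upper bounds. A minimal sketch of how a harness might consume one of these configurations, assuming the train_supervised API and the (N, precision@1, recall@1) return value of model.test used in test_script.py below — the data directory here is hypothetical:

    import os

    from fastText import train_supervised
    from fastText.tests.test_configurations import get_supervised_models

    DATA_DIR = "/tmp/fastText-data"  # hypothetical; see tests/fetch_test_data.sh

    config = get_supervised_models(verbose=0)[0]  # e.g. the ag_news job
    config["args"]["input"] = os.path.join(DATA_DIR, config["args"]["input"])
    model = train_supervised(**config["args"])

    # Expected metrics are lower bounds (with slack); size would be an upper bound.
    n, p1, r1 = model.test(os.path.join(DATA_DIR, config["test"]["data"]))
    assert n == config["test"]["n"]
    assert p1 >= config["test"]["p1"]
    assert r1 >= config["test"]["r1"]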
22 changes: 17 additions & 5 deletions python/fastText/tests/test_script.py
@@ -364,7 +364,7 @@ def gen_test_newline_predict_sentence(self, kwargs):

 # Generate a supervised test case
 # The returned function will be set as an attribute to a test class
-def gen_sup_test(configuration):
+def gen_sup_test(configuration, data_dir):
     def sup_test(self):
         def get_path_size(path):
             path_size = subprocess.check_output(["stat", "-c", "%s",
@@ -404,10 +404,20 @@ def check(model, model_filename, test, lessthan, msg_prefix=""):
msg_prefix + "Size: Want: " + size_msg
)

configuration["args"]["input"] = os.path.join(
data_dir, configuration["args"]["input"]
)
configuration["quant_args"]["input"] = configuration["args"]["input"]
configuration["test"]["data"] = os.path.join(
data_dir, configuration["test"]["data"]
)
configuration["quant_test"]["data"] = configuration["test"]["data"]
output = os.path.join(tempfile.mkdtemp(), configuration["dataset"])
model = train_supervised(**configuration["train_args"])
print()
model = train_supervised(**configuration["args"])
model.save_model(output + ".bin")
check(model, output + ".bin", configuration["test"], False)
print()
model.quantize(**configuration["quant_args"])
model.save_model(output + ".ftz")
check(
@@ -503,9 +513,11 @@ def gen_tests(data_dir):
     class TestFastTextPy(unittest.TestCase):
         pass
 
-    for configuration in get_supervised_models(data_dir=data_dir):
+    i = 0
+    for configuration in get_supervised_models():
         setattr(
-            TestFastTextPy, "test_" + configuration["dataset"],
-            gen_sup_test(configuration)
+            TestFastTextPy, "test_sup_" + str(i) + "_" + configuration["dataset"],
+            gen_sup_test(configuration, data_dir)
         )
+        i += 1
     return TestFastTextPy
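
Note: the generated test names now carry a running index (test_sup_<i>_<dataset>); without it, jobs that share a dataset name — langid_job1/langid_job2 and cooking_job1/cooking_job2 above — would be assigned to the same class attribute and silently overwrite each other. A standalone sketch of this closure-plus-setattr pattern (names and values are illustrative):

    import unittest


    def gen_case(value):
        # The returned function closes over its own value.
        def case(self):
            self.assertGreater(value, 0)
        return case


    class TestGenerated(unittest.TestCase):
        pass


    # Indexing the attribute name keeps two jobs on the same dataset distinct.
    for i, (dataset, value) in enumerate([("langid", 1), ("langid", 2)]):
        setattr(TestGenerated, "test_sup_" + str(i) + "_" + dataset, gen_case(value))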
7 changes: 7 additions & 0 deletions runtests.py
@@ -40,6 +40,13 @@ def run_tests(tests):
help="run integration tests",
action="store_true"
)
parser.add_argument(
"-v",
"--verbose",
default=1,
help="verbosity level (default 1)",
type=int,
)
parser.add_argument("--data-dir", help="Full path to data directory")
args = parser.parse_args()
if args.unit_tests:
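
Note: assuming the argparse flags shown above, a quieter integration run would be invoked along the lines of:

    python runtests.py --integration-tests --data-dir /path/to/data -v 0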
20 changes: 20 additions & 0 deletions tests/fetch_test_data.sh
@@ -131,6 +131,26 @@
   tail -n 3000 "${DATADIR}"/cooking/cooking.preprocessed.txt > "${DATADIR}"/cooking.valid
 fi
 
+echo "Checking for YFCC100M"
+
+data_result="${DATADIR}"/YFCC100M/train
+if [ ! -f "$data_result" ]
+then
+  echo 'Download YFCC100M, unpack it and place train into the following path: '"$data_result"
+  echo 'You can download YFCC100M at :'"https://fasttext.cc/docs/en/dataset.html"
+  echo 'After you download this, run the script again'
+  exit 1
+fi
+
+data_result="${DATADIR}"/YFCC100M/test
+if [ ! -f "$data_result" ]
+then
+  echo 'Download YFCC100M, unpack it and place test into the following path: '"$data_result"
+  echo 'You can download YFCC100M at :'"https://fasttext.cc/docs/en/dataset.html"
+  echo 'After you download this, run the script again'
+  exit 1
+fi
+
 DATASET=(
   ag_news
   sogou_news
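
Note: the script expects the manually downloaded YFCC100M data to be laid out as follows, where DATADIR is the script's data directory:

    ${DATADIR}/YFCC100M/train
    ${DATADIR}/YFCC100M/test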
