Skip to content

Commit

Permalink
Merge pull request #784 from irinakhismatullina/fasttext
Browse files Browse the repository at this point in the history
Small change in fasttext training data generation
  • Loading branch information
zurk authored Jun 25, 2019
2 parents d98e34a + 696f06c commit fa9c805
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 2 deletions.
1 change: 1 addition & 0 deletions lookout/style/typos/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
"path": str(DEFAULT_DATA_DIR / "fasttext.bin"), # Where to store trained fasttext model
"dim": 10, # Number of dimensions of embeddings
"bucket": 200000, # Number of hash buckets in the model
"adjust_frequencies": True,  # Whether to divide identifier frequencies by the number of tokens.
},
"datasets": {
"portion": 400000,
Expand Down
12 changes: 10 additions & 2 deletions lookout/style/typos/preparation.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,10 @@ def train_fasttext(data: pandas.DataFrame, config: Optional[Mapping[str, Any]] =
dim: Number of dimensions for embeddings in the new model.
bucket: Number of hash buckets to keep in the fasttext model: \
the less there are, the more compact the model gets.
adjust_frequencies: Whether to divide frequencies by the number of tokens in \
the identifiers. Needs to be done when the result of the \
`prepare` function is used as data, so that sampling reflects the \
true identifier distribution.
"""
try:
import fastText
Expand All @@ -260,8 +264,12 @@ def train_fasttext(data: pandas.DataFrame, config: Optional[Mapping[str, Any]] =
if config is None:
config = {}
config = merge_dicts(DEFAULT_CORRECTOR_CONFIG["fasttext"], config)
train_data = data[[len(str(x).split()) > 2 for x in data[Columns.Split]]].sample(
config["size"], weights=Columns.Frequency, replace=True)
tokens_number = data[Columns.Split].apply(lambda x: len(str(x).split()))
if config["adjust_frequencies"]:
weights = data[Columns.Frequency] / tokens_number
else:
weights = data[Columns.Frequency]
train_data = data[tokens_number > 1].sample(config["size"], weights=weights, replace=True)
if config["corrupt"]:
train_data = corrupt_tokens_in_df(train_data, config["typo_probability"],
config["add_typo_probability"])
Expand Down

0 comments on commit fa9c805

Please sign in to comment.