diff --git a/lookout/style/typos/config.py b/lookout/style/typos/config.py
index 86705be7c..9d9316a07 100644
--- a/lookout/style/typos/config.py
+++ b/lookout/style/typos/config.py
@@ -28,6 +28,7 @@
         "path": str(DEFAULT_DATA_DIR / "fasttext.bin"),  # Where to store trained fasttext model
         "dim": 10,  # Number of dimensions of embeddings
         "bucket": 200000,  # Number of hash buckets in the model
+        "adjust_frequencies": True,  # Whether to divide identifier frequencies by the number of tokens
     },
     "datasets": {
         "portion": 400000,
diff --git a/lookout/style/typos/preparation.py b/lookout/style/typos/preparation.py
index 71791bce1..fab8ed542 100644
--- a/lookout/style/typos/preparation.py
+++ b/lookout/style/typos/preparation.py
@@ -249,6 +249,10 @@ def train_fasttext(data: pandas.DataFrame, config: Optional[Mapping[str, Any]] =
                    dim: Number of dimensions for embeddings in the new model.
                    bucket: Number of hash buckets to keep in the fasttext model: \
                            the less there are, the more compact the model gets.
+                   adjust_frequencies: Whether to divide frequencies by the number of tokens \
+                                       in the identifiers. This must be done when the result \
+                                       of the `prepare` function is used as data, to obtain \
+                                       the true distribution of identifiers.
     """
     try:
         import fastText
@@ -260,8 +264,12 @@ def train_fasttext(data: pandas.DataFrame, config: Optional[Mapping[str, Any]] =
     if config is None:
         config = {}
     config = merge_dicts(DEFAULT_CORRECTOR_CONFIG["fasttext"], config)
-    train_data = data[[len(str(x).split()) > 2 for x in data[Columns.Split]]].sample(
-        config["size"], weights=Columns.Frequency, replace=True)
+    tokens_number = data[Columns.Split].apply(lambda x: len(str(x).split()))
+    if config["adjust_frequencies"]:
+        weights = data[Columns.Frequency] / tokens_number
+    else:
+        weights = data[Columns.Frequency]
+    train_data = data[tokens_number > 1].sample(config["size"], weights=weights, replace=True)
     if config["corrupt"]:
         train_data = corrupt_tokens_in_df(train_data, config["typo_probability"],
                                           config["add_typo_probability"])
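
A minimal sketch (not part of the patch) of what the new weighting does, with hypothetical
column names and values standing in for Columns.Split and Columns.Frequency:

# Illustration only: hypothetical frequencies; the real code reads
# Columns.Split and Columns.Frequency from the prepared dataset.
import pandas

data = pandas.DataFrame({
    "split": ["token", "two tokens", "three more tokens"],
    "frequency": [10, 10, 10],
})
tokens_number = data["split"].apply(lambda x: len(str(x).split()))
# Sampling by raw frequency over-weights long identifiers, because each
# occurrence contributes several tokens to the fasttext training text.
# Dividing by the token count recovers the per-identifier distribution.
adjusted = data["frequency"] / tokens_number
print(adjusted.tolist())  # [10.0, 5.0, 3.3333333333333335]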