Skip to content

Commit

Permalink
Merge pull request #784 from irinakhismatullina/fasttext
Browse files Browse the repository at this point in the history
Small change in fasttext training data generation
  • Loading branch information
zurk authored Jun 25, 2019
2 parents d98e34a + 696f06c commit fa9c805
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 2 deletions.
1 change: 1 addition & 0 deletions lookout/style/typos/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
"path": str(DEFAULT_DATA_DIR / "fasttext.bin"), # Where to store trained fasttext model
"dim": 10, # Number of dimensions of embeddings
"bucket": 200000, # Number of hash buckets in the model
"adjust_frequencies": True,  # Whether to divide identifier frequencies by the number of tokens.
},
"datasets": {
"portion": 400000,
Expand Down
12 changes: 10 additions & 2 deletions lookout/style/typos/preparation.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,10 @@ def train_fasttext(data: pandas.DataFrame, config: Optional[Mapping[str, Any]] =
dim: Number of dimensions for embeddings in the new model.
bucket: Number of hash buckets to keep in the fasttext model: \
the less there are, the more compact the model gets.
adjust_frequencies: Whether to divide frequencies by the number of tokens in \
the identifiers. Needs to be done when the result of the \
`prepare` function is used as data, so that sampling reflects the \
true identifier distribution.
"""
try:
import fastText
Expand All @@ -260,8 +264,12 @@ def train_fasttext(data: pandas.DataFrame, config: Optional[Mapping[str, Any]] =
if config is None:
config = {}
config = merge_dicts(DEFAULT_CORRECTOR_CONFIG["fasttext"], config)
train_data = data[[len(str(x).split()) > 2 for x in data[Columns.Split]]].sample(
config["size"], weights=Columns.Frequency, replace=True)
tokens_number = data[Columns.Split].apply(lambda x: len(str(x).split()))
if config["adjust_frequencies"]:
weights = data[Columns.Frequency] / tokens_number
else:
weights = data[Columns.Frequency]
train_data = data[tokens_number > 1].sample(config["size"], weights=weights, replace=True)
if config["corrupt"]:
train_data = corrupt_tokens_in_df(train_data, config["typo_probability"],
config["add_typo_probability"])
Expand Down

0 comments on commit fa9c805

Please sign in to comment.