From 79598b18b0617785dd9657f0166bfb51798548ba Mon Sep 17 00:00:00 2001 From: markus583 Date: Tue, 21 May 2024 18:57:37 +0000 Subject: [PATCH] fix short seq inclusion --- wtpsplit/evaluation/intrinsic_baselines.py | 14 +++++++------- .../evaluation/intrinsic_baselines_multilingual.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/wtpsplit/evaluation/intrinsic_baselines.py b/wtpsplit/evaluation/intrinsic_baselines.py index ce0132c2..a471427e 100644 --- a/wtpsplit/evaluation/intrinsic_baselines.py +++ b/wtpsplit/evaluation/intrinsic_baselines.py @@ -79,10 +79,10 @@ class Args: for f, name in [ (punkt_sentencize, "punkt"), - # (spacy_dp_sentencize, "spacy_dp"), - # (spacy_sent_sentencize, "spacy_sent"), - # (pysbd_sentencize, "pysbd"), - # (ersatz_sentencize, "ersatz"), + (spacy_dp_sentencize, "spacy_dp"), + (spacy_sent_sentencize, "spacy_sent"), + (pysbd_sentencize, "pysbd"), + (ersatz_sentencize, "ersatz"), ]: print(f"Running {name} on {dataset_name} in {lang_code}...") indices[lang][dataset_name][name] = {} @@ -109,7 +109,7 @@ class Args: concat_indices = {} for doc in metrics: for key, value in doc.items(): - if isinstance(value, (float, int)): + if not isinstance(value, list): # numeric if key not in avg_results: avg_results[key] = [] @@ -156,6 +156,6 @@ class Args: # print(e) results[lang][dataset_name][name] = None - json.dump(results, open(Constants.CACHE_DIR / "intrinsic_baselines_punkt.json", "w"), indent=4, default=int) - json.dump(indices, open(Constants.CACHE_DIR / "intrinsic_baselines_punkt_IDX.json", "w"), indent=4, default=int) + json.dump(results, open(Constants.CACHE_DIR / "intrinsic_baselines.json", "w"), indent=4, default=int) + json.dump(indices, open(Constants.CACHE_DIR / "intrinsic_baselines_IDX.json", "w"), indent=4, default=int) print(Constants.CACHE_DIR / "intrinsic_baselines.json") diff --git a/wtpsplit/evaluation/intrinsic_baselines_multilingual.py b/wtpsplit/evaluation/intrinsic_baselines_multilingual.py index 20a30a90..b8b1835d 100644 --- a/wtpsplit/evaluation/intrinsic_baselines_multilingual.py +++ b/wtpsplit/evaluation/intrinsic_baselines_multilingual.py @@ -106,7 +106,7 @@ class Args: concat_indices = {} for doc in metrics: for key, value in doc.items(): - if isinstance(value, (float, int)): + if not isinstance(value, list): # numeric if key not in avg_results: avg_results[key] = []