From 07c78dc615d35efd0582658e19a4df80e8da51bd Mon Sep 17 00:00:00 2001 From: markus583 Date: Sun, 16 Jun 2024 12:24:03 +0000 Subject: [PATCH] prepare for v2 --- LICENSE | 2 +- data/punctuation_xlmr_top10_with.txt | 81 -- data/punctuation_xlmr_top10_without.txt | 76 -- data/punctuation_xlmr_top15_with.txt | 125 --- data/punctuation_xlmr_top15_without.txt | 119 --- data/punctuation_xlmr_top20_with.txt | 172 --- data/punctuation_xlmr_top20_without.txt | 179 ---- data/punctuation_xlmr_top25_with.txt | 224 ---- data/punctuation_xlmr_top25_without.txt | 240 ----- data/punctuation_xlmr_top30_with.txt | 276 ----- data/punctuation_xlmr_top30_without.txt | 290 ------ data/punctuation_xlmr_top5_with.txt | 46 - data/punctuation_xlmr_top5_without.txt | 36 - scripts/find_common_punctuation_xlmr.py | 18 +- scripts/intrinsic_eval_all.sh | 25 - scripts/lyrics_eval_all.sh | 39 - utils/clean_tweets.py | 6 +- .../evaluation => utils}/download_spacy.py | 0 wtpsplit/__init__.py | 341 +++++- wtpsplit/configs.py | 2 +- wtpsplit/data/punctuation_xlmv.txt | 98 -- wtpsplit/data/punctuation_xlmv_unk.txt | 99 -- .../extract_shared_task_data.py | 2 + wtpsplit/evaluation/__init__.py | 29 +- ...task1.py => evaluate_sepp_nlg_subtask1.py} | 10 +- .../wtp-bert-mini_intrinsic_results.json | 977 ------------------ .../wtp-bert-tiny_intrinsic_results.json | 977 ------------------ ...e-s-12l-no-adapters_intrinsic_results.json | 977 ------------------ .../wtp-canine-s-12l_intrinsic_results.json | 977 ------------------ ...ne-s-1l-no-adapters_intrinsic_results.json | 977 ------------------ .../wtp-canine-s-1l_intrinsic_results.json | 977 ------------------ ...ne-s-3l-no-adapters_intrinsic_results.json | 977 ------------------ .../wtp-canine-s-3l_intrinsic_results.json | 977 ------------------ ...ne-s-6l-no-adapters_intrinsic_results.json | 977 ------------------ .../wtp-canine-s-6l_intrinsic_results.json | 977 ------------------ ...ne-s-9l-no-adapters_intrinsic_results.json | 977 ------------------ .../wtp-canine-s-9l_intrinsic_results.json | 977 ------------------ wtpsplit/evaluation/intrinsic.py | 97 +- wtpsplit/evaluation/intrinsic_baselines.py | 27 +- .../intrinsic_baselines_multilingual.py | 33 +- wtpsplit/evaluation/intrinsic_list.py | 493 --------- wtpsplit/evaluation/intrinsic_pairwise.py | 156 +-- wtpsplit/evaluation/intrinsic_ted.py | 34 +- wtpsplit/evaluation/kmer_optuna.py | 245 ----- .../{law_bert.py => legal_baselines.py} | 18 +- wtpsplit/evaluation/time_intrinsic.py | 78 -- wtpsplit/tokenization_utils.py | 104 -- wtpsplit/train/adapter_utils.py | 117 --- wtpsplit/train/adaptertrainer.py | 543 +--------- wtpsplit/train/train.py | 160 +-- wtpsplit/train/train_adapter.py | 100 +- wtpsplit/train/train_adapter_parallel.py | 774 -------------- wtpsplit/train/train_xlmr.py | 574 ---------- wtpsplit/train/trainer.py | 2 +- wtpsplit/utils.py | 101 +- 55 files changed, 532 insertions(+), 17383 deletions(-) delete mode 100644 data/punctuation_xlmr_top10_with.txt delete mode 100644 data/punctuation_xlmr_top10_without.txt delete mode 100644 data/punctuation_xlmr_top15_with.txt delete mode 100644 data/punctuation_xlmr_top15_without.txt delete mode 100644 data/punctuation_xlmr_top20_with.txt delete mode 100644 data/punctuation_xlmr_top20_without.txt delete mode 100644 data/punctuation_xlmr_top25_with.txt delete mode 100644 data/punctuation_xlmr_top25_without.txt delete mode 100644 data/punctuation_xlmr_top30_with.txt delete mode 100644 data/punctuation_xlmr_top30_without.txt delete mode 100644 data/punctuation_xlmr_top5_with.txt delete mode 100644 data/punctuation_xlmr_top5_without.txt delete mode 100755 scripts/intrinsic_eval_all.sh delete mode 100755 scripts/lyrics_eval_all.sh rename {wtpsplit/evaluation => utils}/download_spacy.py (100%) delete mode 100644 wtpsplit/data/punctuation_xlmv.txt delete mode 100644 wtpsplit/data/punctuation_xlmv_unk.txt rename wtpsplit/evaluation/{evaluate_sepp_nlg_2021_subtask1.py => evaluate_sepp_nlg_subtask1.py} (95%) delete mode 100644 wtpsplit/evaluation/evaluation_results/wtp-bert-mini_intrinsic_results.json delete mode 100644 wtpsplit/evaluation/evaluation_results/wtp-bert-tiny_intrinsic_results.json delete mode 100644 wtpsplit/evaluation/evaluation_results/wtp-canine-s-12l-no-adapters_intrinsic_results.json delete mode 100644 wtpsplit/evaluation/evaluation_results/wtp-canine-s-12l_intrinsic_results.json delete mode 100644 wtpsplit/evaluation/evaluation_results/wtp-canine-s-1l-no-adapters_intrinsic_results.json delete mode 100644 wtpsplit/evaluation/evaluation_results/wtp-canine-s-1l_intrinsic_results.json delete mode 100644 wtpsplit/evaluation/evaluation_results/wtp-canine-s-3l-no-adapters_intrinsic_results.json delete mode 100644 wtpsplit/evaluation/evaluation_results/wtp-canine-s-3l_intrinsic_results.json delete mode 100644 wtpsplit/evaluation/evaluation_results/wtp-canine-s-6l-no-adapters_intrinsic_results.json delete mode 100644 wtpsplit/evaluation/evaluation_results/wtp-canine-s-6l_intrinsic_results.json delete mode 100644 wtpsplit/evaluation/evaluation_results/wtp-canine-s-9l-no-adapters_intrinsic_results.json delete mode 100644 wtpsplit/evaluation/evaluation_results/wtp-canine-s-9l_intrinsic_results.json delete mode 100644 wtpsplit/evaluation/intrinsic_list.py delete mode 100644 wtpsplit/evaluation/kmer_optuna.py rename wtpsplit/evaluation/{law_bert.py => legal_baselines.py} (95%) delete mode 100644 wtpsplit/evaluation/time_intrinsic.py delete mode 100644 wtpsplit/tokenization_utils.py delete mode 100644 wtpsplit/train/adapter_utils.py delete mode 100644 wtpsplit/train/train_adapter_parallel.py delete mode 100644 wtpsplit/train/train_xlmr.py diff --git a/LICENSE b/LICENSE index a3b4e14f..56e70848 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2023 Benjamin Minixhofer +Copyright (c) 2024 Benjamin Minixhofer, Markus Frohmann, Igor Sterner Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/data/punctuation_xlmr_top10_with.txt b/data/punctuation_xlmr_top10_with.txt deleted file mode 100644 index 53422f3a..00000000 --- a/data/punctuation_xlmr_top10_with.txt +++ /dev/null @@ -1,81 +0,0 @@ -! -" -# -% -& -' -( -) -), -); -******* -+ -, -- -. -... -/ -: -:// -; -= -? -] -_ -` -{ -| -~ -° -» -՝ -։ -، -۔ -। -၊ -။ -၏ -፡ -፡፡ -። -፣ -፤ -។ -‘ -’ -‚ -“ -” -′ -€ -╬ -▁ -▁" -▁# -▁& -▁' -▁( -▁+ -▁- -▁/ -▁: -▁` -▁« -▁· -▁» -▁̧ -▁، -▁۔ -▁। -▁។ -▁៖ -▁– -▁— -▁‘ -▁“ -▁„ -▁€ -、 -。 -】【 diff --git a/data/punctuation_xlmr_top10_without.txt b/data/punctuation_xlmr_top10_without.txt deleted file mode 100644 index 62fac613..00000000 --- a/data/punctuation_xlmr_top10_without.txt +++ /dev/null @@ -1,76 +0,0 @@ -! -" -", -# -$ -% -& -' -( -) -), -). -); -)، -******* -+ -, -- -. -..! -... -.” -/ -: -:// -:“ -; -= -? -?? -[ -] -_ -` -{ -| -~ -° -· -» -՝ -։ -، -؛ -؟ -۔ -। -၊ -။ -၏ -፡ -፡፡ -። -፣ -፤ -។ -៖ -— -—— -‘ -’ -‚ -“ -” -”. -′ -′′ -‹ -⁊ -€ -║ -╬ -▁ -、 -。 -】【 diff --git a/data/punctuation_xlmr_top15_with.txt b/data/punctuation_xlmr_top15_with.txt deleted file mode 100644 index 368d9ad9..00000000 --- a/data/punctuation_xlmr_top15_with.txt +++ /dev/null @@ -1,125 +0,0 @@ -! -" -", -# -$ -% -& -&# -' -( -) -), -). -); -)، -******* -+ -, -- -. -..! -... -.” -/ -: -:// -:“ -; -= -> -? -?? -[ -] -_ -` -{ -| -~ -° -· -» -՝ -։ -״ -، -؛ -؟ -۔ -। -၊ -။ -၍ -၏ -፡ -፡፡ -። -፣ -፤ -។ -៖ -— -—— -‘ -’ -‚ -“ -” -”. -′ -′′ -‹ -⁊ -€ -║ -╬ -▁ -▁" -▁# -▁$ -▁% -▁& -▁' -▁( -▁* -▁+ -▁- -▁.. -▁... -▁/ -▁: -▁; -▁= -▁>> -▁? -▁[ -▁^ -▁` -▁} -▁« -▁· -▁» -▁̧ -▁، -▁۔ -▁। -▁។ -▁៖ -▁– -▁— -▁‘ -▁“ -▁” -▁„ -▁• -▁€ -、 -。 -《 -「 -」 -】 -】【 -・ -😂 diff --git a/data/punctuation_xlmr_top15_without.txt b/data/punctuation_xlmr_top15_without.txt deleted file mode 100644 index 3a1fbf19..00000000 --- a/data/punctuation_xlmr_top15_without.txt +++ /dev/null @@ -1,119 +0,0 @@ -! -!!! -" -", -". -"; -# -$ -% -& -&# -' -( -) -), -). -); -)، -******* -+ -, -- -. -..! -... -...! -................ -.” -/ -: -:// -:“ -; -= -> -? -?" -?? -?” -@ -[ -\ -] -]] -_ -________________ -` -{ -| -} -~ -¤ -° -± -¶ -· -» -») -», -». -˹ -՝ -։ -׳ -״ -، -؛ -؟ -۔ -। -।’ -၊ -။ -၍ -၏ -፡ -፡፡ -። -፣ -፤ -។ -៕ -៖ -– -— -—— -‘ -’ -‚ -“ -“, -” -”, -”. -′ -′′ -‹ -⁊ -€ -║ -╕ -╗ -╛ -╝ -╬ -▁ -░ -□ -✡ -、 -。 -《 -》 -「 -」 -】 -】【 -・ -😂 diff --git a/data/punctuation_xlmr_top20_with.txt b/data/punctuation_xlmr_top20_with.txt deleted file mode 100644 index 78ca81a1..00000000 --- a/data/punctuation_xlmr_top20_with.txt +++ /dev/null @@ -1,172 +0,0 @@ -! -!!! -" -"). -", -". -# -$ -% -& -&# -' -( -) -), -). -); -)، -******* -+ -, -- -. -..! -... -.” -/ -: -:// -:“ -; -= -> -? -?" -?? -@ -[ -\ -] -]] -_ -` -{ -| -} -~ -¤ -° -± -· -» -») -», -˶ -˹ -՝ -։ -״ -، -؛ -؟ -۔ -। -।’ -၊ -။ -၍ -၏ -፡ -፡፡ -። -፣ -፤ -។ -៖ -– -— -—— -‘ -’ -‚ -“ -“, -” -”, -”. -′ -′′ -‹ -› -⁊ -€ -║ -╗ -╛ -╝ -╣ -╬ -▁ -▁! -▁" -▁# -▁$ -▁% -▁& -▁' -▁( -▁(...) -▁(« -▁) -▁* -▁+ -▁- -▁.. -▁... -▁/ -▁: -▁:) -▁; -▁;) -▁= -▁> -▁>> -▁? -▁@ -▁[ -▁[[ -▁^ -▁` -▁} -▁« -▁° -▁· -▁» -▁× -▁́ -▁̧ -▁، -▁۔ -▁। -▁ေ -▁့ -▁၊ -▁။ -▁។ -▁៖ -▁– -▁— -▁‘ -▁’’ -▁“ -▁” -▁„ -▁• -▁€ -▁↑ -▁【 -□ -○ -✡ -、 -。 -《 -》 -「 -」 -【 -】 -】【 -・ -😂 diff --git a/data/punctuation_xlmr_top20_without.txt b/data/punctuation_xlmr_top20_without.txt deleted file mode 100644 index 291e6a14..00000000 --- a/data/punctuation_xlmr_top20_without.txt +++ /dev/null @@ -1,179 +0,0 @@ -! -!! -!!! -!" -!“ -" -"). -", -". -"; -# -$ -% -%) -& -&# -' -( -) -))) -), -). -): -); -)، -* -** -******* -+ -, -- -. -." -..! -... -...! -..." -..... -...... -........ -.......... -................ -...” -..? -.” -/ -: -:// -:“ -; -= -== -> -? -?! -?" -?? -?“ -?” -@ -[ -\ -] -]] -^ -_ -__ -________________ -` -`` -{ -{{ -| -} -}} -~ -¡ -¤ -© -« -® -° -± -¶ -· -» -») -», -»- -». -¿ -˵ -˶ -˹ -՝ -՞ -։ -׳ -״ -، -؛ -؟ -۔ -۔۔۔ -। -।’ -၊ -။ -၌ -၍ -၏ -፡ -፡፡ -። -፣ -፤ -፥ -។ -៕ -៖ -– -— -—— -‘ -‘‘ -’ -‚ -“ -“, -“. -” -”, -”. -‡ -• -′ -′′ -‹ -› -⁊ -€ -₺ -∼ -┐ -║ -╒ -╕ -╗ -╛ -╝ -╢ -╣ -╬ -▁ -░ -□ -○ -✖ -✡ -❥ -⤵ -⭐ -、 -。 -。” -《 -》 -「 -」 -【 -】 -】【 -〜 -・ -🌟 -🎁 -😂 -😄 -🙂 diff --git a/data/punctuation_xlmr_top25_with.txt b/data/punctuation_xlmr_top25_with.txt deleted file mode 100644 index 7daf84ad..00000000 --- a/data/punctuation_xlmr_top25_with.txt +++ /dev/null @@ -1,224 +0,0 @@ -! -!! -!!! -!" -" -"). -", -". -"; -# -$ -% -%) -& -&# -' -( -) -), -). -): -); -)، -* -******* -+ -, -- -. -." -..! -... -...! -................ -.” -/ -: -:// -:“ -; -= -> -? -?" -?? -?“ -?” -@ -[ -\ -] -]] -^ -_ -________________ -` -`` -{ -| -} -~ -¤ -° -± -· -» -») -», -»- -». -¿ -× -˵ -˶ -˹ -՝ -՞ -։ -׳ -״ -، -؛ -؟ -۔ -। -।’ -၊ -။ -၍ -၏ -፡ -፡፡ -። -፣ -፤ -፥ -។ -៕ -៖ -– -— -—— -‘ -‘‘ -’ -‚ -“ -“, -“. -” -”, -”. -”。 -‡ -• -′ -′′ -‹ -› -⁊ -€ -₺ -┐ -║ -╒ -╕ -╗ -╛ -╝ -╢ -╣ -╬ -▁ -▁! -▁" -▁# -▁$ -▁% -▁%. -▁& -▁' -▁( -▁(...) -▁(« -▁) -▁* -▁+ -▁- -▁.. -▁... -▁/ -▁: -▁:) -▁:-) -▁; -▁;) -▁< -▁= -▁> -▁>> -▁? -▁@ -▁[ -▁[...] -▁[[ -▁^ -▁` -▁{ -▁} -▁~ -▁“ -▁§ -▁« -▁° -▁· -▁» -▁¿ -▁× -▁́ -▁̧ -▁، -▁۔ -▁। -▁් -▁ေ -▁့ -▁၊ -▁။ -▁។ -▁៕ -▁៖ -▁– -▁— -▁‘ -▁‘‘ -▁’’ -▁“ -▁” -▁„ -▁• -▁‬ -▁› -▁€ -▁↑ -▁【 -▁😀 -░ -□ -○ -✡ -⭐ -、 -。 -《 -》 -「 -」 -【 -】 -】【 -〜 -・ -~ -😂 diff --git a/data/punctuation_xlmr_top25_without.txt b/data/punctuation_xlmr_top25_without.txt deleted file mode 100644 index cd3c2b53..00000000 --- a/data/punctuation_xlmr_top25_without.txt +++ /dev/null @@ -1,240 +0,0 @@ -! -!! -!!! -!" -!) -!» -!“ -!” -" -") -"), -"). -", -". -"; -"، -# -$ -% -%) -& -&# -' -'' -( -) -))) -), -). -): -); -)، -* -** -**** -******* -+ -, -- ----------- -. -." -..! -..!! -... -...! -..." -...) -..... -...... -....... -........ -.......... -................ -...» -...” -..? -.” -/ -: -:// -:“ -; -= -== -> ->< -? -?! -?" -?? -?“ -?” -@ -[ -[/ -\ -\\ -] -]] -^ -_ -__ -___ -________ -________________ -` -`` -{ -{{ -| -} -}} -~ -¡ -£ -¤ -§ -© -« -«. -® -° -± -¶ -· -» -») -», -»,- -»- -». -»; -»։ -¿ -× -˭ -˵ -˶ -˷ -˹ -˾ -՝ -՞ -։ -׳ -״ -، -؎ -؛ -؟ -؟؟؟؟ -٪ -٫ -٬ -۔ -۔۔۔ -। -।। -।’ -।” -၊ -။ -၌ -၍ -၎ -၏ -፡ -፡፡ -። -፣ -፤ -፥ -។ -៕ -៖ -‐ -– -— -—— -‘ -‘‘ -’ -‚ -“ -“, -“. -” -”) -”), -”, -”. -”; -”“ -”。 -„ -‡ -• -′ -′′ -‹ -› -›› -⁊ -€ -₱ -₺ -∼ -≥ -┐ -║ -╒ -╕ -╗ -╙ -╛ -╝ -╢ -╣ -╥ -╬ -▁ -░ -■ -□ -▼ -○ -⚜ -✖ -✡ -❥ -⤵ -⭐ -、 -。 -。” -《 -》 -「 -」 -」, -』 -【 -】 -】【 -〜 -・ -~ -🌟 -🎁 -🐛🐜 -🐝 -👍 -💕 -💪 -😂 -😄 -😍 -😘 -🙂 diff --git a/data/punctuation_xlmr_top30_with.txt b/data/punctuation_xlmr_top30_with.txt deleted file mode 100644 index 86389d4b..00000000 --- a/data/punctuation_xlmr_top30_with.txt +++ /dev/null @@ -1,276 +0,0 @@ -! -!! -!!! -!" -!“ -" -"). -", -". -"; -# -$ -% -%) -& -&# -' -( -) -))) -), -). -): -); -)، -* -** -******* -+ -, -- -. -." -..! -... -...! -..." -...) -...... -................ -..? -.” -/ -: -:// -:“ -; -= -== -> ->< -? -?! -?" -?? -?“ -?” -@ -[ -\ -\\ -] -]] -^ -_ -__ -___ -________ -________________ -` -`` -{ -{{ -| -} -}} -~ -¡ -¤ -© -« -® -° -± -¶ -· -» -») -», -»- -». -¿ -× -˵ -˶ -˹ -՝ -՞ -։ -׳ -״ -، -؛ -؟ -۔ -۔۔۔ -। -।’ -।” -၊ -။ -၍ -၏ -፡ -፡፡ -። -፣ -፤ -፥ -។ -៕ -៖ -– -— -—— -‘ -‘‘ -’ -‚ -“ -“, -“. -” -”, -”. -”。 -‡ -• -′ -′′ -‹ -› -⁊ -€ -₺ -┐ -║ -╒ -╕ -╗ -╙ -╛ -╝ -╢ -╣ -╬ -▁ -▁! -▁" -▁# -▁$ -▁% -▁%. -▁& -▁' -▁'' -▁( -▁(+ -▁(...) -▁(« -▁) -▁* -▁** -▁+ -▁- -▁-- -▁--> -▁.. -▁... -▁..... -▁......... -▁/ -▁: -▁:) -▁:- -▁:-) -▁:: -▁; -▁;) -▁< -▁= -▁> -▁>> -▁? -▁@ -▁[ -▁[...] -▁[[ -▁^ -▁` -▁{ -▁} -▁~ -▁“ -▁¡ -▁£ -▁§ -▁« -▁° -▁· -▁» -▁¿ -▁× -▁́ -▁̧ -▁، -▁۔ -▁। -▁් -▁ေ -▁့ -▁၊ -▁။ -▁፣ -▁។ -▁៕ -▁៖ -▁– -▁— -▁‘ -▁‘‘ -▁’’ -▁“ -▁” -▁„ -▁• -▁‬ -▁‹‹ -▁› -▁€ -▁€. -▁← -▁↑ -▁→ -▁「 -▁【 -▁😀 -▁😉 -▁🙂 -░ -■ -□ -○ -● -✖ -✡ -❥ -⤵ -⭐ -、 -。 -。” -《 -》 -「 -」 -』 -【 -】 -】【 -〜 -・ -~ -😂 -😄 -🙂 diff --git a/data/punctuation_xlmr_top30_without.txt b/data/punctuation_xlmr_top30_without.txt deleted file mode 100644 index 2a6d0402..00000000 --- a/data/punctuation_xlmr_top30_without.txt +++ /dev/null @@ -1,290 +0,0 @@ -! -!! -!!! -!!!! -!" -!) -!» -!“ -!” -" -") -"), -"). -", -". -"; -"? -"، -# -$ -% -%) -& -&# -' -'' -( -) -))) -), -). -): -); -)، -* -** -*** -**** -******* -+ -, -- ---- ----- ----------- -. -." -.'' -..! -..!! -... -...! -..." -...) -..... -...... -....... -........ -.......... -........... -................ -...» -...” -..? -.” -/ -// -: -:)) -:// -:“ -; -< -<<< -= -== -> ->< ->> ->>> -? -?! -?" -?? -??? -???? -?“ -?” -@ -[ -[/ -\ -\\ -] -]] -^ -_ -__ -___ -____ -________ -________________ -` -`` -{ -{{ -| -} -}} -~ -~~ -¡ -£ -¤ -§ -© -« -«. -® -° -± -¶ -· -» -») -», -»,- -»- -». -»: -»; -»։ -»، -¿ -× -˭ -˳ -˵ -˶ -˷ -˹ -˽ -˾ -϶ -՝ -՞ -։ -֊ -׳ -״ -، -؎ -؛ -؟ -؟؟؟؟ -٪ -٫ -٬ -۔ -۔۔۔ -। -।। -।’ -।” -၊ -။ -၌ -၍ -၎ -၏ -፡ -፡፡ -። -፣ -፤ -፥ -፦ -។ -៕ -៖ -៚ -‐ -– -— -—— -‘ -‘‘ -’ -‚ -“ -“, -“. -” -”) -”), -”, -”. -”; -”“ -”。 -„ -‡ -• -‰ -′ -′′ -‹ -› -›› -⁊ -₫ -€ -₱ -₺ -− -∼ -≥ -─ -┐ -┴ -║ -╒ -╕ -╗ -╙ -╚ -╛ -╝ -╡ -╢ -╣ -╥ -╬ -▀ -▁ -░ -▒ -▓ -■ -□ -▼ -◄ -○ -● -☝ -⚜ -✈ -✖ -✡ -❥ -⤵ -⭐ -、 -、「 -。 -。” -《 -》 -》, -「 -」 -」, -」。 -『 -』 -【 -】 -】【 -〒 -〜 -・ -~ -🇩 -🇮 -🌟 -🎁 -🐛🐜 -🐝 -🐞 -👁 -👆 -👍 -💕 -💪 -😂 -😄 -😍 -😘 -🙂 -🦂 diff --git a/data/punctuation_xlmr_top5_with.txt b/data/punctuation_xlmr_top5_with.txt deleted file mode 100644 index 1fce3876..00000000 --- a/data/punctuation_xlmr_top5_with.txt +++ /dev/null @@ -1,46 +0,0 @@ -! -" -# -% -' -( -) -, -- -. -... -/ -: -; -? -| -° -։ -، -۔ -। -၊ -။ -፡፡ -። -፣ -។ -‘ -’ -‚ -▁ -▁# -▁( -▁- -▁« -▁· -▁، -▁۔ -▁। -▁។ -▁– -▁‘ -▁€ -、 -。 -】【 diff --git a/data/punctuation_xlmr_top5_without.txt b/data/punctuation_xlmr_top5_without.txt deleted file mode 100644 index 3d5d469a..00000000 --- a/data/punctuation_xlmr_top5_without.txt +++ /dev/null @@ -1,36 +0,0 @@ -! -" -# -% -' -( -) -, -- -. -... -/ -: -; -? -_ -| -° -» -։ -، -۔ -। -၊ -။ -፡፡ -። -፣ -។ -‘ -’ -‚ -▁ -、 -。 -】【 diff --git a/scripts/find_common_punctuation_xlmr.py b/scripts/find_common_punctuation_xlmr.py index 83329e4c..c7f00f05 100644 --- a/scripts/find_common_punctuation_xlmr.py +++ b/scripts/find_common_punctuation_xlmr.py @@ -1,15 +1,14 @@ +import json +import os import re +import unicodedata from collections import Counter -from transformers import HfArgumentParser, XLMRobertaTokenizer from dataclasses import dataclass -from datasets import load_dataset -import unicodedata -import json -import pickle -import pandas as pd -import os from pathlib import Path +import pandas as pd +from datasets import load_dataset +from transformers import HfArgumentParser, XLMRobertaTokenizer ROOT_DIR = Path(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) LANGINFO = pd.read_csv(os.path.join(ROOT_DIR, "data", "language_info.csv"), index_col=0) @@ -26,15 +25,14 @@ class Args: tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base") -# Regex for checking tokens that start with an underscore followed by punctuation punctuation_pattern = re.compile(r"^▁+[^\w\s]+?$") def is_punctuation(token, include_whitespace=False): - # Check if token matches the regular expression + # check if token matches the regular expression if punctuation_pattern.match(token): return include_whitespace - # Fallback to check if all characters in the token are punctuation + # fallback return all("P" in unicodedata.category(ch) for ch in token) or all("S" in unicodedata.category(ch) for ch in token) diff --git a/scripts/intrinsic_eval_all.sh b/scripts/intrinsic_eval_all.sh deleted file mode 100755 index f804b2e2..00000000 --- a/scripts/intrinsic_eval_all.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -# This script takes one argument: --model_path -MODEL_PATH="$1" - -# array to hold the following additional commands: -# "--do_lowercase", "--do_remove_punct", "--do_lowercase --do_remove_punct" -assoc_array=( - # "--do_lowercase" - "--do_remove_punct" - "--do_lowercase --do_remove_punct" -) -# Loop through the associative array -for i in "${assoc_array[@]}" -do - # Construct the command - cmd="python3 wtpsplit/evaluation/intrinsic.py --model_path $MODEL_PATH $i" - - - # Execute the command - echo "Executing: $cmd" - eval $cmd -done - -echo "All evaluations completed." diff --git a/scripts/lyrics_eval_all.sh b/scripts/lyrics_eval_all.sh deleted file mode 100755 index d00392ee..00000000 --- a/scripts/lyrics_eval_all.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -# This script takes one argument: --model_path -MODEL_PATH="$1" - -# Define an associative array for eval_data_path and their corresponding save_suffixes -declare -A eval_data_paths=( - ["data/lyrics_lines.pt"]="lines" - ["data/lyrics_lines_lower.pt"]="lines_lower" - ["data/lyrics_lines_rmp_lower.pt"]="lines_lower" - ["data/lyrics_lines_rmp_lower.pt"]="lines_lower_rmp" - ["data/lyrics_lines_rmp.pt"]="lines_rmp" - ["data/lyrics_verses_strip_n.pt"]="verses" - ["data/lyrics_verses_lower_strip_n.pt"]="verses_lower" - ["data/lyrics_verses_rmp_strip_n.pt"]="verses_rmp" - ["data/lyrics_verses_rmp_lower_strip_n.pt"]="verses_lower_rmp" -) - -# Path to the custom_language_list file -CUSTOM_LANG_LIST="data/lyrics_langs.csv" - -# Loop through the associative array -for eval_data_path in "${!eval_data_paths[@]}"; do - save_suffix="${eval_data_paths[$eval_data_path]}" - - # Construct the command - cmd="python3 wtpsplit/evaluation/intrinsic.py --model_path $MODEL_PATH --eval_data_path $eval_data_path --custom_language_list $CUSTOM_LANG_LIST" - - # Append --save_suffix if it's not empty - if [[ -n $save_suffix ]]; then - cmd="$cmd --save_suffix $save_suffix" - fi - - # Execute the command - echo "Executing: $cmd" - eval $cmd -done - -echo "All evaluations completed." diff --git a/utils/clean_tweets.py b/utils/clean_tweets.py index 696a07bc..b30c9135 100644 --- a/utils/clean_tweets.py +++ b/utils/clean_tweets.py @@ -45,8 +45,6 @@ def pair_sentences(sequences): transformed_data = {} for lang_code, lang_data in data.items(): - if lang_code == "en-de": - continue transformed_data[lang_code] = {} for content_type, datasets in lang_data.items(): if content_type != "sentence": @@ -64,7 +62,7 @@ def pair_sentences(sequences): return transformed_data -data = torch.load("data/all_data_11_05-all.pth") +data = torch.load("data/all_tweets.pth") transformed_data = transform_data(data) -torch.save(transformed_data, "data/all_data_11_05-short_proc_SM.pth") +torch.save(transformed_data, "data/all_data_tweets_cleaned.pth") diff --git a/wtpsplit/evaluation/download_spacy.py b/utils/download_spacy.py similarity index 100% rename from wtpsplit/evaluation/download_spacy.py rename to utils/download_spacy.py diff --git a/wtpsplit/__init__.py b/wtpsplit/__init__.py index f3717c39..9a4307fe 100644 --- a/wtpsplit/__init__.py +++ b/wtpsplit/__init__.py @@ -1,7 +1,7 @@ +import contextlib import math import os from pathlib import Path -import contextlib # avoid the "None of PyTorch, TensorFlow, etc. have been found" warning. with contextlib.redirect_stderr(open(os.devnull, "w")): @@ -9,13 +9,16 @@ import numpy as np import skops.io as sio +from huggingface_hub import hf_hub_download from transformers import AutoConfig, AutoModelForTokenClassification from transformers.utils.hub import cached_file +import adapters +from wtpsplit.evaluation import token_to_char_probs from wtpsplit.extract import ORTWrapper, PyTorchWrapper, extract from wtpsplit.utils import Constants, indices_to_sentences, sigmoid -__version__ = "1.2.4" +__version__ = "1.0.0" class WtP: @@ -199,7 +202,7 @@ def _predict_proba( input_texts = [] space_positions = [] - for text in input_texts: + for text in outer_batch_texts: if remove_whitespace_before_inference: text_space_positions = [] input_text = "" @@ -217,7 +220,7 @@ def _predict_proba( input_texts.append(input_text) outer_batch_logits = extract( - outer_batch_texts, + input_texts, self.model, lang_code=lang_code, stride=stride, @@ -225,7 +228,7 @@ def _predict_proba( batch_size=batch_size, pad_last_batch=pad_last_batch, verbose=verbose, - ) + )[0] def newline_probability_fn(logits): return sigmoid(logits[:, Constants.NEWLINE_INDEX]) @@ -238,14 +241,14 @@ def newline_probability_fn(logits): sentence_probs = newline_probs = newline_probability_fn(logits) if remove_whitespace_before_inference: - newline_probs, sentence_probs = list(newline_probs), list(sentence_probs) + full_newline_probs, full_sentence_probs = list(newline_probs), list(sentence_probs) - for i in space_positions: - newline_probs.insert(i, np.zeros_like(newline_probs[0])) - sentence_probs.insert(i, np.zeros_like(sentence_probs[0])) + for j in space_positions[i]: + full_newline_probs.insert(j, np.zeros_like(newline_probs[0])) + full_sentence_probs.insert(j, np.zeros_like(sentence_probs[0])) - newline_probs = np.array(newline_probs) - sentence_probs = np.array(sentence_probs) + newline_probs = np.array(full_newline_probs) + sentence_probs = np.array(full_sentence_probs) if return_paragraph_probabilities: yield sentence_probs, newline_probs @@ -397,3 +400,319 @@ def _split( text, np.where(probs > sentence_threshold)[0], strip_whitespace=strip_whitespace ) yield sentences + + +class SaT: + def __init__( + self, + model_name_or_model, + from_pretrained_kwargs=None, + ort_providers=None, + ort_kwargs=None, + style_or_domain: str = None, + language: str = None, + lora_path: str = None, # local + hub_prefix="segment-any-text", + ): + self.model_name_or_model = model_name_or_model + self.ort_providers = ort_providers + self.ort_kwargs = ort_kwargs + + self.use_lora = False + + if isinstance(model_name_or_model, (str, Path)): + model_name = str(model_name_or_model) + is_local = os.path.isdir(model_name) + + if not is_local and hub_prefix is not None: + model_name_to_fetch = f"{hub_prefix}/{model_name}" + else: + model_name_to_fetch = model_name + + if is_local: + model_path = Path(model_name) + onnx_path = model_path / "model.onnx" + if not onnx_path.exists(): + onnx_path = None + else: + # no need to load if no ort_providers set + if ort_providers is not None: + onnx_path = cached_file(model_name_to_fetch, "model.onnx", **(from_pretrained_kwargs or {})) + else: + onnx_path = None + + if ort_providers is not None: + if onnx_path is None: + raise ValueError( + "Could not find an ONNX model in the model directory. Try `use_ort=False` to run with PyTorch." + ) + + try: + import onnxruntime as ort # noqa + except ModuleNotFoundError: + raise ValueError("Please install `onnxruntime` to use WtP with an ONNX model.") + + # to register models for AutoConfig + import wtpsplit.configs # noqa + + # TODO: ONNX integration + self.model = ORTWrapper( + AutoConfig.from_pretrained(model_name_to_fetch, **(from_pretrained_kwargs or {})), + ort.InferenceSession(str(onnx_path), providers=ort_providers, **(ort_kwargs or {})), + ) + else: + # to register models for AutoConfig + try: + import torch # noqa + except ModuleNotFoundError: + raise ValueError("Please install `torch` to use WtP with a PyTorch model.") + + import wtpsplit.models # noqa + + self.model = PyTorchWrapper( + AutoModelForTokenClassification.from_pretrained( + model_name_to_fetch, **(from_pretrained_kwargs or {}) + ) + ) + # LoRA LOADING + # TODO: LoRA + ONNX ? + if (style_or_domain and not language) or (language and not style_or_domain): + raise ValueError("Please specify both language and style_or_domain!") + if style_or_domain and language: + model_type = self.model.model.config.model_type + # adapters need xlm-roberta as model type. + self.model.model.config.model_type = "xlm-roberta" + adapters.init(self.model.model) + # reset model type (used later) + self.model.model.config.model_type = model_type + try: + if not lora_path: + for file in [ + "adapter_config.json", + "head_config.json", + "pytorch_adapter.bin", + "pytorch_model_head.bin", + ]: + hf_hub_download( + repo_id=model_name_to_fetch, + subfolder=f"loras/{style_or_domain}/{language}", + filename=file, + local_dir=Constants.CACHE_DIR, + ) + lora_load_path = str(Constants.CACHE_DIR / "loras" / style_or_domain / language) + else: + lora_load_path = lora_path + + self.model.model.load_adapter( + lora_load_path, + set_active=True, + with_head=True, + load_as="sat-lora", + ) + # merge lora weights into transformer for 0 efficiency overhead + self.model.model.merge_adapter("sat-lora") + self.use_lora = True + except: + if lora_path: + print(f"LoRA at {lora_path} not found, using base model...") + else: + print(f"LoRA {style_or_domain}/{language} not found, using base model...") + else: + if ort_providers is not None: + raise ValueError("You can only use onnxruntime with a model directory, not a model object.") + + self.model = model_name_or_model + + def __getattr__(self, name): + assert hasattr(self, "model") + return getattr(self.model, name) + + def predict_proba( + self, + text_or_texts, + stride=256, + block_size: int = 512, + batch_size=32, + pad_last_batch: bool = False, + return_paragraph_probabilities=False, + verbose: bool = False, + ): + if isinstance(text_or_texts, str): + return next( + self._predict_proba( + [text_or_texts], + stride=stride, + block_size=block_size, + batch_size=batch_size, + pad_last_batch=pad_last_batch, + return_paragraph_probabilities=return_paragraph_probabilities, + verbose=verbose, + ) + ) + else: + return self._predict_proba( + text_or_texts, + stride=stride, + block_size=block_size, + batch_size=batch_size, + pad_last_batch=pad_last_batch, + return_paragraph_probabilities=return_paragraph_probabilities, + verbose=verbose, + ) + + def _predict_proba( + self, + texts, + stride: int, + block_size: int, + batch_size: int, + pad_last_batch: bool, + return_paragraph_probabilities: bool, + verbose: bool, + ): + def newline_probability_fn(logits): + return sigmoid(logits[:, Constants.NEWLINE_INDEX]) + + for text in texts: + outer_batch_logits, offsets_mapping, tokenizer = extract( + [text], + self.model, + stride=stride, + max_block_size=block_size, + batch_size=batch_size, + pad_last_batch=pad_last_batch, + verbose=verbose, + ) + + logits = outer_batch_logits[0] + if offsets_mapping is not None: + offsets_mapping = offsets_mapping[0] + tokens = tokenizer.tokenize(text, verbose=False) + + # convert token probabilities to character probabilities for the entire array + logits = token_to_char_probs(text, tokens, logits, tokenizer, offsets_mapping) + sentence_probs = newline_probs = newline_probability_fn(logits) + + if return_paragraph_probabilities: + yield sentence_probs, newline_probs + else: + yield sentence_probs + + def split( + self, + text_or_texts, + threshold: float = None, + stride=64, + block_size: int = 512, + batch_size=32, + pad_last_batch: bool = False, + paragraph_threshold: float = 0.5, + strip_whitespace: bool = False, + do_paragraph_segmentation=False, + verbose: bool = False, + ): + if isinstance(text_or_texts, str): + return next( + self._split( + [text_or_texts], + threshold=threshold, + stride=stride, + block_size=block_size, + batch_size=batch_size, + pad_last_batch=pad_last_batch, + paragraph_threshold=paragraph_threshold, + strip_whitespace=strip_whitespace, + do_paragraph_segmentation=do_paragraph_segmentation, + verbose=verbose, + ) + ) + else: + return self._split( + text_or_texts, + threshold=threshold, + stride=stride, + block_size=block_size, + batch_size=batch_size, + pad_last_batch=pad_last_batch, + paragraph_threshold=paragraph_threshold, + strip_whitespace=strip_whitespace, + do_paragraph_segmentation=do_paragraph_segmentation, + verbose=verbose, + ) + + def _split( + self, + texts, + threshold: float, + stride: int, + block_size: int, + batch_size: int, + pad_last_batch: bool, + paragraph_threshold: float, + do_paragraph_segmentation: bool, + strip_whitespace: bool, + verbose: bool, + ): + def get_default_threshold(model_str: str): + if "sm" in model_str: + return 0.25 + if self.use_lora: + return 0.5 + if "no-limited-lookahead" in model_str and "sm" not in model_str: + return 0.01 + return 0.025 + + default_threshold = get_default_threshold(self.model_name_or_model) + sentence_threshold = threshold if threshold is not None else default_threshold + + for text, probs in zip( + texts, + self.predict_proba( + texts, + stride=stride, + block_size=block_size, + batch_size=batch_size, + pad_last_batch=pad_last_batch, + return_paragraph_probabilities=do_paragraph_segmentation, + verbose=verbose, + ), + ): + if do_paragraph_segmentation: + sentence_probs, newline_probs = probs + + offset = 0 + paragraphs = [] + + for paragraph in indices_to_sentences(text, np.where(newline_probs > paragraph_threshold)[0]): + sentences = [] + + for sentence in indices_to_sentences( + paragraph, + np.where( + sentence_probs[offset : offset + len(paragraph)] > sentence_threshold, + )[0], + strip_whitespace=strip_whitespace, + ): + sentences.append(sentence) + + paragraphs.append(sentences) + offset += len(paragraph) + + yield paragraphs + else: + sentences = indices_to_sentences( + text, np.where(probs > sentence_threshold)[0], strip_whitespace=strip_whitespace + ) + yield sentences + + +if __name__ == "__main__": + sat_lora = SaT("sat-3l", style_or_domain="ud", language="en") + out = sat_lora.split( + "Hello this is a test But this is different now Now the next one starts looool", + do_paragraph_segmentation=False, + strip_whitespace=True, + ) + print(out) + splits = list(sat_lora.split(["Paragraph-A Paragraph-B", "Paragraph-C100 Paragraph-D"])) + print(splits) diff --git a/wtpsplit/configs.py b/wtpsplit/configs.py index 0b63587a..0eaffd02 100644 --- a/wtpsplit/configs.py +++ b/wtpsplit/configs.py @@ -40,7 +40,7 @@ def __init__( class SubwordXLMConfig(XLMRobertaConfig): - """Config for XLM-R and XLM-V models. Used for token-level training. + """Config for XLM-R. Used for token-level training, i.e., SaT models. Args: XLMRobertaConfig: Base class. diff --git a/wtpsplit/data/punctuation_xlmv.txt b/wtpsplit/data/punctuation_xlmv.txt deleted file mode 100644 index 723c5b2f..00000000 --- a/wtpsplit/data/punctuation_xlmv.txt +++ /dev/null @@ -1,98 +0,0 @@ -! -" -# -$ -% -& -' -( -) -* -+ -, -- -. -/ -: -; -< -= -> -? -@ -[ -\ -] -^ -_ -` -{ -| -} -~ -¡ -£ -§ -© -« -¬ -® -° -± -· -» -¿ -÷ -՝ -՞ -։ -־ -׳ -، -؛ -؟ -۔ -। -॥ -၊ -။ -၌ -၍ -၎ -၏ -፡ -። -፣ -፤ -፥ -។ -៕ -៖ -– -— -‘ -’ -“ -” -„ -• -′ -‹ -› -€ -↑ -→ -□ -➖ -、 -。 -《 -》 -「 -」 -『 -』 -【 -】 -・ -~ diff --git a/wtpsplit/data/punctuation_xlmv_unk.txt b/wtpsplit/data/punctuation_xlmv_unk.txt deleted file mode 100644 index c4b6017d..00000000 --- a/wtpsplit/data/punctuation_xlmv_unk.txt +++ /dev/null @@ -1,99 +0,0 @@ -! -" -# -$ -% -& -' -( -) -* -+ -, -- -. -/ -: -; -< -= -> -? -@ -[ -\ -] -^ -_ -` -{ -| -} -~ -¡ -£ - -§ -© -« -¬ -® -° -± -· -» -¿ -÷ -՝ -՞ -։ -־ -׳ -، -؛ -؟ -۔ -। -॥ -၊ -။ -၌ -၍ -၎ -၏ -፡ -። -፣ -፤ -፥ -។ -៕ -៖ -– -— -‘ -’ -“ -” -„ -• -′ -‹ -› -€ -↑ -→ -□ -➖ -、 -。 -《 -》 -「 -」 -『 -』 -【 -】 -・ -~ diff --git a/wtpsplit/data_acquisition/extract_shared_task_data.py b/wtpsplit/data_acquisition/extract_shared_task_data.py index c10f728e..7998e835 100644 --- a/wtpsplit/data_acquisition/extract_shared_task_data.py +++ b/wtpsplit/data_acquisition/extract_shared_task_data.py @@ -60,6 +60,8 @@ def build_data_dictionary(root_dir): return data_dict +# this must be downloaded first from the linked drive here +# https://sites.google.com/view/sentence-segmentation root_dir = Constants.ROOT_DIR.parent / "data/sepp_nlg_2021_data" data = build_data_dictionary(root_dir) diff --git a/wtpsplit/evaluation/__init__.py b/wtpsplit/evaluation/__init__.py index 5a0c7a7f..8d3ed68c 100644 --- a/wtpsplit/evaluation/__init__.py +++ b/wtpsplit/evaluation/__init__.py @@ -49,17 +49,18 @@ def evaluate_sentences( assert len(text) == len("".join(predicted_sentences)) labels = get_labels(lang_code, sentences) - + predicted_end_indices = np.cumsum(np.array([len(s) for s in predicted_sentences])) predictions = np.zeros_like(labels) predictions[predicted_end_indices] = 1 - + assert len(labels) == len(predictions) - + + # we exclude labels for metrics calculation to enable a fair comparison with LLMs. if exclude_every_k > 0: true_end_indices = np.where(labels == 1)[0] # every k-th from those where labels are 1 - indices_to_remove = true_end_indices[exclude_every_k-1::exclude_every_k] + indices_to_remove = true_end_indices[exclude_every_k - 1 :: exclude_every_k] # mask for indices to keep mask = np.ones_like(labels, dtype=bool) @@ -69,7 +70,7 @@ def evaluate_sentences( # remove indices labels = labels[mask] predictions = predictions[mask] - + assert len(labels) == len(predictions) return f1_score(labels, predictions, zero_division=0), { @@ -84,16 +85,14 @@ def evaluate_sentences( "length": len(labels), } -def evaluate_sentences_llm( - labels, predictions, return_indices: bool = False, exclude_every_k: int = 0 -): - + +def evaluate_sentences_llm(labels, predictions, return_indices: bool = False, exclude_every_k: int = 0): assert len(labels) == len(predictions) - + if exclude_every_k > 0: true_end_indices = np.where(labels == 1)[0] # every k-th from those where labels are 1 - indices_to_remove = true_end_indices[exclude_every_k-1::exclude_every_k] + indices_to_remove = true_end_indices[exclude_every_k - 1 :: exclude_every_k] # mask for indices to keep mask = np.ones_like(labels, dtype=bool) @@ -103,7 +102,7 @@ def evaluate_sentences_llm( # remove indices labels = labels[mask] predictions = predictions[mask] - + assert len(labels) == len(predictions) return { @@ -119,6 +118,7 @@ def evaluate_sentences_llm( "length": len(labels), } + def train_mixture(lang_code, original_train_x, train_y, n_subsample=None, features=None, skip_punct: bool = False): original_train_x = torch.from_numpy(original_train_x).float() @@ -273,7 +273,9 @@ def our_sentencize( return reconstruct_sentences(text, indices_to_sentences(text, predicted_indices_transformed)) -# baselines +########### +# BASELINES +########### ERSATZ_LANGUAGES = { "ar", @@ -442,6 +444,7 @@ def get_token_spans(tokenizer, offsets_mapping, tokens): def token_to_char_probs(text, tokens, token_logits, tokenizer, offsets_mapping): + """Map from token probabalities to character probabilities""" char_probs = np.full((len(text), token_logits.shape[1]), np.min(token_logits)) # Initialize with very low numbers valid_indices, valid_offsets = get_token_spans(tokenizer, offsets_mapping, tokens) diff --git a/wtpsplit/evaluation/evaluate_sepp_nlg_2021_subtask1.py b/wtpsplit/evaluation/evaluate_sepp_nlg_subtask1.py similarity index 95% rename from wtpsplit/evaluation/evaluate_sepp_nlg_2021_subtask1.py rename to wtpsplit/evaluation/evaluate_sepp_nlg_subtask1.py index fb73bd94..baca8648 100644 --- a/wtpsplit/evaluation/evaluate_sepp_nlg_2021_subtask1.py +++ b/wtpsplit/evaluation/evaluate_sepp_nlg_subtask1.py @@ -1,7 +1,6 @@ import json import os import sys -from pprint import pprint from sklearn.metrics import classification_report @@ -9,6 +8,11 @@ def evaluate_subtask1(splits, langs, prediction_dir: str, supervisions, include_n_documents) -> None: + """ + Mirrors the original SEPP-NLG 2021 Shared Task evaluation function + https://sites.google.com/view/sentence-segmentation + """ + results = {} avg_holder = {} for supervision in supervisions: @@ -29,8 +33,7 @@ def evaluate_subtask1(splits, langs, prediction_dir: str, supervisions, include_ if str(fname).startswith(str(relevant_dir)) and str(fname).endswith(".tsv") ] - for i, gt_tsv_file in enumerate(gt_tsv_files, 0): - # print(i, gt_tsv_file) + for _, gt_tsv_file in enumerate(gt_tsv_files, 0): basename = os.path.basename(gt_tsv_file) with open(gt_tsv_file, encoding="utf-8") as f: @@ -56,7 +59,6 @@ def evaluate_subtask1(splits, langs, prediction_dir: str, supervisions, include_ all_predicted_labels.extend(pred_labels) eval_result = classification_report(all_gt_labels, all_predicted_labels, output_dict=True) - # pprint(eval_result, indent=4) print(eval_result["1"]["f1-score"]) avg_holder[supervision] += eval_result["1"]["f1-score"] results[lang_code][split][supervision] = eval_result diff --git a/wtpsplit/evaluation/evaluation_results/wtp-bert-mini_intrinsic_results.json b/wtpsplit/evaluation/evaluation_results/wtp-bert-mini_intrinsic_results.json deleted file mode 100644 index 812ca9ff..00000000 --- a/wtpsplit/evaluation/evaluation_results/wtp-bert-mini_intrinsic_results.json +++ /dev/null @@ -1,977 +0,0 @@ -{ - "af": { - "opus100": { - "u": 0.7537688442211055, - "t": 0.7562749237626084, - "punct": 0.849820604818042 - }, - "ud": { - "u": 0.9792626728110598, - "t": 0.9952830188679245, - "punct": 0.9988249118683901 - } - }, - "am": { - "opus100": { - "u": 0.5768080239310224, - "t": 0.5240899357601713, - "punct": 0.655965500718735 - } - }, - "ar": { - "ersatz": { - "u": 0.8523364485981307, - "t": 0.8742342342342343, - "punct": 0.8734491315136477 - }, - "opus100": { - "u": 0.6039680494717856, - "t": 0.6161731207289294, - "punct": 0.6883116883116884 - }, - "ud": { - "u": 0.7401296405421333, - "t": 0.8123249299719887, - "punct": 0.8460987831066571 - } - }, - "az": { - "opus100": { - "u": 0.6520431629223421, - "t": 0.7372149656851892, - "punct": 0.820888685295465 - } - }, - "be": { - "opus100": { - "u": 0.6778835510294442, - "t": 0.6862702702702703, - "punct": 0.8523228652997664 - }, - "ud": { - "u": 0.8551047703967899, - "t": 0.840530141085934, - "punct": 0.873141724479683 - } - }, - "bg": { - "opus100": { - "u": 0.9277566539923954, - "t": 0.9006410256410255, - "punct": 0.9602551521099116 - }, - "ud": { - "u": 0.9623992837958818, - "t": 0.9640287769784173, - "punct": 0.9838854073410923 - } - }, - "bn": { - "opus100": { - "u": 0.7760803142732432, - "t": 0.819579096713171, - "punct": 0.844738778513613 - }, - "ud": { - "u": 0.9909909909909909, - "t": null, - "punct": null - } - }, - "ca": { - "opus100": { - "u": 0.8935567618598065, - "t": 0.8934561776517836, - "punct": 0.9356265356265356 - }, - "ud": { - "u": 0.9710373880989994, - "t": 0.990280777537797, - "punct": 0.9989165763813651 - } - }, - "ceb": { - "ud": { - "u": 0.9946808510638298, - "t": null, - "punct": null - } - }, - "cs": { - "ersatz": { - "u": 0.9489472166137872, - "t": 0.9424948392804483, - "punct": 0.9713114754098361 - }, - "opus100": { - "u": 0.8925813245963025, - "t": 0.8923724847917641, - "punct": 0.9412643106022898 - }, - "ud": { - "u": 0.90336094449205, - "t": 0.9072699542450433, - "punct": 0.9264823717948718 - } - }, - "cy": { - "opus100": { - "u": 0.6786703601108033, - "t": 0.6995424504321301, - "punct": 0.7924217462932455 - }, - "ud": { - "u": 0.9766233766233766, - "t": 0.9616008653326122, - "punct": 0.9936708860759494 - } - }, - "da": { - "opus100": { - "u": 0.8930060213061602, - "t": 0.9046720151014629, - "punct": 0.9439252336448598 - }, - "ud": { - "u": 0.9466553767993225, - "t": 0.9518810148731408, - "punct": 0.9769094138543517 - } - }, - "de": { - "ersatz": { - "u": 0.9228187919463087, - "t": 0.9399692149820421, - "punct": 0.9825908858166922 - }, - "opus100": { - "u": 0.7708016510971106, - "t": 0.8231208136938725, - "punct": 0.8609137055837564 - }, - "ud": { - "u": 0.9284657128761717, - "t": 0.9550679205851621, - "punct": 0.952934955050238 - } - }, - "el": { - "opus100": { - "u": 0.9128397375820057, - "t": 0.9225302061122957, - "punct": 0.9614723926380369 - }, - "ud": { - "u": 0.9693654266958425, - "t": 0.9709821428571428, - "punct": 0.9699666295884315 - } - }, - "en": { - "ersatz": { - "u": 0.9292496171516079, - "t": 0.9607716209218021, - "punct": 0.9809761217528207 - }, - "opus100": { - "u": 0.9040217123118678, - "t": 0.8962061560486757, - "punct": 0.9386352479135984 - }, - "ud": { - "u": 0.9206773618538325, - "t": 0.9212175470008952, - "punct": 0.9564007421150278 - } - }, - "eo": { - "opus100": { - "u": 0.9002457002457002, - "t": 0.9000729394602479, - "punct": 0.9474199070677427 - } - }, - "es": { - "ersatz": { - "u": 0.9753900595142352, - "t": 0.9770513455506025, - "punct": 0.990011462256427 - }, - "opus100": { - "u": 0.8976268031642625, - "t": 0.9055436592909827, - "punct": 0.9454635597421914 - }, - "ud": { - "u": 0.9567401618755234, - "t": 0.9765963594336896, - "punct": 0.9959396751740139 - } - }, - "et": { - "ersatz": { - "u": 0.9440459110473458, - "t": 0.9526197041865129, - "punct": 0.9746394828443561 - }, - "opus100": { - "u": 0.8586556169429098, - "t": 0.8767647762622637, - "punct": 0.9386245994577274 - }, - "ud": { - "u": 0.8961077844311377, - "t": 0.9194283021831318, - "punct": 0.963464566929134 - } - }, - "eu": { - "opus100": { - "u": 0.8735196589294174, - "t": 0.8714218676677615, - "punct": 0.9107958643904783 - }, - "ud": { - "u": 0.9627245910431751, - "t": 0.9866962305986696, - "punct": 0.999722145040289 - } - }, - "fa": { - "opus100": { - "u": 0.5503489531405783, - "t": 0.5644308377400501, - "punct": 0.6569162121963312 - }, - "ud": { - "u": 0.9729187562688064, - "t": 0.9901192504258944, - "punct": 0.9993122420907841 - } - }, - "fi": { - "ersatz": { - "u": 0.9243373493975904, - "t": 0.9535714285714285, - "punct": 0.9761784085149519 - }, - "opus100": { - "u": 0.9157820240622788, - "t": 0.9259971167707832, - "punct": 0.9557739557739557 - }, - "ud": { - "u": 0.9151087955868833, - "t": 0.9368900455432662, - "punct": 0.9670687968699055 - } - }, - "fr": { - "ersatz": { - "u": 0.9558779982232751, - "t": 0.9609756097560975, - "punct": 0.9741275571600482 - }, - "opus100": { - "u": 0.887190178364605, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9489559164733179, - "t": 0.9724550898203593, - "punct": 0.983132530120482 - } - }, - "fy": { - "opus100": { - "u": 0.6877828054298643, - "t": 0.6963220675944335, - "punct": 0.8597545050927135 - } - }, - "ga": { - "opus100": { - "u": 0.7676228596842338, - "t": 0.779862700228833, - "punct": 0.8354114713216957 - }, - "ud": { - "u": 0.8834476003917727, - "t": 0.9141039236479322, - "punct": 0.9889135254988914 - } - }, - "gd": { - "opus100": { - "u": 0.7688723205964585, - "t": 0.8244592346089848, - "punct": 0.9140733859730609 - }, - "ud": { - "u": 0.6635838150289017, - "t": 0.668903803131991, - "punct": 0.756385068762279 - } - }, - "gl": { - "opus100": { - "u": 0.8829588014981273, - "t": 0.8854925934634376, - "punct": 0.9330727894479727 - }, - "ud": { - "u": 0.9444444444444444, - "t": 0.9684741488020178, - "punct": 0.9533169533169534 - } - }, - "gu": { - "ersatz": { - "u": 0.875626423690205, - "t": 0.8908641975308642, - "punct": 0.936 - }, - "opus100": { - "u": 0.6388256137686661, - "t": 0.6450856204550787, - "punct": 0.726624203821656 - } - }, - "ha": { - "opus100": { - "u": 0.8318356867779204, - "t": 0.8702326697049652, - "punct": 0.9116523400191022 - } - }, - "he": { - "opus100": { - "u": 0.9069040173201829, - "t": 0.9004450691028343, - "punct": 0.9405568096313017 - }, - "ud": { - "u": 0.8956310679611651, - "t": 0.9341935483870967, - "punct": 0.9520000000000001 - } - }, - "hi": { - "ersatz": { - "u": 0.91011025976453, - "t": 0.9445536767597327, - "punct": 0.9575116893677577 - }, - "opus100": { - "u": 0.61438202247191, - "t": 0.5867951033513948, - "punct": 0.7148148148148148 - }, - "ud": { - "u": 0.9008863819500403, - "t": 0.9496112870716958, - "punct": 0.9994061757719715 - } - }, - "hu": { - "opus100": { - "u": 0.9169419537517697, - "t": 0.9165094339622643, - "punct": 0.9608418991678903 - }, - "ud": { - "u": 0.9664138678223185, - "t": 0.9867256637168141, - "punct": 0.9833147942157953 - } - }, - "hy": { - "opus100": { - "u": 0.8283446121811557, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9235618597320726, - "t": 0.940983606557377, - "punct": 0.9692307692307692 - } - }, - "id": { - "opus100": { - "u": 0.8864292589027911, - "t": 0.888404744613895, - "punct": 0.9376408715251691 - }, - "ud": { - "u": 0.9631067961165048, - "t": null, - "punct": null - } - }, - "ig": { - "opus100": { - "u": 0.8090090090090091, - "t": 0.816267666570522, - "punct": 0.8879336813018116 - } - }, - "is": { - "opus100": { - "u": 0.9426906265177271, - "t": 0.9453924914675769, - "punct": 0.9631268436578172 - }, - "ud": { - "u": 0.8760646619155225, - "t": 0.9137546468401487, - "punct": 0.9649307214524606 - } - }, - "it": { - "opus100": { - "u": 0.8615591397849464, - "t": 0.8896697118763176, - "punct": 0.9281849483521888 - }, - "ud": { - "u": 0.9687814702920444, - "t": 0.9836734693877551, - "punct": 0.9917355371900826 - } - }, - "ja": { - "ersatz": { - "u": 0.7918181818181818, - "t": 0.7968750000000001, - "punct": 0.9329474191992283 - }, - "opus100": { - "u": 0.47619047619047616, - "t": 0.6895551257253385, - "punct": 0.8183884819398838 - }, - "ud": { - "u": 0.9597754911131899, - "t": 0.955534531693472, - "punct": 0.9832713754646839 - } - }, - "jv": { - "ud": { - "u": 0.9541984732824427, - "t": null, - "punct": null - } - }, - "ka": { - "opus100": { - "u": 0.9093553078041687, - "t": 0.9087854056649064, - "punct": 0.9353086419753086 - } - }, - "kk": { - "ersatz": { - "u": 0.9688715953307393, - "t": 0.9724862431215608, - "punct": 0.995019920318725 - }, - "opus100": { - "u": 0.7247776365946633, - "t": 0.7314629258517034, - "punct": 0.8941115702479339 - }, - "ud": { - "u": 0.9615745079662605, - "t": 0.8516129032258066, - "punct": 0.9458375125376128 - } - }, - "km": { - "ersatz": { - "u": 0.8669135264648226, - "t": 0.9030330882352942, - "punct": 0.9072923855532551 - }, - "opus100": { - "u": 0.6313364055299538, - "t": 0.641421497232741, - "punct": 0.7237932019423022 - } - }, - "kn": { - "opus100": { - "u": 0.5131128848346637, - "t": 0.5715705765407555, - "punct": 0.7471526195899773 - } - }, - "ko": { - "opus100": { - "u": 0.5429219306770328, - "t": 0.6279170267934313, - "punct": 0.794822627037392 - }, - "ud": { - "u": 0.9917391304347826, - "t": 0.9938650306748466, - "punct": 0.9989061474513237 - } - }, - "ku": { - "opus100": { - "u": 0.7462551066727191, - "t": 0.5642611683848798, - "punct": 0.8367518966001686 - } - }, - "ky": { - "opus100": { - "u": 0.7999999999999999, - "t": 0.7952228371686572, - "punct": 0.8749625187406296 - } - }, - "la": { - "ud": { - "u": 0.7529751172015867, - "t": 0.9010682768230377, - "punct": 0.9626123994320871 - } - }, - "lt": { - "ersatz": { - "u": 0.9155937052932761, - "t": 0.9309462915601024, - "punct": 0.9655521783181358 - }, - "opus100": { - "u": 0.7846121327414923, - "t": 0.8330601732615313, - "punct": 0.8866386762163496 - }, - "ud": { - "u": 0.9467040673211781, - "t": 0.9601181683899557, - "punct": 0.9719350073855244 - } - }, - "lv": { - "ersatz": { - "u": 0.9470198675496688, - "t": 0.9714285714285714, - "punct": 0.9883921956038529 - }, - "opus100": { - "u": 0.7910803204156743, - "t": 0.832391713747646, - "punct": 0.8796092796092796 - }, - "ud": { - "u": 0.958898393490507, - "t": 0.9472076788830714, - "punct": 0.9888123924268503 - } - }, - "mg": { - "opus100": { - "u": 0.8011720385098369, - "t": 0.8773517504167659, - "punct": 0.9355077835433655 - } - }, - "mk": { - "opus100": { - "u": 0.9086306098964327, - "t": 0.9123294008790191, - "punct": 0.9550861859674679 - } - }, - "ml": { - "opus100": { - "u": 0.7902595910866068, - "t": 0.8215255492470994, - "punct": 0.845909645909646 - } - }, - "mn": { - "opus100": { - "u": 0.9139120958953869, - "t": null, - "punct": null - } - }, - "mr": { - "opus100": { - "u": 0.8805277302711948, - "t": 0.8798053527980535, - "punct": 0.9110947832476121 - }, - "ud": { - "u": 0.9263157894736843, - "t": 0.9148936170212766, - "punct": 0.8791208791208791 - } - }, - "ms": { - "opus100": { - "u": 0.867933723196881, - "t": 0.8708251473477406, - "punct": 0.9330994738160863 - } - }, - "mt": { - "opus100": { - "u": 0.7198820556023589, - "t": 0.7812340642529321, - "punct": 0.8525073746312684 - }, - "ud": { - "u": 0.883936861652739, - "t": 0.9031639501438159, - "punct": 0.9264990328820116 - } - }, - "my": { - "opus100": { - "u": 0.5424963574550753, - "t": 0.5445230544035129, - "punct": 0.7461622807017543 - } - }, - "ne": { - "opus100": { - "u": 0.6493443419135503, - "t": 0.655791962174941, - "punct": 0.715046942400406 - } - }, - "nl": { - "opus100": { - "u": 0.9135687732342007, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9096722621902478, - "t": 0.934656741108354, - "punct": 0.9495798319327732 - } - }, - "no": { - "opus100": { - "u": 0.9460701330108827, - "t": 0.9460701330108827, - "punct": 0.965398773006135 - }, - "ud": { - "u": 0.969450101832994, - "t": 0.9739757794382891, - "punct": 0.9845041322314051 - } - }, - "pa": { - "opus100": { - "u": 0.6185026227944683, - "t": 0.6201442195859502, - "punct": 0.7120472643205754 - } - }, - "pl": { - "ersatz": { - "u": 0.9250720461095101, - "t": 0.9202264539372105, - "punct": 0.9614243323442137 - }, - "opus100": { - "u": 0.9159544159544158, - "t": 0.919627951347484, - "punct": 0.955435847208619 - }, - "ud": { - "u": 0.946167097329888, - "t": 0.967885816235504, - "punct": 0.9897657493745736 - } - }, - "ps": { - "ersatz": { - "u": 0.8647574904817084, - "t": 0.9411979547114682, - "punct": 0.9472300469483568 - }, - "opus100": { - "u": 0.5739967897271269, - "t": 0.6234947819106236, - "punct": 0.7254672897196262 - } - }, - "pt": { - "opus100": { - "u": 0.9022801302931596, - "t": 0.9126398476553201, - "punct": 0.9493268053855568 - }, - "ud": { - "u": 0.9470588235294117, - "t": 0.9478632478632478, - "punct": 0.9742489270386266 - } - }, - "ro": { - "ersatz": { - "u": 0.9577739809616792, - "t": 0.9633004302708176, - "punct": 0.9841827768014061 - }, - "opus100": { - "u": 0.8795126353790613, - "t": 0.87073007367716, - "punct": 0.9690670626082654 - }, - "ud": { - "u": 0.8205928237129485, - "t": 0.896709323583181, - "punct": 0.9942748091603053 - } - }, - "ru": { - "ersatz": { - "u": 0.9396039603960396, - "t": 0.9475308641975309, - "punct": 0.9693251533742331 - }, - "opus100": { - "u": 0.7655640024655845, - "t": null, - "punct": null - }, - "ud": { - "u": 0.8283464566929133, - "t": 0.8348214285714286, - "punct": 0.902675014228799 - } - }, - "si": { - "opus100": { - "u": 0.7871604938271606, - "t": 0.8097102584181677, - "punct": 0.8392494929006085 - } - }, - "sk": { - "opus100": { - "u": 0.9144876325088338, - "t": 0.9245694882367207, - "punct": 0.9560117302052786 - }, - "ud": { - "u": 0.9294173377546187, - "t": 0.9299242424242423, - "punct": 0.9530137636449929 - } - }, - "sl": { - "opus100": { - "u": 0.9174917491749175, - "t": 0.9258373205741627, - "punct": 0.9535392848455365 - }, - "ud": { - "u": 0.9474076837001119, - "t": 0.9680154142581888, - "punct": 0.9894490035169988 - } - }, - "sq": { - "opus100": { - "u": 0.8699795779441797, - "t": 0.8756013745704468, - "punct": 0.9479192623972091 - }, - "ud": { - "u": 0.9917355371900827, - "t": null, - "punct": null - } - }, - "sr": { - "opus100": { - "u": 0.9389312977099237, - "t": 0.9389312977099237, - "punct": 0.9609184171958964 - }, - "ud": { - "u": 0.936247723132969, - "t": 0.9727626459143969, - "punct": 0.9873908826382153 - } - }, - "sv": { - "opus100": { - "u": 0.9182478332162098, - "t": 0.9282136894824707, - "punct": 0.9536811311555338 - }, - "ud": { - "u": 0.9359925788497218, - "t": 0.9447911665866539, - "punct": 0.962000962000962 - } - }, - "ta": { - "ersatz": { - "u": 0.945765937202664, - "t": 0.9670000000000001, - "punct": 0.978131212723658 - }, - "opus100": { - "u": 0.5535986083931288, - "t": 0.5500282645562464, - "punct": 0.6814854974247764 - }, - "ud": { - "u": 0.967741935483871, - "t": 0.983050847457627, - "punct": 1.0 - } - }, - "te": { - "opus100": { - "u": 0.7217412812268118, - "t": 0.7248427672955975, - "punct": 0.8036219581211093 - } - }, - "tg": { - "opus100": { - "u": 0.7652325835335227, - "t": 0.7689438097458614, - "punct": 0.8987951807228916 - } - }, - "th": { - "opus100": { - "u": 0.6788050203287963, - "t": 0.6838095238095238, - "punct": 0.7024331870761866 - }, - "ud": { - "u": 0.6119402985074627, - "t": null, - "punct": null - } - }, - "tr": { - "ersatz": { - "u": 0.9060256009990634, - "t": 0.9247984202731612, - "punct": 0.9769307923771314 - }, - "opus100": { - "u": 0.9230400383601054, - "t": 0.9223766171538093, - "punct": 0.9565432098765432 - }, - "ud": { - "u": 0.9300333174678724, - "t": 0.9560493827160493, - "punct": 0.9908629441624364 - } - }, - "uk": { - "opus100": { - "u": 0.8870626907724818, - "t": 0.8817254174397031, - "punct": 0.9385556915544675 - }, - "ud": { - "u": 0.9007551240560949, - "t": 0.9178690344062154, - "punct": 0.9797752808988766 - } - }, - "ur": { - "opus100": { - "u": 0.46293818571046297, - "t": 0.430117501546073, - "punct": 0.5775709219858155 - }, - "ud": { - "u": 0.8191653786707883, - "t": 0.972972972972973, - "punct": 0.9897292250233426 - } - }, - "uz": { - "opus100": { - "u": 0.7466666666666667, - "t": 0.7607307946290996, - "punct": 0.8331338281062963 - } - }, - "vi": { - "opus100": { - "u": 0.9137314956610514, - "t": 0.9138327793403223, - "punct": 0.946041351487645 - }, - "ud": { - "u": 0.8900112233445566, - "t": 0.9340727048675294, - "punct": 0.9794903666873834 - } - }, - "xh": { - "opus100": { - "u": 0.810678422728386, - "t": 0.8121301775147929, - "punct": 0.8896236012207528 - } - }, - "yi": { - "opus100": { - "u": 0.6704185719310587, - "t": 0.6684350132625995, - "punct": 0.724588781105019 - } - }, - "yo": { - "opus100": { - "u": 0.6918930458451192, - "t": null, - "punct": null - }, - "ud": { - "u": 0.8489795918367348, - "t": null, - "punct": null - } - }, - "zh": { - "ersatz": { - "u": 0.578965399109284, - "t": 0.8775849774185881, - "punct": 0.9603015075376885 - }, - "opus100": { - "u": 0.5167464114832536, - "t": 0.6962996389891697, - "punct": 0.8410872974385781 - }, - "ud": { - "u": 0.4709480122324159, - "t": 0.971028971028971, - "punct": 0.9939879759519038 - } - }, - "zu": { - "opus100": { - "u": 0.8233102714209686, - "t": 0.8365580448065173, - "punct": 0.8937315039009954 - } - } -} \ No newline at end of file diff --git a/wtpsplit/evaluation/evaluation_results/wtp-bert-tiny_intrinsic_results.json b/wtpsplit/evaluation/evaluation_results/wtp-bert-tiny_intrinsic_results.json deleted file mode 100644 index c8c4602b..00000000 --- a/wtpsplit/evaluation/evaluation_results/wtp-bert-tiny_intrinsic_results.json +++ /dev/null @@ -1,977 +0,0 @@ -{ - "af": { - "opus100": { - "u": 0.7015956180042868, - "t": 0.718082788671024, - "punct": 0.8136407300672431 - }, - "ud": { - "u": 0.9465478841870824, - "t": 0.9976525821596244, - "punct": 0.9988249118683901 - } - }, - "am": { - "opus100": { - "u": 0.5158638829524906, - "t": 0.4637281910009183, - "punct": 0.5807962529274004 - } - }, - "ar": { - "ersatz": { - "u": 0.8155628155628156, - "t": 0.7814100039385585, - "punct": 0.9067229848644843 - }, - "opus100": { - "u": 0.5555267926482008, - "t": 0.6051637279596978, - "punct": 0.6197982345523328 - }, - "ud": { - "u": 0.670547147846333, - "t": 0.8116560056858564, - "punct": 0.8179190751445087 - } - }, - "az": { - "opus100": { - "u": 0.49191848208011235, - "t": 0.7065217391304347, - "punct": 0.8082740213523132 - } - }, - "be": { - "opus100": { - "u": 0.6700421940928271, - "t": 0.666088088522456, - "punct": 0.8074055026999228 - }, - "ud": { - "u": 0.7622917513206015, - "t": 0.8084133516232281, - "punct": 0.8461916461916462 - } - }, - "bg": { - "opus100": { - "u": 0.9215962441314555, - "t": 0.930054905705419, - "punct": 0.9559623948540326 - }, - "ud": { - "u": 0.9257340241796199, - "t": 0.9325997248968363, - "punct": 0.9584664536741213 - } - }, - "bn": { - "opus100": { - "u": 0.7520927237604635, - "t": 0.8056849953401678, - "punct": 0.8342298288508557 - }, - "ud": { - "u": 0.9824561403508771, - "t": null, - "punct": null - } - }, - "ca": { - "opus100": { - "u": 0.8664660958111547, - "t": 0.8787584869059165, - "punct": 0.9105020616056271 - }, - "ud": { - "u": 0.9039215686274509, - "t": 0.9879129734085415, - "punct": 0.9964836353800379 - } - }, - "ceb": { - "ud": { - "u": 0.9973474801061007, - "t": null, - "punct": null - } - }, - "cs": { - "ersatz": { - "u": 0.900896008688569, - "t": 0.9595989383662636, - "punct": 0.9676077265973254 - }, - "opus100": { - "u": 0.8332605848974246, - "t": 0.8775948460987831, - "punct": 0.9172743273824491 - }, - "ud": { - "u": 0.8110325565156672, - "t": 0.8806473364801078, - "punct": 0.8781939965269165 - } - }, - "cy": { - "opus100": { - "u": 0.6215182960131076, - "t": 0.6704753199268738, - "punct": 0.7520976353928298 - }, - "ud": { - "u": 0.9612640163098877, - "t": 0.9688841201716739, - "punct": 0.990546218487395 - } - }, - "da": { - "opus100": { - "u": 0.8311062431544358, - "t": 0.872148288973384, - "punct": 0.9166871467190956 - }, - "ud": { - "u": 0.8616352201257863, - "t": 0.9228039041703637, - "punct": 0.9488536155202822 - } - }, - "de": { - "ersatz": { - "u": 0.8593930970581423, - "t": 0.9338140612928149, - "punct": 0.9587762509722583 - }, - "opus100": { - "u": 0.6693227091633467, - "t": 0.7412280701754386, - "punct": 0.7802141764405915 - }, - "ud": { - "u": 0.8522144522144521, - "t": 0.9237199582027168, - "punct": 0.921832884097035 - } - }, - "el": { - "opus100": { - "u": 0.9021335807050093, - "t": 0.913064133016627, - "punct": 0.9485094850948511 - }, - "ud": { - "u": 0.90927624872579, - "t": 0.9435215946843853, - "punct": 0.9496080627099664 - } - }, - "en": { - "ersatz": { - "u": 0.7561907597637835, - "t": 0.8670212765957446, - "punct": 0.9088375039594552 - }, - "opus100": { - "u": 0.8781211372064276, - "t": 0.859417652411283, - "punct": 0.9078384798099762 - }, - "ud": { - "u": 0.8795232013622818, - "t": 0.9037238873751136, - "punct": 0.9409005628517825 - } - }, - "eo": { - "opus100": { - "u": 0.8960975609756097, - "t": 0.8944099378881987, - "punct": 0.932422542083435 - } - }, - "es": { - "ersatz": { - "u": 0.9126094301950545, - "t": 0.9544093178036606, - "punct": 0.973718791064389 - }, - "opus100": { - "u": 0.8411007545494895, - "t": 0.8813397129186603, - "punct": 0.9149777558082056 - }, - "ud": { - "u": 0.897463002114165, - "t": 0.981922525107604, - "punct": 0.9927473165071076 - } - }, - "et": { - "ersatz": { - "u": 0.8796958855098389, - "t": 0.9629814907453726, - "punct": 0.9721946375372393 - }, - "opus100": { - "u": 0.8000872981230903, - "t": 0.8501240694789083, - "punct": 0.8961885656970913 - }, - "ud": { - "u": 0.8313592780597857, - "t": 0.9057971014492754, - "punct": 0.9498647143084513 - } - }, - "eu": { - "opus100": { - "u": 0.8514526710402999, - "t": 0.8499305233904586, - "punct": 0.8866855524079319 - }, - "ud": { - "u": 0.9450144698763484, - "t": 0.9914151204652452, - "punct": 0.9977777777777778 - } - }, - "fa": { - "opus100": { - "u": 0.5016385177716158, - "t": 0.5443686006825939, - "punct": 0.5947295423023579 - }, - "ud": { - "u": 0.953160825417622, - "t": 0.9931693989071039, - "punct": 0.9986263736263736 - } - }, - "fi": { - "ersatz": { - "u": 0.8645690834473324, - "t": 0.9565442317640972, - "punct": 0.9641209636084059 - }, - "opus100": { - "u": 0.8897041962852557, - "t": 0.9253658536585366, - "punct": 0.947290274684484 - }, - "ud": { - "u": 0.8403118683222639, - "t": 0.9085696081554635, - "punct": 0.9357259380097879 - } - }, - "fr": { - "ersatz": { - "u": 0.8697886921758995, - "t": 0.9554606467358147, - "punct": 0.9633958781913257 - }, - "opus100": { - "u": 0.8470376210858301, - "t": null, - "punct": null - }, - "ud": { - "u": 0.8685344827586207, - "t": 0.9596199524940617, - "punct": 0.9743589743589745 - } - }, - "fy": { - "opus100": { - "u": 0.6976096542121142, - "t": 0.69844943300162, - "punct": 0.8449391664509449 - } - }, - "ga": { - "opus100": { - "u": 0.6617067833698032, - "t": 0.6497163269594453, - "punct": 0.7316271425217188 - }, - "ud": { - "u": 0.8398510242085662, - "t": 0.9174503657262278, - "punct": 0.9770992366412214 - } - }, - "gd": { - "opus100": { - "u": 0.6442166910688142, - "t": 0.7189045264359072, - "punct": 0.855268911237429 - }, - "ud": { - "u": 0.6231884057971013, - "t": 0.6482758620689655, - "punct": 0.701123595505618 - } - }, - "gl": { - "opus100": { - "u": 0.8635315813739052, - "t": 0.8624942896299681, - "punct": 0.9130538922155689 - }, - "ud": { - "u": 0.8857808857808858, - "t": 0.9464285714285715, - "punct": 0.9445129469790382 - } - }, - "gu": { - "ersatz": { - "u": 0.8491773308957952, - "t": 0.9246435845213848, - "punct": 0.931910569105691 - }, - "opus100": { - "u": 0.5310734463276836, - "t": 0.5376926280716368, - "punct": 0.6523652365236523 - } - }, - "ha": { - "opus100": { - "u": 0.8057142857142857, - "t": 0.9019426456984273, - "punct": 0.9082375029335837 - } - }, - "he": { - "opus100": { - "u": 0.9123723581097127, - "t": 0.9149797570850203, - "punct": 0.9395701643489255 - }, - "ud": { - "u": 0.7801724137931034, - "t": 0.940554821664465, - "punct": 0.9479305740987984 - } - }, - "hi": { - "ersatz": { - "u": 0.8281516911513496, - "t": 0.914901809780516, - "punct": 0.9438480594549958 - }, - "opus100": { - "u": 0.5520788684097728, - "t": 0.5160109789569992, - "punct": 0.638583912166704 - }, - "ud": { - "u": 0.876633559853633, - "t": 0.9677043933663079, - "punct": 0.9970238095238096 - } - }, - "hu": { - "opus100": { - "u": 0.8891928864569084, - "t": 0.9199711607786589, - "punct": 0.9480934809348094 - }, - "ud": { - "u": 0.7780925401322002, - "t": 0.9769484083424808, - "punct": 0.9811320754716981 - } - }, - "hy": { - "opus100": { - "u": 0.7941584616401243, - "t": null, - "punct": null - }, - "ud": { - "u": 0.853008377760853, - "t": 0.880275624461671, - "punct": 0.9248251748251747 - } - }, - "id": { - "opus100": { - "u": 0.8787010506208213, - "t": 0.9074582190072337, - "punct": 0.929923273657289 - }, - "ud": { - "u": 0.9337094499294781, - "t": null, - "punct": null - } - }, - "ig": { - "opus100": { - "u": 0.7773025377815795, - "t": 0.7957101752550354, - "punct": 0.8679800058806234 - } - }, - "is": { - "opus100": { - "u": 0.9233301975540922, - "t": 0.9453125, - "punct": 0.9591937069813176 - }, - "ud": { - "u": 0.8294363770612847, - "t": 0.9005339849760159, - "punct": 0.9460876076375889 - } - }, - "it": { - "opus100": { - "u": 0.8141971349155441, - "t": 0.8580720092915215, - "punct": 0.8964346349745331 - }, - "ud": { - "u": 0.9385365853658536, - "t": 0.9687184661957619, - "punct": 0.976313079299691 - } - }, - "ja": { - "ersatz": { - "u": 0.8298245614035087, - "t": 0.841078066914498, - "punct": 0.9237536656891495 - }, - "opus100": { - "u": 0.47233748271092674, - "t": 0.5304818092428711, - "punct": 0.760578468130691 - }, - "ud": { - "u": 0.9511754068716094, - "t": 0.9545032497678738, - "punct": 0.9785647716682199 - } - }, - "jv": { - "ud": { - "u": 0.8741258741258742, - "t": null, - "punct": null - } - }, - "ka": { - "opus100": { - "u": 0.9196188614708039, - "t": 0.9213759213759214, - "punct": 0.9299678297451126 - } - }, - "kk": { - "ersatz": { - "u": 0.8694516971279372, - "t": 0.9714857428714357, - "punct": 0.9856790123456791 - }, - "opus100": { - "u": 0.7573079537729436, - "t": 0.7506029908345393, - "punct": 0.8537220197813639 - }, - "ud": { - "u": 0.9396396396396396, - "t": 0.9119840875186475, - "punct": 0.9410618606916707 - } - }, - "km": { - "ersatz": { - "u": 0.8973244911959753, - "t": 0.906906906906907, - "punct": 0.9004587155963303 - }, - "opus100": { - "u": 0.484312859036677, - "t": 0.5987025023169601, - "punct": 0.6685552407932012 - } - }, - "kn": { - "opus100": { - "u": 0.5091684434968017, - "t": 0.510221465076661, - "punct": 0.5983009708737863 - } - }, - "ko": { - "opus100": { - "u": 0.5140822272580123, - "t": 0.5579817680729277, - "punct": 0.7518981141317659 - }, - "ud": { - "u": 0.966744333827579, - "t": 0.9865100087032203, - "punct": 0.9986876640419947 - } - }, - "ku": { - "opus100": { - "u": 0.7050742300022159, - "t": 0.33918128654970764, - "punct": 0.7530827067669174 - } - }, - "ky": { - "opus100": { - "u": 0.7780195865070729, - "t": 0.7770906157569238, - "punct": 0.836950146627566 - } - }, - "la": { - "ud": { - "u": 0.9176161262050833, - "t": 0.9176524112829846, - "punct": 0.9560256714998813 - } - }, - "lt": { - "ersatz": { - "u": 0.7636508569151056, - "t": 0.8588899341486359, - "punct": 0.9266802443991854 - }, - "opus100": { - "u": 0.6679146387120929, - "t": 0.7488883688275215, - "punct": 0.8019323671497585 - }, - "ud": { - "u": 0.8737607402511567, - "t": 0.9383611312545324, - "punct": 0.943904263275991 - } - }, - "lv": { - "ersatz": { - "u": 0.8566523605150215, - "t": 0.9775474956822108, - "punct": 0.9817733990147782 - }, - "opus100": { - "u": 0.6008316008316008, - "t": 0.7431348724179829, - "punct": 0.8190715181932245 - }, - "ud": { - "u": 0.8971450919045757, - "t": 0.944204152249135, - "punct": 0.9814655172413793 - } - }, - "mg": { - "opus100": { - "u": 0.712599187149216, - "t": 0.8738807879253004, - "punct": 0.8664027709054923 - } - }, - "mk": { - "opus100": { - "u": 0.8685764268532693, - "t": 0.9172123479887745, - "punct": 0.9527043414989086 - } - }, - "ml": { - "opus100": { - "u": 0.7880055788005579, - "t": 0.811202800700175, - "punct": 0.8333333333333334 - } - }, - "mn": { - "opus100": { - "u": 0.7745554035567715, - "t": null, - "punct": null - } - }, - "mr": { - "opus100": { - "u": 0.852116024726581, - "t": 0.8656204021579205, - "punct": 0.9010933129926265 - }, - "ud": { - "u": 0.9292929292929293, - "t": 0.9032258064516129, - "punct": 0.8705882352941177 - } - }, - "ms": { - "opus100": { - "u": 0.8477425552353506, - "t": 0.8686299615877081, - "punct": 0.9009457441513191 - } - }, - "mt": { - "opus100": { - "u": 0.6026974951830443, - "t": 0.708006279434851, - "punct": 0.7571497818710615 - }, - "ud": { - "u": 0.8417329796640142, - "t": 0.8840304182509505, - "punct": 0.888030888030888 - } - }, - "my": { - "opus100": { - "u": 0.38558352402746005, - "t": 0.4022842639593908, - "punct": 0.6842259291776412 - } - }, - "ne": { - "opus100": { - "u": 0.5066864784546805, - "t": 0.518348623853211, - "punct": 0.613861386138614 - } - }, - "nl": { - "opus100": { - "u": 0.9092172301991662, - "t": null, - "punct": null - }, - "ud": { - "u": 0.8624708624708625, - "t": 0.9217830109335576, - "punct": 0.9221453287197232 - } - }, - "no": { - "opus100": { - "u": 0.934735835524743, - "t": 0.9441354292623942, - "punct": 0.9568838804507594 - }, - "ud": { - "u": 0.9339925834363412, - "t": 0.9655172413793104, - "punct": 0.9711488923235446 - } - }, - "pa": { - "opus100": { - "u": 0.541077085533263, - "t": 0.5458996328029376, - "punct": 0.6440154440154441 - } - }, - "pl": { - "ersatz": { - "u": 0.8152593227603944, - "t": 0.9250125817815802, - "punct": 0.9429581019687027 - }, - "opus100": { - "u": 0.8917987594762233, - "t": 0.9179140511830033, - "punct": 0.9431902753659142 - }, - "ud": { - "u": 0.8968335035750765, - "t": 0.9496937882764654, - "punct": 0.9756541524459613 - } - }, - "ps": { - "ersatz": { - "u": 0.8733031674208145, - "t": 0.9424501424501425, - "punct": 0.9429383886255924 - }, - "opus100": { - "u": 0.5847874720357942, - "t": 0.5855161787365177, - "punct": 0.67651235444476 - } - }, - "pt": { - "opus100": { - "u": 0.8580385132109271, - "t": 0.9078219013237063, - "punct": 0.931358024691358 - }, - "ud": { - "u": 0.8480184686417853, - "t": 0.9214986619090099, - "punct": 0.9480576167612397 - } - }, - "ro": { - "ersatz": { - "u": 0.8901226000462641, - "t": 0.9504596527068437, - "punct": 0.9606259464916709 - }, - "opus100": { - "u": 0.8632743362831858, - "t": 0.9063600093874676, - "punct": 0.9538002980625931 - }, - "ud": { - "u": 0.8161365399534523, - "t": 0.9368800721370604, - "punct": 0.991908614945264 - } - }, - "ru": { - "ersatz": { - "u": 0.818221638195357, - "t": 0.9134020618556701, - "punct": 0.9419831223628691 - }, - "opus100": { - "u": 0.6395537160338313, - "t": null, - "punct": null - }, - "ud": { - "u": 0.7547169811320755, - "t": 0.8258138206739006, - "punct": 0.8561682774303582 - } - }, - "si": { - "opus100": { - "u": 0.783987915407855, - "t": 0.8094479830148619, - "punct": 0.830575256107171 - } - }, - "sk": { - "opus100": { - "u": 0.8599377501111606, - "t": 0.9046584600531016, - "punct": 0.9290259422418012 - }, - "ud": { - "u": 0.8082191780821917, - "t": 0.8597415031115366, - "punct": 0.8809407153356198 - } - }, - "sl": { - "opus100": { - "u": 0.8885350318471338, - "t": 0.9140942809284517, - "punct": 0.9314079422382672 - }, - "ud": { - "u": 0.8743859649122807, - "t": 0.9658611430763331, - "punct": 0.9835809225957779 - } - }, - "sq": { - "opus100": { - "u": 0.8124868171271884, - "t": 0.8986340762041696, - "punct": 0.9406163868704585 - }, - "ud": { - "u": 0.9917355371900827, - "t": null, - "punct": null - } - }, - "sr": { - "opus100": { - "u": 0.9189189189189189, - "t": 0.9420009666505558, - "punct": 0.9580573951434879 - }, - "ud": { - "u": 0.7993680884676145, - "t": 0.9707031249999999, - "punct": 0.9774288518155054 - } - }, - "sv": { - "opus100": { - "u": 0.894712643678161, - "t": 0.9256756756756758, - "punct": 0.9420077972709551 - }, - "ud": { - "u": 0.9120728929384966, - "t": 0.9358851674641148, - "punct": 0.9526542324246772 - } - }, - "ta": { - "ersatz": { - "u": 0.8581255374032675, - "t": 0.9395843402609957, - "punct": 0.9402697495183044 - }, - "opus100": { - "u": 0.4773922187171398, - "t": 0.47893301000303123, - "punct": 0.5889212827988339 - }, - "ud": { - "u": 0.8823529411764706, - "t": 0.9486166007905138, - "punct": 0.9558232931726908 - } - }, - "te": { - "opus100": { - "u": 0.6797006999758629, - "t": 0.6950959488272921, - "punct": 0.7597163120567376 - } - }, - "tg": { - "opus100": { - "u": 0.7460004383081307, - "t": 0.7447786131996658, - "punct": 0.8717827626918536 - } - }, - "th": { - "opus100": { - "u": 0.6650709522995384, - "t": 0.6694766420793818, - "punct": 0.6797773654916512 - }, - "ud": { - "u": 0.5011343584572725, - "t": null, - "punct": null - } - }, - "tr": { - "ersatz": { - "u": 0.8237646386746643, - "t": 0.934352667442247, - "punct": 0.9614223414961421 - }, - "opus100": { - "u": 0.9156626506024096, - "t": 0.9342105263157895, - "punct": 0.9490273331691702 - }, - "ud": { - "u": 0.9278252611585945, - "t": 0.956308296514482, - "punct": 0.9903602232369355 - } - }, - "uk": { - "opus100": { - "u": 0.8741355463347165, - "t": 0.8852459016393444, - "punct": 0.9236381322957198 - }, - "ud": { - "u": 0.8219306466729147, - "t": 0.9271948608137044, - "punct": 0.9651293588301463 - } - }, - "ur": { - "opus100": { - "u": 0.39232303090727816, - "t": 0.38651247039152853, - "punct": 0.509812108559499 - }, - "ud": { - "u": 0.8776859504132232, - "t": 0.985941893158388, - "punct": 0.9897100093545369 - } - }, - "uz": { - "opus100": { - "u": 0.7003703703703704, - "t": 0.7485004284490145, - "punct": 0.8151041666666667 - } - }, - "vi": { - "opus100": { - "u": 0.8995024875621891, - "t": 0.9025012761613068, - "punct": 0.9352226720647774 - }, - "ud": { - "u": 0.8980281690140844, - "t": 0.9516324062877871, - "punct": 0.9684601113172543 - } - }, - "xh": { - "opus100": { - "u": 0.7358974358974358, - "t": 0.7437185929648241, - "punct": 0.8589775253148926 - } - }, - "yi": { - "opus100": { - "u": 0.41967871485943775, - "t": 0.6143633071816536, - "punct": 0.6944024205748865 - } - }, - "yo": { - "opus100": { - "u": 0.6851424806456927, - "t": null, - "punct": null - }, - "ud": { - "u": 0.7649208282582216, - "t": null, - "punct": null - } - }, - "zh": { - "ersatz": { - "u": 0.6756915832842849, - "t": 0.8161413562559695, - "punct": 0.9216520650813517 - }, - "opus100": { - "u": 0.5599556909443368, - "t": 0.6201888162672475, - "punct": 0.7806671920147097 - }, - "ud": { - "u": 0.6961038961038961, - "t": 0.945382323733863, - "punct": 0.9959839357429718 - } - }, - "zu": { - "opus100": { - "u": 0.8099974496301964, - "t": 0.831400966183575, - "punct": 0.8736594297671985 - } - } -} \ No newline at end of file diff --git a/wtpsplit/evaluation/evaluation_results/wtp-canine-s-12l-no-adapters_intrinsic_results.json b/wtpsplit/evaluation/evaluation_results/wtp-canine-s-12l-no-adapters_intrinsic_results.json deleted file mode 100644 index 0a0ac95b..00000000 --- a/wtpsplit/evaluation/evaluation_results/wtp-canine-s-12l-no-adapters_intrinsic_results.json +++ /dev/null @@ -1,977 +0,0 @@ -{ - "af": { - "opus100": { - "u": 0.765533111736129, - "t": 0.7920133111480866, - "punct": 0.883944374209861 - }, - "ud": { - "u": 0.9747706422018348, - "t": 0.9906759906759907, - "punct": 0.9929906542056075 - } - }, - "am": { - "opus100": { - "u": 0.5900526226447123, - "t": 0.6281961060286184, - "punct": 0.6986692015209125 - } - }, - "ar": { - "ersatz": { - "u": 0.8940182054616385, - "t": 0.8753993610223643, - "punct": 0.9252209381373215 - }, - "opus100": { - "u": 0.6798614547253835, - "t": 0.6784615384615384, - "punct": 0.7693069306930694 - }, - "ud": { - "u": 0.8289637952559301, - "t": 0.8765264586160109, - "punct": 0.8931351733899505 - } - }, - "az": { - "opus100": { - "u": 0.7594339622641509, - "t": 0.7880810488676996, - "punct": 0.8443271767810028 - } - }, - "be": { - "opus100": { - "u": 0.710348162475822, - "t": 0.7216447216447216, - "punct": 0.8920425747592499 - }, - "ud": { - "u": 0.893212669683258, - "t": 0.8972296693476317, - "punct": 0.9508970727101038 - } - }, - "bg": { - "opus100": { - "u": 0.9399377841588897, - "t": 0.932323710364411, - "punct": 0.9647749510763209 - }, - "ud": { - "u": 0.9833707865168538, - "t": 0.9825347066726378, - "punct": 0.9959659345584939 - } - }, - "bn": { - "opus100": { - "u": 0.7958069447477615, - "t": 0.8307767226686146, - "punct": 0.8724035608308605 - }, - "ud": { - "u": 0.9911504424778761, - "t": null, - "punct": null - } - }, - "ca": { - "opus100": { - "u": 0.8949343339587242, - "t": 0.8986262434864992, - "punct": 0.946380034593526 - }, - "ud": { - "u": 0.9857181352735113, - "t": 0.9867531765341984, - "punct": 0.998917748917749 - } - }, - "ceb": { - "ud": { - "u": 0.9973474801061007, - "t": null, - "punct": null - } - }, - "cs": { - "ersatz": { - "u": 0.9347305389221556, - "t": 0.9476761180941246, - "punct": 0.9921716439547694 - }, - "opus100": { - "u": 0.8920166128287956, - "t": 0.9020341360766893, - "punct": 0.9500124347177319 - }, - "ud": { - "u": 0.9288367255979795, - "t": 0.9289146480408184, - "punct": 0.964886397167306 - } - }, - "cy": { - "opus100": { - "u": 0.7388429752066115, - "t": 0.7612269938650307, - "punct": 0.8189679218967921 - }, - "ud": { - "u": 0.9926470588235294, - "t": 0.9926547743966422, - "punct": 0.9957983193277311 - } - }, - "da": { - "opus100": { - "u": 0.9010633379565418, - "t": 0.9126946672958943, - "punct": 0.9511718749999999 - }, - "ud": { - "u": 0.9477351916376308, - "t": 0.9373881932021467, - "punct": 0.9866190900981268 - } - }, - "de": { - "ersatz": { - "u": 0.958883628208323, - "t": 0.9598383430159132, - "punct": 0.9921139659119816 - }, - "opus100": { - "u": 0.8073555166374781, - "t": 0.855702280912365, - "punct": 0.9017281403146763 - }, - "ud": { - "u": 0.9608938547486033, - "t": 0.9604925602873268, - "punct": 0.9717573221757322 - } - }, - "el": { - "opus100": { - "u": 0.9223852342640796, - "t": 0.9319271332694153, - "punct": 0.9625984251968502 - }, - "ud": { - "u": 0.9694323144104805, - "t": 0.9675251959686451, - "punct": 0.9767441860465117 - } - }, - "en": { - "ersatz": { - "u": 0.9715676229508197, - "t": 0.9726973471890531, - "punct": 0.9901852453688658 - }, - "opus100": { - "u": 0.9179122764028425, - "t": 0.9099976252671574, - "punct": 0.9502084866323278 - }, - "ud": { - "u": 0.9465095194922938, - "t": 0.9464123524069029, - "punct": 0.9709006928406466 - } - }, - "eo": { - "opus100": { - "u": 0.920448999511957, - "t": 0.9189320388349514, - "punct": 0.9577187807276303 - } - }, - "es": { - "ersatz": { - "u": 0.9871062510200751, - "t": 0.9869067103109657, - "punct": 0.9952622120568535 - }, - "opus100": { - "u": 0.920416864045476, - "t": 0.924726060028585, - "punct": 0.9556263790144643 - }, - "ud": { - "u": 0.9724770642201835, - "t": 0.9740184757505774, - "punct": 0.9973890339425587 - } - }, - "et": { - "ersatz": { - "u": 0.959188721246599, - "t": 0.952092788703984, - "punct": 0.994297049342921 - }, - "opus100": { - "u": 0.8434839554682384, - "t": 0.8832866479925304, - "punct": 0.9516765285996056 - }, - "ud": { - "u": 0.9401234567901234, - "t": 0.939729397293973, - "punct": 0.9817044566067239 - } - }, - "eu": { - "opus100": { - "u": 0.8704639412034911, - "t": 0.871889400921659, - "punct": 0.9290259422418011 - }, - "ud": { - "u": 0.9716175254891154, - "t": 0.9744586340921709, - "punct": 0.998609952738393 - } - }, - "fa": { - "opus100": { - "u": 0.6015073009891663, - "t": 0.6019417475728156, - "punct": 0.7280376144518684 - }, - "ud": { - "u": 0.9732441471571907, - "t": 0.9807367353835754, - "punct": 0.9989701338825953 - } - }, - "fi": { - "ersatz": { - "u": 0.9722013523666416, - "t": 0.9726775956284153, - "punct": 0.9952488122030507 - }, - "opus100": { - "u": 0.9202915588996002, - "t": 0.9324681566931026, - "punct": 0.964014687882497 - }, - "ud": { - "u": 0.9482046393390532, - "t": 0.9462435233160622, - "punct": 0.9828423438005828 - } - }, - "fr": { - "ersatz": { - "u": 0.97610513739546, - "t": 0.9686834904226209, - "punct": 0.9897528631705846 - }, - "opus100": { - "u": 0.884580291970803, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9671361502347419, - "t": 0.970344009489917, - "punct": 0.9915764139590855 - } - }, - "fy": { - "opus100": { - "u": 0.6572008113590263, - "t": 0.6978094484032725, - "punct": 0.8897596656217346 - } - }, - "ga": { - "opus100": { - "u": 0.7926023778071334, - "t": 0.7920792079207921, - "punct": 0.888055972013993 - }, - "ud": { - "u": 0.8667953667953668, - "t": 0.905349794238683, - "punct": 0.9878453038674033 - } - }, - "gd": { - "opus100": { - "u": 0.8315473639091299, - "t": 0.8345740281224151, - "punct": 0.9254829806807727 - }, - "ud": { - "u": 0.7150715071507151, - "t": 0.7378640776699029, - "punct": 0.7831207065750735 - } - }, - "gl": { - "opus100": { - "u": 0.9012287334593573, - "t": 0.9022485207100591, - "punct": 0.9416687176962835 - }, - "ud": { - "u": 0.9887359198998749, - "t": 0.9899497487437187, - "punct": 0.9875621890547265 - } - }, - "gu": { - "ersatz": { - "u": 0.9084041548630784, - "t": 0.8970163618864293, - "punct": 0.9768586903003447 - }, - "opus100": { - "u": 0.7049910873440285, - "t": 0.718351324828263, - "punct": 0.7772435897435898 - } - }, - "ha": { - "opus100": { - "u": 0.8213087248322147, - "t": 0.8576206402293359, - "punct": 0.9179389312977099 - } - }, - "he": { - "opus100": { - "u": 0.9067234848484849, - "t": 0.9043355325164938, - "punct": 0.9431137724550896 - }, - "ud": { - "u": 0.9615861214374226, - "t": 0.9569620253164556, - "punct": 0.9732484076433121 - } - }, - "hi": { - "ersatz": { - "u": 0.9502314814814814, - "t": 0.9472849782005548, - "punct": 0.97514033680834 - }, - "opus100": { - "u": 0.6652977412731005, - "t": 0.6513102282333052, - "punct": 0.7567181926278241 - }, - "ud": { - "u": 0.9669947886508397, - "t": 0.9690601284296556, - "punct": 0.9985158800831107 - } - }, - "hu": { - "opus100": { - "u": 0.9291637052831082, - "t": 0.9336832061068703, - "punct": 0.9663803680981595 - }, - "ud": { - "u": 0.9664429530201342, - "t": 0.9652076318742985, - "punct": 0.9944382647385984 - } - }, - "hy": { - "opus100": { - "u": 0.8681114551083592, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9688013136288999, - "t": 0.9727947238252268, - "punct": 0.9799666110183639 - } - }, - "id": { - "opus100": { - "u": 0.8975230694511899, - "t": 0.9065281899109793, - "punct": 0.9475269897062516 - }, - "ud": { - "u": 0.9836390679226573, - "t": null, - "punct": null - } - }, - "ig": { - "opus100": { - "u": 0.8189785433665759, - "t": 0.8318531675547661, - "punct": 0.8943985307621671 - } - }, - "is": { - "opus100": { - "u": 0.9477012892240332, - "t": 0.9509803921568626, - "punct": 0.969786293294031 - }, - "ud": { - "u": 0.8612963913117837, - "t": 0.892476011120079, - "punct": 0.9673777862814503 - } - }, - "it": { - "opus100": { - "u": 0.8732899753307916, - "t": 0.8933302347199629, - "punct": 0.9479090242112986 - }, - "ud": { - "u": 0.9544554455445544, - "t": 0.9497536945812808, - "punct": 0.9968976215098242 - } - }, - "ja": { - "ersatz": { - "u": 0.8203799654576857, - "t": 0.8249780123131046, - "punct": 0.9532357109116675 - }, - "opus100": { - "u": 0.416860015467904, - "t": 0.7776744186046513, - "punct": 0.8620170597089813 - }, - "ud": { - "u": 0.9601449275362317, - "t": 0.9674418604651163, - "punct": 0.9851301115241634 - } - }, - "jv": { - "ud": { - "u": 0.9652509652509652, - "t": null, - "punct": null - } - }, - "ka": { - "opus100": { - "u": 0.9122807017543859, - "t": 0.9083073976538185, - "punct": 0.935244686109738 - } - }, - "kk": { - "ersatz": { - "u": 0.9604406609914873, - "t": 0.9604406609914873, - "punct": 0.998503740648379 - }, - "opus100": { - "u": 0.7403345215245408, - "t": 0.7555673382820786, - "punct": 0.912280701754386 - }, - "ud": { - "u": 0.9649207111965401, - "t": 0.9280322093608456, - "punct": 0.9796116504854369 - } - }, - "km": { - "ersatz": { - "u": 0.828270796108795, - "t": 0.9101303453007089, - "punct": 0.9149176258180998 - }, - "opus100": { - "u": 0.6928361138370951, - "t": 0.690566037735849, - "punct": 0.7809098520792631 - } - }, - "kn": { - "opus100": { - "u": 0.6932923707117256, - "t": 0.657694558221182, - "punct": 0.7707865168539326 - } - }, - "ko": { - "opus100": { - "u": 0.5615162222936074, - "t": 0.7116509221772379, - "punct": 0.8023032629558541 - }, - "ud": { - "u": 0.9913194444444444, - "t": 0.9919372412290259, - "punct": 0.9986893840104849 - } - }, - "ku": { - "opus100": { - "u": 0.7924528301886793, - "t": 0.6713153724247226, - "punct": 0.8068924539512775 - } - }, - "ky": { - "opus100": { - "u": 0.8395061728395062, - "t": 0.8449656750572082, - "punct": 0.910655981003265 - } - }, - "la": { - "ud": { - "u": 0.9170984455958548, - "t": 0.9196914700544464, - "punct": 0.9667142177841179 - } - }, - "lt": { - "ersatz": { - "u": 0.9684526790185277, - "t": 0.9647177419354839, - "punct": 0.9924812030075189 - }, - "opus100": { - "u": 0.8088794926004228, - "t": 0.8562818729717199, - "punct": 0.9153884215734786 - }, - "ud": { - "u": 0.9746192893401016, - "t": 0.9773226042428675, - "punct": 0.9896907216494846 - } - }, - "lv": { - "ersatz": { - "u": 0.9722627737226277, - "t": 0.9690721649484537, - "punct": 0.995546759030183 - }, - "opus100": { - "u": 0.8226643598615917, - "t": 0.8660230751118437, - "punct": 0.914553065747353 - }, - "ud": { - "u": 0.9608479342418343, - "t": 0.9630104768013684, - "punct": 0.9916147065147279 - } - }, - "mg": { - "opus100": { - "u": 0.8924780802953393, - "t": 0.917228103946102, - "punct": 0.9608771492648891 - } - }, - "mk": { - "opus100": { - "u": 0.9276516038398502, - "t": 0.9340607210626186, - "punct": 0.9598062953995158 - } - }, - "ml": { - "opus100": { - "u": 0.8062898814949863, - "t": 0.8112874779541446, - "punct": 0.8656361474435197 - } - }, - "mn": { - "opus100": { - "u": 0.7772485289997199, - "t": null, - "punct": null - } - }, - "mr": { - "opus100": { - "u": 0.8933171324422842, - "t": 0.894240317775571, - "punct": 0.919260700389105 - }, - "ud": { - "u": 0.9183673469387754, - "t": 0.9108910891089108, - "punct": 0.9787234042553191 - } - }, - "ms": { - "opus100": { - "u": 0.884313725490196, - "t": 0.8840970350404312, - "punct": 0.9434343434343434 - } - }, - "mt": { - "opus100": { - "u": 0.6731426515560354, - "t": 0.8103033220991815, - "punct": 0.8929529383077298 - }, - "ud": { - "u": 0.8970727101038716, - "t": 0.876847290640394, - "punct": 0.9436893203883494 - } - }, - "my": { - "opus100": { - "u": 0.6565404670142825, - "t": 0.7073673364245234, - "punct": 0.8065366367949395 - } - }, - "ne": { - "opus100": { - "u": 0.7088856161021109, - "t": 0.7094576892847785, - "punct": 0.7342391304347827 - } - }, - "nl": { - "opus100": { - "u": 0.9249941217963791, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9328859060402684, - "t": 0.9174757281553398, - "punct": 0.9733777038269552 - } - }, - "no": { - "opus100": { - "u": 0.9497938394373029, - "t": 0.9513145082765336, - "punct": 0.9676945668135095 - }, - "ud": { - "u": 0.9775222164140094, - "t": 0.9855446566855963, - "punct": 0.9956174271719515 - } - }, - "pa": { - "opus100": { - "u": 0.643329658213892, - "t": 0.6673448626653103, - "punct": 0.7664827948515891 - } - }, - "pl": { - "ersatz": { - "u": 0.954248366013072, - "t": 0.945010183299389, - "punct": 0.991504247876062 - }, - "opus100": { - "u": 0.9243256147051803, - "t": 0.9320106460198402, - "punct": 0.9607458292443571 - }, - "ud": { - "u": 0.9572984749455339, - "t": 0.9575070821529745, - "punct": 0.9931880108991825 - } - }, - "ps": { - "ersatz": { - "u": 0.8681991069952043, - "t": 0.9199496312286384, - "punct": 0.9569449644327966 - }, - "opus100": { - "u": 0.6355987055016181, - "t": 0.6987688098495213, - "punct": 0.7511210762331839 - } - }, - "pt": { - "opus100": { - "u": 0.9096005606166784, - "t": 0.9219419324131366, - "punct": 0.9582009288682474 - }, - "ud": { - "u": 0.9627089584226317, - "t": 0.9609507640067912, - "punct": 0.9853574504737295 - } - }, - "ro": { - "ersatz": { - "u": 0.9759960405840139, - "t": 0.9628130533771818, - "punct": 0.9955022488755622 - }, - "opus100": { - "u": 0.9082867783985101, - "t": 0.9147031102733271, - "punct": 0.9719950433705081 - }, - "ud": { - "u": 0.8109567901234568, - "t": 0.914018691588785, - "punct": 0.9942748091603053 - } - }, - "ru": { - "ersatz": { - "u": 0.9797775530839231, - "t": 0.9789156626506025, - "punct": 0.9939271255060729 - }, - "opus100": { - "u": 0.8385826771653543, - "t": null, - "punct": null - }, - "ud": { - "u": 0.863849765258216, - "t": 0.8717948717948718, - "punct": 0.9309376754632229 - } - }, - "si": { - "opus100": { - "u": 0.8027831094049904, - "t": 0.8115222249813757, - "punct": 0.8634538152610443 - } - }, - "sk": { - "opus100": { - "u": 0.9192867198498357, - "t": 0.9322979859257461, - "punct": 0.9635647464303299 - }, - "ud": { - "u": 0.9586698337292162, - "t": 0.9499518768046198, - "punct": 0.9804854831032841 - } - }, - "sl": { - "opus100": { - "u": 0.9228235294117647, - "t": 0.9296446458383019, - "punct": 0.9530493707647628 - }, - "ud": { - "u": 0.967766692248657, - "t": 0.9685681024447031, - "punct": 0.9929577464788732 - } - }, - "sq": { - "opus100": { - "u": 0.9017313991576977, - "t": 0.9151732377538829, - "punct": 0.9612632617813965 - }, - "ud": { - "u": 0.9917355371900827, - "t": null, - "punct": null - } - }, - "sr": { - "opus100": { - "u": 0.9466602594906295, - "t": 0.9476982405398892, - "punct": 0.9660063585228662 - }, - "ud": { - "u": 0.9844961240310077, - "t": 0.9808429118773947, - "punct": 0.9990393852065321 - } - }, - "sv": { - "opus100": { - "u": 0.9219592219357863, - "t": 0.9319971367215462, - "punct": 0.9588306942752741 - }, - "ud": { - "u": 0.9578256794751641, - "t": 0.9574669187145557, - "punct": 0.9721689059500959 - } - }, - "ta": { - "ersatz": { - "u": 0.945765937202664, - "t": 0.9580838323353293, - "punct": 0.9826989619377162 - }, - "opus100": { - "u": 0.6327497425334706, - "t": 0.6319404693760733, - "punct": 0.7274211099020675 - }, - "ud": { - "u": 0.9795918367346939, - "t": 0.9752066115702478, - "punct": 1.0 - } - }, - "te": { - "opus100": { - "u": 0.7634164777021921, - "t": 0.7675675675675675, - "punct": 0.8396821046862153 - } - }, - "tg": { - "opus100": { - "u": 0.8012022327179047, - "t": 0.8405225536110427, - "punct": 0.8996990972918756 - } - }, - "th": { - "opus100": { - "u": 0.6886003912502223, - "t": 0.7151036525172755, - "punct": 0.7292980243799916 - }, - "ud": { - "u": 0.6986837424404126, - "t": null, - "punct": null - } - }, - "tr": { - "ersatz": { - "u": 0.9363222871994801, - "t": 0.9350008227743952, - "punct": 0.9870388833499502 - }, - "opus100": { - "u": 0.9334301621883321, - "t": 0.9337570276216084, - "punct": 0.9600788565795959 - }, - "ud": { - "u": 0.9495548961424333, - "t": 0.9483709273182959, - "punct": 0.988855116514691 - } - }, - "uk": { - "opus100": { - "u": 0.8947491248541423, - "t": 0.8982373678025852, - "punct": 0.9463366821857387 - }, - "ud": { - "u": 0.9205113952195663, - "t": 0.9238521836506158, - "punct": 0.9832962138084632 - } - }, - "ur": { - "opus100": { - "u": 0.5225858804436029, - "t": 0.5240532241555782, - "punct": 0.6474149976711691 - }, - "ud": { - "u": 0.9397163120567376, - "t": 0.9590536851683349, - "punct": 0.9953051643192489 - } - }, - "uz": { - "opus100": { - "u": 0.7757475083056478, - "t": 0.8021158932435681, - "punct": 0.8633510381458233 - } - }, - "vi": { - "opus100": { - "u": 0.9117126233240577, - "t": 0.9116969541847966, - "punct": 0.9523329129886506 - }, - "ud": { - "u": 0.7332408691631992, - "t": 0.9115479115479116, - "punct": 0.9813432835820896 - } - }, - "xh": { - "opus100": { - "u": 0.7999043748505857, - "t": 0.8016350084154845, - "punct": 0.9048346055979644 - } - }, - "yi": { - "opus100": { - "u": 0.7567567567567567, - "t": 0.7530905446040761, - "punct": 0.7622090369694203 - } - }, - "yo": { - "opus100": { - "u": 0.7729161802474901, - "t": null, - "punct": null - }, - "ud": { - "u": 0.848901098901099, - "t": null, - "punct": null - } - }, - "zh": { - "ersatz": { - "u": 0.8807631160572337, - "t": 0.9033685763462012, - "punct": 0.9785 - }, - "opus100": { - "u": 0.7072135785007072, - "t": 0.7100877192982455, - "punct": 0.8789776998246054 - }, - "ud": { - "u": 0.9650924024640656, - "t": 0.9810945273631841, - "punct": 0.998 - } - }, - "zu": { - "opus100": { - "u": 0.7916194790486977, - "t": 0.8411795137092601, - "punct": 0.9046587215601299 - } - } -} \ No newline at end of file diff --git a/wtpsplit/evaluation/evaluation_results/wtp-canine-s-12l_intrinsic_results.json b/wtpsplit/evaluation/evaluation_results/wtp-canine-s-12l_intrinsic_results.json deleted file mode 100644 index 76ba635d..00000000 --- a/wtpsplit/evaluation/evaluation_results/wtp-canine-s-12l_intrinsic_results.json +++ /dev/null @@ -1,977 +0,0 @@ -{ - "af": { - "opus100": { - "u": 0.7837281153450052, - "t": 0.8062631455947652, - "punct": 0.8954081632653061 - }, - "ud": { - "u": 0.9906759906759907, - "t": 0.9941383352872216, - "punct": 0.9988249118683901 - } - }, - "am": { - "opus100": { - "u": 0.6339332517910187, - "t": 0.6960000000000001, - "punct": 0.7380543984317569 - } - }, - "ar": { - "ersatz": { - "u": 0.9008535784635587, - "t": 0.8923705722070845, - "punct": 0.9233390119250426 - }, - "opus100": { - "u": 0.7076142131979696, - "t": 0.7105694867588469, - "punct": 0.8156693102619562 - }, - "ud": { - "u": 0.863075924724205, - "t": 0.8712871287128714, - "punct": 0.9017667844522967 - } - }, - "az": { - "opus100": { - "u": 0.7924528301886792, - "t": 0.8152306565779839, - "punct": 0.8513547733603445 - } - }, - "be": { - "opus100": { - "u": 0.7702735415153715, - "t": 0.7700980392156862, - "punct": 0.9055276381909548 - }, - "ud": { - "u": 0.913871260199456, - "t": 0.9129454217410915, - "punct": 0.9259796806966618 - } - }, - "bg": { - "opus100": { - "u": 0.9452087859039343, - "t": 0.9289591692235073, - "punct": 0.9648780487804878 - }, - "ud": { - "u": 0.9852216748768474, - "t": 0.9852744310575636, - "punct": 0.9950914770191879 - } - }, - "bn": { - "opus100": { - "u": 0.819420035149385, - "t": 0.8511056511056512, - "punct": 0.8910271830647102 - }, - "ud": { - "u": 0.9911504424778761, - "t": null, - "punct": null - } - }, - "ca": { - "opus100": { - "u": 0.8992502343017807, - "t": 0.9037458511142721, - "punct": 0.9515287099179718 - }, - "ud": { - "u": 0.9861676159479251, - "t": 0.9869918699186991, - "punct": 0.9972855591748101 - } - }, - "ceb": { - "ud": { - "u": 0.9973474801061007, - "t": null, - "punct": null - } - }, - "cs": { - "ersatz": { - "u": 0.9325264750378215, - "t": 0.9511764705882352, - "punct": 0.9942096120440069 - }, - "opus100": { - "u": 0.9054401120709783, - "t": 0.9107981220657277, - "punct": 0.9538690476190477 - }, - "ud": { - "u": 0.9260889361056306, - "t": 0.9268366421445128, - "punct": 0.9717525264974118 - } - }, - "cy": { - "opus100": { - "u": 0.7370564281559046, - "t": 0.7844416562107905, - "punct": 0.844223876481941 - }, - "ud": { - "u": 0.9942196531791908, - "t": 0.9942135718043135, - "punct": 0.9968520461699895 - } - }, - "da": { - "opus100": { - "u": 0.9172332942555685, - "t": 0.9213695395513578, - "punct": 0.9545232273838631 - }, - "ud": { - "u": 0.9569798068481123, - "t": 0.9541446208112875, - "punct": 0.9892857142857143 - } - }, - "de": { - "ersatz": { - "u": 0.9670719351570416, - "t": 0.9658511722731906, - "punct": 0.9946469538618405 - }, - "opus100": { - "u": 0.8209354120267262, - "t": 0.8695443645083933, - "punct": 0.9158542842483326 - }, - "ud": { - "u": 0.9645972293483838, - "t": 0.9673066943435392, - "punct": 0.9807592303692148 - } - }, - "el": { - "opus100": { - "u": 0.9241672572643516, - "t": 0.9348970799425562, - "punct": 0.9633251833740831 - }, - "ud": { - "u": 0.9768467475192943, - "t": 0.9779735682819383, - "punct": 0.9878987898789879 - } - }, - "en": { - "ersatz": { - "u": 0.9757945425361155, - "t": 0.9757852142077205, - "punct": 0.9909581734209327 - }, - "opus100": { - "u": 0.9206583149103414, - "t": 0.90750353940538, - "punct": 0.9483521888834235 - }, - "ud": { - "u": 0.944007403979639, - "t": 0.9505041246562785, - "punct": 0.974548819990745 - } - }, - "eo": { - "opus100": { - "u": 0.9237931879441315, - "t": 0.9224033081975189, - "punct": 0.9552755127254756 - } - }, - "es": { - "ersatz": { - "u": 0.9883892068683564, - "t": 0.9826989619377162, - "punct": 0.9957446808510638 - }, - "opus100": { - "u": 0.920416864045476, - "t": 0.9241018320247443, - "punct": 0.9587959536146066 - }, - "ud": { - "u": 0.9676495848840537, - "t": 0.9662921348314606, - "punct": 0.9968088192631275 - } - }, - "et": { - "ersatz": { - "u": 0.9603392367173859, - "t": 0.9538773441459707, - "punct": 0.9867599300524605 - }, - "opus100": { - "u": 0.8617849988871579, - "t": 0.8982571832312765, - "punct": 0.9583230579531443 - }, - "ud": { - "u": 0.9477623577108997, - "t": 0.948905109489051, - "punct": 0.9841022443890275 - } - }, - "eu": { - "opus100": { - "u": 0.8684627575277337, - "t": 0.8797585886722378, - "punct": 0.9321577271614009 - }, - "ud": { - "u": 0.9665936473165389, - "t": 0.9673704414587332, - "punct": 0.9983314794215795 - } - }, - "fa": { - "opus100": { - "u": 0.659158814065732, - "t": 0.6590659942994957, - "punct": 0.7777509068923822 - }, - "ud": { - "u": 0.9758389261744966, - "t": 0.9824086603518268, - "punct": 0.9996564754379939 - } - }, - "fi": { - "ersatz": { - "u": 0.9765684051398338, - "t": 0.9789684526790184, - "punct": 0.9969984992496249 - }, - "opus100": { - "u": 0.9285883748517201, - "t": 0.9359036144578313, - "punct": 0.9639263803680982 - }, - "ud": { - "u": 0.9381475667189952, - "t": 0.9337139232477005, - "punct": 0.9832041343669252 - } - }, - "fr": { - "ersatz": { - "u": 0.9772998805256871, - "t": 0.9744052996085516, - "punct": 0.9879154078549849 - }, - "opus100": { - "u": 0.8942374450358713, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9729093050647822, - "t": 0.9749702026221693, - "punct": 0.9903381642512078 - } - }, - "fy": { - "opus100": { - "u": 0.6262688403568133, - "t": 0.671244635193133, - "punct": 0.8951351351351352 - } - }, - "ga": { - "opus100": { - "u": 0.8238147739801542, - "t": 0.8375565610859729, - "punct": 0.9083763203144191 - }, - "ud": { - "u": 0.8628954937679769, - "t": 0.9037487335359675, - "punct": 0.9792802617230099 - } - }, - "gd": { - "opus100": { - "u": 0.844642857142857, - "t": 0.864910790144435, - "punct": 0.942316566682049 - }, - "ud": { - "u": 0.7185104052573932, - "t": 0.7396798652064026, - "punct": 0.8556053811659193 - } - }, - "gl": { - "opus100": { - "u": 0.9009135628952917, - "t": 0.9089179548156956, - "punct": 0.9453565302621906 - }, - "ud": { - "u": 0.9912390488110137, - "t": 0.9912170639899623, - "punct": 0.9851116625310173 - } - }, - "gu": { - "ersatz": { - "u": 0.9273525109702585, - "t": 0.9031281533804238, - "punct": 0.9837358304583539 - }, - "opus100": { - "u": 0.74085637823372, - "t": 0.7547360444556707, - "punct": 0.8074599422117154 - } - }, - "ha": { - "opus100": { - "u": 0.8872593950504125, - "t": 0.9050953875875393, - "punct": 0.9164407197436528 - } - }, - "he": { - "opus100": { - "u": 0.9109099571224393, - "t": 0.8908337171810227, - "punct": 0.941351888667992 - }, - "ud": { - "u": 0.9724310776942355, - "t": 0.9737171464330412, - "punct": 0.989821882951654 - } - }, - "hi": { - "ersatz": { - "u": 0.9590436997844405, - "t": 0.9500201531640468, - "punct": 0.9787830264211369 - }, - "opus100": { - "u": 0.7033363390441839, - "t": 0.6969303934284479, - "punct": 0.7932530120481929 - }, - "ud": { - "u": 0.9869745411486087, - "t": 0.9882144961697114, - "punct": 0.9997031760166222 - } - }, - "hu": { - "opus100": { - "u": 0.9333651779316933, - "t": 0.9359251259899208, - "punct": 0.9659397206567018 - }, - "ud": { - "u": 0.9517396184062851, - "t": 0.9505617977528089, - "punct": 0.9921787709497207 - } - }, - "hy": { - "opus100": { - "u": 0.8798998616509651, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9654036243822075, - "t": 0.9653465346534653, - "punct": 0.9858215179316097 - } - }, - "id": { - "opus100": { - "u": 0.9052062868369353, - "t": 0.9063429137760158, - "punct": 0.9442917811641269 - }, - "ud": { - "u": 0.9880597014925374, - "t": null, - "punct": null - } - }, - "ig": { - "opus100": { - "u": 0.8360655737704917, - "t": 0.8576439637532886, - "punct": 0.9083129584352079 - } - }, - "is": { - "opus100": { - "u": 0.9479065238558909, - "t": 0.9480077745383868, - "punct": 0.96759941089838 - }, - "ud": { - "u": 0.8997568224804108, - "t": 0.8968417285497002, - "punct": 0.9641380636477263 - } - }, - "it": { - "opus100": { - "u": 0.8854166666666666, - "t": 0.9120905020032996, - "punct": 0.9449317738791424 - }, - "ud": { - "u": 0.9611166500498505, - "t": 0.9639278557114229, - "punct": 0.9968976215098242 - } - }, - "ja": { - "ersatz": { - "u": 0.8424747696358051, - "t": 0.8391408114558473, - "punct": 0.9607935758148323 - }, - "opus100": { - "u": 0.5557509672880759, - "t": 0.814519906323185, - "punct": 0.8955987717502559 - }, - "ud": { - "u": 0.959349593495935, - "t": 0.9646840148698885, - "punct": 0.9813780260707635 - } - }, - "jv": { - "ud": { - "u": 0.9763779527559056, - "t": null, - "punct": null - } - }, - "ka": { - "opus100": { - "u": 0.9050953875875393, - "t": 0.9050953875875393, - "punct": 0.9351737737244269 - } - }, - "kk": { - "ersatz": { - "u": 0.95643259866735, - "t": 0.9510056730273336, - "punct": 0.9985022466300548 - }, - "opus100": { - "u": 0.7963589076723017, - "t": 0.7382588774341351, - "punct": 0.9203402938901779 - }, - "ud": { - "u": 0.9651162790697675, - "t": 0.8952579468473163, - "punct": 0.9625769777356703 - } - }, - "km": { - "ersatz": { - "u": 0.7107282693813626, - "t": 0.927354260089686, - "punct": 0.9265232974910395 - }, - "opus100": { - "u": 0.7387387387387386, - "t": 0.7251059322033898, - "punct": 0.8084163898117386 - } - }, - "kn": { - "opus100": { - "u": 0.7282099343955013, - "t": 0.6893716970052849, - "punct": 0.8402699662542182 - } - }, - "ko": { - "opus100": { - "u": 0.5660377358490566, - "t": 0.7177339901477833, - "punct": 0.82588746679546 - }, - "ud": { - "u": 0.9939050936003483, - "t": 0.9932388222464559, - "punct": 0.9995627459554001 - } - }, - "ku": { - "opus100": { - "u": 0.81333033100653, - "t": 0.7676827463746669, - "punct": 0.8674832962138085 - } - }, - "ky": { - "opus100": { - "u": 0.8510878779316191, - "t": 0.8374807987711214, - "punct": 0.9203965154701111 - } - }, - "la": { - "ud": { - "u": 0.8582974137931034, - "t": 0.9144893111638955, - "punct": 0.9765680473372782 - } - }, - "lt": { - "ersatz": { - "u": 0.9819458375125377, - "t": 0.9730554143365532, - "punct": 0.9939819458375125 - }, - "opus100": { - "u": 0.8327601031814273, - "t": 0.8726772195457673, - "punct": 0.928659590224636 - }, - "ud": { - "u": 0.9728937728937728, - "t": 0.9721407624633431, - "punct": 0.9831994156318481 - } - }, - "lv": { - "ersatz": { - "u": 0.9758477677482312, - "t": 0.9724273756770063, - "punct": 0.9975259772389906 - }, - "opus100": { - "u": 0.8477926641774655, - "t": 0.8901623686723974, - "punct": 0.9313388916135361 - }, - "ud": { - "u": 0.9591304347826087, - "t": 0.9633911368015414, - "punct": 0.9918419922713612 - } - }, - "mg": { - "opus100": { - "u": 0.9310924369747899, - "t": 0.938715953307393, - "punct": 0.9681433549029368 - } - }, - "mk": { - "opus100": { - "u": 0.9341825902335457, - "t": 0.9388045540796964, - "punct": 0.9621175327829042 - } - }, - "ml": { - "opus100": { - "u": 0.8178506375227687, - "t": 0.8201697967584256, - "punct": 0.8822393822393823 - } - }, - "mn": { - "opus100": { - "u": 0.7923367361208223, - "t": null, - "punct": null - } - }, - "mr": { - "opus100": { - "u": 0.9149301299338073, - "t": 0.9184079601990051, - "punct": 0.9483057135790255 - }, - "ud": { - "u": 0.8842105263157894, - "t": 0.8775510204081632, - "punct": 0.9583333333333333 - } - }, - "ms": { - "opus100": { - "u": 0.882266731802638, - "t": 0.888447204968944, - "punct": 0.9438596491228071 - } - }, - "mt": { - "opus100": { - "u": 0.6699611719025768, - "t": 0.849566055930569, - "punct": 0.9073075036782736 - }, - "ud": { - "u": 0.8957528957528957, - "t": 0.8752475247524751, - "punct": 0.9576107899807322 - } - }, - "my": { - "opus100": { - "u": 0.6684547142562409, - "t": 0.7996882307092752, - "punct": 0.8586470123716767 - } - }, - "ne": { - "opus100": { - "u": 0.734818288393904, - "t": 0.7359192348565357, - "punct": 0.7781975175391257 - } - }, - "nl": { - "opus100": { - "u": 0.929283341243474, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9362786745964315, - "t": 0.9147034930950447, - "punct": 0.9757727652464494 - } - }, - "no": { - "opus100": { - "u": 0.9512254307206989, - "t": 0.9509946627850558, - "punct": 0.969845550379995 - }, - "ud": { - "u": 0.9758909853249477, - "t": 0.9858720780888774, - "punct": 0.9956264471314639 - } - }, - "pa": { - "opus100": { - "u": 0.5729729729729729, - "t": 0.6368778280542986, - "punct": 0.8081073966833376 - } - }, - "pl": { - "ersatz": { - "u": 0.9456024402643619, - "t": 0.9350515463917525, - "punct": 0.9925335988053758 - }, - "opus100": { - "u": 0.9274946159368269, - "t": 0.9313418453384726, - "punct": 0.9625642280401273 - }, - "ud": { - "u": 0.9621478873239436, - "t": 0.962684919408258, - "punct": 0.9929753002492635 - } - }, - "ps": { - "ersatz": { - "u": 0.8599316739873109, - "t": 0.9209170072862982, - "punct": 0.9563417388031614 - }, - "opus100": { - "u": 0.48456428292301673, - "t": 0.7323943661971831, - "punct": 0.7844482561463693 - } - }, - "pt": { - "opus100": { - "u": 0.9201789498469508, - "t": 0.9255369928400955, - "punct": 0.9589442815249266 - }, - "ud": { - "u": 0.961038961038961, - "t": 0.9655465759251384, - "punct": 0.9875376020627417 - } - }, - "ro": { - "ersatz": { - "u": 0.9786917740336968, - "t": 0.9645390070921986, - "punct": 0.9952511872031993 - }, - "opus100": { - "u": 0.9176857949200375, - "t": 0.9196702002355713, - "punct": 0.9740163325909428 - }, - "ud": { - "u": 0.8111025443330764, - "t": 0.9363057324840764, - "punct": 0.9938006676204101 - } - }, - "ru": { - "ersatz": { - "u": 0.980691056910569, - "t": 0.9843671205244579, - "punct": 0.9954522486104093 - }, - "opus100": { - "u": 0.8590634102621555, - "t": null, - "punct": null - }, - "ud": { - "u": 0.8631138975966561, - "t": 0.8636124275934702, - "punct": 0.9369067560022334 - } - }, - "si": { - "opus100": { - "u": 0.8123195380173244, - "t": 0.8117704280155641, - "punct": 0.8704663212435233 - } - }, - "sk": { - "opus100": { - "u": 0.9204705882352942, - "t": 0.9342327150084316, - "punct": 0.9636542239685658 - }, - "ud": { - "u": 0.9715370018975332, - "t": 0.9740688354549739, - "punct": 0.9858088930936614 - } - }, - "sl": { - "opus100": { - "u": 0.9310916410134975, - "t": 0.937304828248859, - "punct": 0.9565007249879168 - }, - "ud": { - "u": 0.9603421461897357, - "t": 0.959968907889623, - "punct": 0.9921996879875195 - } - }, - "sq": { - "opus100": { - "u": 0.9116541353383459, - "t": 0.9252720677146312, - "punct": 0.9612058314801087 - }, - "ud": { - "u": 1.0, - "t": null, - "punct": null - } - }, - "sr": { - "opus100": { - "u": 0.9488103821196827, - "t": 0.9542926239419588, - "punct": 0.9659564046044574 - }, - "ud": { - "u": 0.9834791059280854, - "t": 0.9903846153846154, - "punct": 1.0 - } - }, - "sv": { - "opus100": { - "u": 0.9275430729289592, - "t": 0.9372003835091083, - "punct": 0.9607604192054594 - }, - "ud": { - "u": 0.9579990561585654, - "t": 0.9579990561585654, - "punct": 0.9770114942528736 - } - }, - "ta": { - "ersatz": { - "u": 0.957950700821653, - "t": 0.9496761335326357, - "punct": 0.9851632047477745 - }, - "opus100": { - "u": 0.6957062850031115, - "t": 0.6975461814171492, - "punct": 0.7741582261155215 - }, - "ud": { - "u": 0.9876543209876543, - "t": 0.9917355371900827, - "punct": 0.9917355371900827 - } - }, - "te": { - "opus100": { - "u": 0.7616294349540078, - "t": 0.7629187224097637, - "punct": 0.8517548454688318 - } - }, - "tg": { - "opus100": { - "u": 0.8097768331562168, - "t": 0.8693293885601578, - "punct": 0.9143431298668676 - } - }, - "th": { - "opus100": { - "u": 0.6911032028469751, - "t": 0.7285657842749901, - "punct": 0.7501053518752634 - }, - "ud": { - "u": 0.7542726927459172, - "t": null, - "punct": null - } - }, - "tr": { - "ersatz": { - "u": 0.949069539666993, - "t": 0.9488099119660907, - "punct": 0.9892259240842035 - }, - "opus100": { - "u": 0.939437984496124, - "t": 0.9406593406593406, - "punct": 0.9591133004926109 - }, - "ud": { - "u": 0.9530400395452299, - "t": 0.954954954954955, - "punct": 0.9887983706720977 - } - }, - "uk": { - "opus100": { - "u": 0.9087479368073568, - "t": 0.9084905660377358, - "punct": 0.95010951569725 - }, - "ud": { - "u": 0.9260089686098655, - "t": 0.9284116331096196, - "punct": 0.9854260089686099 - } - }, - "ur": { - "opus100": { - "u": 0.6165413533834587, - "t": 0.5921663263411918, - "punct": 0.7024271844660194 - }, - "ud": { - "u": 0.9624885635864592, - "t": 0.9641214351425943, - "punct": 0.9924812030075186 - } - }, - "uz": { - "opus100": { - "u": 0.8074507255793805, - "t": 0.8279386712095401, - "punct": 0.8759881422924901 - } - }, - "vi": { - "opus100": { - "u": 0.9176767676767678, - "t": 0.9196746314184038, - "punct": 0.9554268446235205 - }, - "ud": { - "u": 0.7327466419638723, - "t": 0.9063074096754439, - "punct": 0.985705407085146 - } - }, - "xh": { - "opus100": { - "u": 0.7987435494727395, - "t": 0.8276519666269369, - "punct": 0.911353032659409 - } - }, - "yi": { - "opus100": { - "u": 0.7231139646869984, - "t": 0.7258193445243806, - "punct": 0.8248828291435875 - } - }, - "yo": { - "opus100": { - "u": 0.7968392013840184, - "t": null, - "punct": null - }, - "ud": { - "u": 0.8409703504043127, - "t": null, - "punct": null - } - }, - "zh": { - "ersatz": { - "u": 0.872134769400718, - "t": 0.9245803056877975, - "punct": 0.9822810082355877 - }, - "opus100": { - "u": 0.8371967654986522, - "t": 0.823614807645778, - "punct": 0.9023602484472051 - }, - "ud": { - "u": 0.9479166666666667, - "t": 0.9750747756729811, - "punct": 0.9969969969969971 - } - }, - "zu": { - "opus100": { - "u": 0.6405315614617939, - "t": 0.8377158034528552, - "punct": 0.9122525414660246 - } - } -} \ No newline at end of file diff --git a/wtpsplit/evaluation/evaluation_results/wtp-canine-s-1l-no-adapters_intrinsic_results.json b/wtpsplit/evaluation/evaluation_results/wtp-canine-s-1l-no-adapters_intrinsic_results.json deleted file mode 100644 index 75cadb52..00000000 --- a/wtpsplit/evaluation/evaluation_results/wtp-canine-s-1l-no-adapters_intrinsic_results.json +++ /dev/null @@ -1,977 +0,0 @@ -{ - "af": { - "opus100": { - "u": 0.7524606299212598, - "t": 0.7609977324263039, - "punct": 0.8546703995927717 - }, - "ud": { - "u": 0.9736540664375716, - "t": 0.9929411764705882, - "punct": 0.9988249118683901 - } - }, - "am": { - "opus100": { - "u": 0.5809196530846015, - "t": 0.6135265700483091, - "punct": 0.6721426895095048 - } - }, - "ar": { - "ersatz": { - "u": 0.8585669781931463, - "t": 0.8898891966759003, - "punct": 0.8894070619586943 - }, - "opus100": { - "u": 0.6224410469033428, - "t": 0.6253186066270179, - "punct": 0.7110091743119267 - }, - "ud": { - "u": 0.7435750999428897, - "t": 0.8289655172413792, - "punct": 0.8492008339124393 - } - }, - "az": { - "opus100": { - "u": 0.7347094801223241, - "t": 0.7605051664753159, - "punct": 0.8266296809986129 - } - }, - "be": { - "opus100": { - "u": 0.6553254437869823, - "t": 0.699486301369863, - "punct": 0.8663404584084471 - }, - "ud": { - "u": 0.8724353256021409, - "t": 0.8613945578231292, - "punct": 0.8930507639231148 - } - }, - "bg": { - "opus100": { - "u": 0.9349535382416011, - "t": 0.9176196032672113, - "punct": 0.9609184171958963 - }, - "ud": { - "u": 0.9730458221024259, - "t": 0.9730941704035875, - "punct": 0.9905533063427799 - } - }, - "bn": { - "opus100": { - "u": 0.7741517181759239, - "t": 0.8098976921246728, - "punct": 0.8435440566268 - }, - "ud": { - "u": 0.9911504424778761, - "t": null, - "punct": null - } - }, - "ca": { - "opus100": { - "u": 0.8940568475452196, - "t": 0.8941563013377141, - "punct": 0.9341584158415841 - }, - "ud": { - "u": 0.9761019649495486, - "t": 0.9857181352735113, - "punct": 0.9986460871919848 - } - }, - "ceb": { - "ud": { - "u": 0.9973474801061007, - "t": null, - "punct": null - } - }, - "cs": { - "ersatz": { - "u": 0.9430800346720601, - "t": 0.9299703264094955, - "punct": 0.9779837775202781 - }, - "opus100": { - "u": 0.8985304408677396, - "t": 0.9030288800187839, - "punct": 0.9450931677018632 - }, - "ud": { - "u": 0.9130393551446183, - "t": 0.9177953068412958, - "punct": 0.9390317396544797 - } - }, - "cy": { - "opus100": { - "u": 0.6659012629161882, - "t": 0.701565306646138, - "punct": 0.783612867748144 - }, - "ud": { - "u": 0.9827856025039123, - "t": 0.950603732162459, - "punct": 0.9910761154855643 - } - }, - "da": { - "opus100": { - "u": 0.8955154877484974, - "t": 0.9119466920514041, - "punct": 0.9457934755948 - }, - "ud": { - "u": 0.9354838709677419, - "t": 0.937112488928255, - "punct": 0.9795191451469277 - } - }, - "de": { - "ersatz": { - "u": 0.9290353620399326, - "t": 0.9435665914221218, - "punct": 0.9826264690853347 - }, - "opus100": { - "u": 0.7767491926803014, - "t": 0.8366803774497943, - "punct": 0.8710659898477157 - }, - "ud": { - "u": 0.945054945054945, - "t": 0.9621172807472755, - "punct": 0.9595800524934384 - } - }, - "el": { - "opus100": { - "u": 0.9203581526861451, - "t": 0.926956935522246, - "punct": 0.9603936039360395 - }, - "ud": { - "u": 0.9694323144104805, - "t": 0.974472807991121, - "punct": 0.9790055248618785 - } - }, - "en": { - "ersatz": { - "u": 0.954041005705687, - "t": 0.969337799536202, - "punct": 0.9825040650406505 - }, - "opus100": { - "u": 0.9063116370808678, - "t": 0.9005736137667303, - "punct": 0.94482421875 - }, - "ud": { - "u": 0.931909212283044, - "t": 0.9320822162645219, - "punct": 0.9634089856415007 - } - }, - "eo": { - "opus100": { - "u": 0.9098901098901099, - "t": 0.9067632850241546, - "punct": 0.9506990434142752 - } - }, - "es": { - "ersatz": { - "u": 0.9795653584171262, - "t": 0.9808102345415778, - "punct": 0.9913327882256746 - }, - "opus100": { - "u": 0.9046168268104054, - "t": 0.9104689720511606, - "punct": 0.9513752455795678 - }, - "ud": { - "u": 0.9598427408031452, - "t": 0.9712292938099389, - "punct": 0.9944847605224965 - } - }, - "et": { - "ersatz": { - "u": 0.9355222460147513, - "t": 0.9480552070263488, - "punct": 0.9826216484607746 - }, - "opus100": { - "u": 0.8479467258601554, - "t": 0.8892609423582875, - "punct": 0.9422507403751236 - }, - "ud": { - "u": 0.8968502761606211, - "t": 0.9041906839071506, - "punct": 0.9667294413057125 - } - }, - "eu": { - "opus100": { - "u": 0.8706317981948624, - "t": 0.874446773817843, - "punct": 0.9189713731198448 - }, - "ud": { - "u": 0.9491525423728814, - "t": 0.9808492922564529, - "punct": 0.9994441356309061 - } - }, - "fa": { - "opus100": { - "u": 0.5578664056765353, - "t": 0.5655849468230161, - "punct": 0.671623296158612 - }, - "ud": { - "u": 0.9651741293532339, - "t": 0.9877467665078284, - "punct": 0.9989687177724303 - } - }, - "fi": { - "ersatz": { - "u": 0.9410354745925216, - "t": 0.9542743538767395, - "punct": 0.9800756620428752 - }, - "opus100": { - "u": 0.9145138562705496, - "t": 0.9276126558005752, - "punct": 0.9575402635431918 - }, - "ud": { - "u": 0.9242658423493044, - "t": 0.9368662087013413, - "punct": 0.9716704656463692 - } - }, - "fr": { - "ersatz": { - "u": 0.9643281807372176, - "t": 0.9667271627344223, - "punct": 0.9806060606060606 - }, - "opus100": { - "u": 0.8813793103448276, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9579439252336448, - "t": 0.9725864123957092, - "punct": 0.9819059107358263 - } - }, - "fy": { - "opus100": { - "u": 0.5929229854296758, - "t": 0.6859241126070992, - "punct": 0.8612640965119329 - } - }, - "ga": { - "opus100": { - "u": 0.7586206896551724, - "t": 0.7652519893899205, - "punct": 0.8551045510455105 - }, - "ud": { - "u": 0.876953125, - "t": 0.9022082018927445, - "punct": 0.9878987898789879 - } - }, - "gd": { - "opus100": { - "u": 0.7881006864988559, - "t": 0.8101793909052983, - "punct": 0.9180778032036614 - }, - "ud": { - "u": 0.6573426573426574, - "t": 0.6636363636363636, - "punct": 0.7340529931305202 - } - }, - "gl": { - "opus100": { - "u": 0.8822289861859051, - "t": 0.8823391812865498, - "punct": 0.9364077669902912 - }, - "ud": { - "u": 0.9536585365853658, - "t": 0.9699248120300752, - "punct": 0.9712140175219024 - } - }, - "gu": { - "ersatz": { - "u": 0.8683498647430118, - "t": 0.8815165876777251, - "punct": 0.9407114624505928 - }, - "opus100": { - "u": 0.6794003868471953, - "t": 0.6789473684210527, - "punct": 0.7335907335907336 - } - }, - "ha": { - "opus100": { - "u": 0.8229604709840203, - "t": 0.8781190019193857, - "punct": 0.9099142040038131 - } - }, - "he": { - "opus100": { - "u": 0.9105188005711565, - "t": 0.9035294117647059, - "punct": 0.9391435011269722 - }, - "ud": { - "u": 0.9187878787878787, - "t": 0.9431524547803617, - "punct": 0.9593709043250327 - } - }, - "hi": { - "ersatz": { - "u": 0.9092599479360358, - "t": 0.9387674739121875, - "punct": 0.9585607438851829 - }, - "opus100": { - "u": 0.63155505107832, - "t": 0.6118852459016394, - "punct": 0.7153284671532847 - }, - "ud": { - "u": 0.9336307863915226, - "t": 0.9485484334578902, - "punct": 0.9979234648472265 - } - }, - "hu": { - "opus100": { - "u": 0.9233325421314976, - "t": 0.9239904988123515, - "punct": 0.9611721611721611 - }, - "ud": { - "u": 0.959051724137931, - "t": 0.9721913236929923, - "punct": 0.9910514541387024 - } - }, - "hy": { - "opus100": { - "u": 0.8564746759321128, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9140625, - "t": 0.9537953795379537, - "punct": 0.9763513513513513 - } - }, - "id": { - "opus100": { - "u": 0.8873884735953701, - "t": 0.8966699314397649, - "punct": 0.9393258426966292 - }, - "ud": { - "u": 0.9640077821011673, - "t": null, - "punct": null - } - }, - "ig": { - "opus100": { - "u": 0.7918067869153164, - "t": 0.806219406852865, - "punct": 0.8864115697499246 - } - }, - "is": { - "opus100": { - "u": 0.9415190487745693, - "t": 0.943754565376187, - "punct": 0.9636184857423796 - }, - "ud": { - "u": 0.8904724201636317, - "t": 0.9077833409194356, - "punct": 0.962715105162524 - } - }, - "it": { - "opus100": { - "u": 0.8630965718126822, - "t": 0.8832675794847993, - "punct": 0.9310428260343576 - }, - "ud": { - "u": 0.9487179487179487, - "t": 0.96579476861167, - "punct": 0.9958592132505175 - } - }, - "ja": { - "ersatz": { - "u": 0.7925184862983906, - "t": 0.7894039735099337, - "punct": 0.9339712918660286 - }, - "opus100": { - "u": 0.268542199488491, - "t": 0.7364269141531322, - "punct": 0.8490759753593429 - }, - "ud": { - "u": 0.9624885635864592, - "t": 0.962406015037594, - "punct": 0.9822926374650512 - } - }, - "jv": { - "ud": { - "u": 0.9578544061302682, - "t": null, - "punct": null - } - }, - "ka": { - "opus100": { - "u": 0.8911995942176006, - "t": 0.9110576923076923, - "punct": 0.9322159230958836 - } - }, - "kk": { - "ersatz": { - "u": 0.9702293801854563, - "t": 0.9667493796526054, - "punct": 0.9975012493753124 - }, - "opus100": { - "u": 0.713617606602476, - "t": 0.7763649962602843, - "punct": 0.8898817345597898 - }, - "ud": { - "u": 0.9666510098637859, - "t": 0.9048843187660668, - "punct": 0.9299748110831233 - } - }, - "km": { - "ersatz": { - "u": 0.8443612151477321, - "t": 0.9058070416095108, - "punct": 0.9091322126306225 - }, - "opus100": { - "u": 0.6618953603158934, - "t": 0.6659505907626209, - "punct": 0.7542890375109044 - } - }, - "kn": { - "opus100": { - "u": 0.5966292134831461, - "t": 0.6050328227571116, - "punct": 0.7167155425219941 - } - }, - "ko": { - "opus100": { - "u": 0.5285145888594164, - "t": 0.6504897595725735, - "punct": 0.7888372093023256 - }, - "ud": { - "u": 0.9941240478781285, - "t": 0.9945235487404163, - "punct": 0.999343975508419 - } - }, - "ku": { - "opus100": { - "u": 0.7828901734104048, - "t": 0.6248331108144193, - "punct": 0.8660235798499464 - } - }, - "ky": { - "opus100": { - "u": 0.7991644285287973, - "t": 0.8080046403712298, - "punct": 0.892962743938498 - } - }, - "la": { - "ud": { - "u": 0.9003550295857987, - "t": 0.897686620558073, - "punct": 0.9528571428571428 - } - }, - "lt": { - "ersatz": { - "u": 0.9426506024096385, - "t": 0.9498495486459377, - "punct": 0.9756345177664975 - }, - "opus100": { - "u": 0.7775919732441471, - "t": 0.8313980492336275, - "punct": 0.8881222276983736 - }, - "ud": { - "u": 0.9502452697967765, - "t": 0.9671292914536158, - "punct": 0.9822485207100592 - } - }, - "lv": { - "ersatz": { - "u": 0.9479166666666666, - "t": 0.9656694458067681, - "punct": 0.991571641051066 - }, - "opus100": { - "u": 0.7927194860813704, - "t": 0.8500237304224015, - "punct": 0.8852618757612668 - }, - "ud": { - "u": 0.9608796785789807, - "t": 0.9561270801815431, - "punct": 0.9845559845559845 - } - }, - "mg": { - "opus100": { - "u": 0.8203924914675768, - "t": 0.8901515151515151, - "punct": 0.9422552664188352 - } - }, - "mk": { - "opus100": { - "u": 0.9140083217753121, - "t": 0.9177891314444959, - "punct": 0.9560145808019441 - } - }, - "ml": { - "opus100": { - "u": 0.8023148148148149, - "t": 0.8217208033721795, - "punct": 0.847165749820617 - } - }, - "mn": { - "opus100": { - "u": 0.8647806903711395, - "t": null, - "punct": null - } - }, - "mr": { - "opus100": { - "u": 0.8894324853228963, - "t": 0.8950770760815515, - "punct": 0.926017874875869 - }, - "ud": { - "u": 0.9278350515463918, - "t": 0.898876404494382, - "punct": 0.924731182795699 - } - }, - "ms": { - "opus100": { - "u": 0.8714180749448933, - "t": 0.8723509117792015, - "punct": 0.9324391455538997 - } - }, - "mt": { - "opus100": { - "u": 0.6975734858946537, - "t": 0.7864125122189637, - "punct": 0.8671532846715327 - }, - "ud": { - "u": 0.8998109640831757, - "t": 0.904397705544933, - "punct": 0.9220272904483431 - } - }, - "my": { - "opus100": { - "u": 0.6280662983425415, - "t": 0.6759581881533101, - "punct": 0.7785967785967787 - } - }, - "ne": { - "opus100": { - "u": 0.6519507186858317, - "t": 0.6605762473647223, - "punct": 0.7294911734164071 - } - }, - "nl": { - "opus100": { - "u": 0.9153094462540716, - "t": null, - "punct": null - }, - "ud": { - "u": 0.922077922077922, - "t": 0.912449799196787, - "punct": 0.9526970954356847 - } - }, - "no": { - "opus100": { - "u": 0.9484386347131446, - "t": 0.9523346303501945, - "punct": 0.9646017699115044 - }, - "ud": { - "u": 0.9830595482546202, - "t": 0.9822393822393822, - "punct": 0.9904368053760662 - } - }, - "pa": { - "opus100": { - "u": 0.6360191967668605, - "t": 0.6546285714285714, - "punct": 0.725083269280041 - } - }, - "pl": { - "ersatz": { - "u": 0.9203109815354713, - "t": 0.9226830517153098, - "punct": 0.9406657018813315 - }, - "opus100": { - "u": 0.9181188827882549, - "t": 0.9274661508704062, - "punct": 0.9585479519254354 - }, - "ud": { - "u": 0.9442299442299442, - "t": 0.966079295154185, - "punct": 0.9892865283793024 - } - }, - "ps": { - "ersatz": { - "u": 0.8574712643678161, - "t": 0.9304793267471644, - "punct": 0.9486745628877609 - }, - "opus100": { - "u": 0.6623486341875529, - "t": 0.6721748400852878, - "punct": 0.7277339346110485 - } - }, - "pt": { - "opus100": { - "u": 0.9078146934955544, - "t": 0.920172084130019, - "punct": 0.9511480214948705 - }, - "ud": { - "u": 0.9518987341772153, - "t": 0.9504096593359207, - "punct": 0.9781021897810219 - } - }, - "ro": { - "ersatz": { - "u": 0.969889840881273, - "t": 0.9702127659574468, - "punct": 0.9892634207240949 - }, - "opus100": { - "u": 0.8798011748757343, - "t": 0.8792052382027546, - "punct": 0.9702675916749257 - }, - "ud": { - "u": 0.8132972555083108, - "t": 0.908762420957543, - "punct": 0.991908614945264 - } - }, - "ru": { - "ersatz": { - "u": 0.9486552567237163, - "t": 0.95748987854251, - "punct": 0.9787018255578094 - }, - "opus100": { - "u": 0.8040265581494966, - "t": null, - "punct": null - }, - "ud": { - "u": 0.8457919498170412, - "t": 0.8497757847533632, - "punct": 0.908670520231214 - } - }, - "si": { - "opus100": { - "u": 0.8020502806931901, - "t": 0.8158295281582952, - "punct": 0.8475359342915811 - } - }, - "sk": { - "opus100": { - "u": 0.9116344136162275, - "t": 0.9275431861804223, - "punct": 0.9592538046146294 - }, - "ud": { - "u": 0.941342092914125, - "t": 0.9434141702330004, - "punct": 0.953199617956065 - } - }, - "sl": { - "opus100": { - "u": 0.9193776520509193, - "t": 0.9252380952380953, - "punct": 0.9531667071099247 - }, - "ud": { - "u": 0.9472107824784725, - "t": 0.9621036349574633, - "punct": 0.9917743830787309 - } - }, - "sq": { - "opus100": { - "u": 0.8911074994195496, - "t": 0.900165211234364, - "punct": 0.9555774925962488 - }, - "ud": { - "u": 0.9917355371900827, - "t": null, - "punct": null - } - }, - "sr": { - "opus100": { - "u": 0.9422754491017965, - "t": 0.9422754491017965, - "punct": 0.9610961585515048 - }, - "ud": { - "u": 0.9376146788990826, - "t": 0.9586935638808838, - "punct": 0.9942196531791908 - } - }, - "sv": { - "opus100": { - "u": 0.913660693507098, - "t": 0.9315525876460768, - "punct": 0.9552311435523114 - }, - "ud": { - "u": 0.9396471680594243, - "t": 0.946919431279621, - "punct": 0.9683604985618409 - } - }, - "ta": { - "ersatz": { - "u": 0.9320754716981132, - "t": 0.9663823381836428, - "punct": 0.9785749875435974 - }, - "opus100": { - "u": 0.5893719806763286, - "t": 0.574530516431925, - "punct": 0.6902068965517242 - }, - "ud": { - "u": 0.944, - "t": 0.9629629629629629, - "punct": 1.0 - } - }, - "te": { - "opus100": { - "u": 0.7624254473161033, - "t": 0.7684154742978271, - "punct": 0.8290527478597074 - } - }, - "tg": { - "opus100": { - "u": 0.7901775926331944, - "t": 0.8081612341378452, - "punct": 0.905484818805093 - } - }, - "th": { - "opus100": { - "u": 0.6880602366439584, - "t": 0.6981132075471698, - "punct": 0.7110577884423115 - }, - "ud": { - "u": 0.6397153945666235, - "t": null, - "punct": null - } - }, - "tr": { - "ersatz": { - "u": 0.9153187440532826, - "t": 0.9235513715305955, - "punct": 0.9743504330446369 - }, - "opus100": { - "u": 0.9285542458503729, - "t": 0.9304347826086957, - "punct": 0.9535802469135802 - }, - "ud": { - "u": 0.9362934362934363, - "t": 0.9445262641138931, - "punct": 0.990872210953347 - } - }, - "uk": { - "opus100": { - "u": 0.888113514770877, - "t": 0.8894073728418105, - "punct": 0.9421487603305784 - }, - "ud": { - "u": 0.9084010840108401, - "t": 0.9214245336348219, - "punct": 0.9824759751271904 - } - }, - "ur": { - "opus100": { - "u": 0.48287385129490396, - "t": 0.4684684684684685, - "punct": 0.5970961887477314 - }, - "ud": { - "u": 0.898723404255319, - "t": 0.9757009345794393, - "punct": 0.9915966386554622 - } - }, - "uz": { - "opus100": { - "u": 0.766845804051261, - "t": 0.7797592510031208, - "punct": 0.8393186003683242 - } - }, - "vi": { - "opus100": { - "u": 0.9119754350051177, - "t": 0.9128310619696579, - "punct": 0.9480584972264247 - }, - "ud": { - "u": 0.8227979274611399, - "t": 0.920099875156055, - "punct": 0.9732753262896209 - } - }, - "xh": { - "opus100": { - "u": 0.80523890371089, - "t": 0.8054408549914986, - "punct": 0.8921444809295276 - } - }, - "yi": { - "opus100": { - "u": 0.6427492447129909, - "t": 0.652906976744186, - "punct": 0.7825383993532741 - } - }, - "yo": { - "opus100": { - "u": 0.744346824139421, - "t": null, - "punct": null - }, - "ud": { - "u": 0.8380187416331996, - "t": null, - "punct": null - } - }, - "zh": { - "ersatz": { - "u": 0.670755326016785, - "t": 0.8827721798967806, - "punct": 0.9657057654075547 - }, - "opus100": { - "u": 0.5328813559322033, - "t": 0.6854965254427259, - "punct": 0.8590943587149 - }, - "ud": { - "u": 0.7669543773119606, - "t": 0.9518935516888434, - "punct": 0.998998998998999 - } - }, - "zu": { - "opus100": { - "u": 0.8156606851549756, - "t": 0.8472081218274112, - "punct": 0.884227730863525 - } - } -} \ No newline at end of file diff --git a/wtpsplit/evaluation/evaluation_results/wtp-canine-s-1l_intrinsic_results.json b/wtpsplit/evaluation/evaluation_results/wtp-canine-s-1l_intrinsic_results.json deleted file mode 100644 index 4b672219..00000000 --- a/wtpsplit/evaluation/evaluation_results/wtp-canine-s-1l_intrinsic_results.json +++ /dev/null @@ -1,977 +0,0 @@ -{ - "af": { - "opus100": { - "u": 0.7420956362686177, - "t": 0.7782638728989177, - "punct": 0.8768171384850804 - }, - "ud": { - "u": 0.9770114942528736, - "t": 0.988262910798122, - "punct": 0.9269356597600872 - } - }, - "am": { - "opus100": { - "u": 0.5617293835068055, - "t": 0.6032897919690372, - "punct": 0.6987771400049912 - } - }, - "ar": { - "ersatz": { - "u": 0.8654750705550329, - "t": 0.886426592797784, - "punct": 0.9219021173203749 - }, - "opus100": { - "u": 0.6600936652699039, - "t": 0.6649006622516556, - "punct": 0.7645169261181122 - }, - "ud": { - "u": 0.7949940405244339, - "t": 0.8606726149622511, - "punct": 0.8821022727272727 - } - }, - "az": { - "opus100": { - "u": 0.7624693376941947, - "t": 0.7672318872495044, - "punct": 0.8343825665859564 - } - }, - "be": { - "opus100": { - "u": 0.7080066524114992, - "t": 0.7204857207105914, - "punct": 0.8685540950455005 - }, - "ud": { - "u": 0.8930648769574945, - "t": 0.8842471714534378, - "punct": 0.8937221947602572 - } - }, - "bg": { - "opus100": { - "u": 0.9364851957975167, - "t": 0.9243816254416961, - "punct": 0.9632713026444663 - }, - "ud": { - "u": 0.9753030983385721, - "t": 0.9753030983385721, - "punct": 0.9902048085485308 - } - }, - "bn": { - "opus100": { - "u": 0.7814313346228241, - "t": 0.8233046800382045, - "punct": 0.8573511543134872 - }, - "ud": { - "u": 0.9824561403508771, - "t": null, - "punct": null - } - }, - "ca": { - "opus100": { - "u": 0.8968421052631579, - "t": 0.9075750357313005, - "punct": 0.9443207126948775 - }, - "ud": { - "u": 0.9831415573989832, - "t": 0.9880952380952381, - "punct": 0.9994585814834868 - } - }, - "ceb": { - "ud": { - "u": 0.9973474801061007, - "t": null, - "punct": null - } - }, - "cs": { - "ersatz": { - "u": 0.9393761035903474, - "t": 0.9384025935750073, - "punct": 0.9877977919814063 - }, - "opus100": { - "u": 0.8925581395348837, - "t": 0.8954248366013072, - "punct": 0.9480712166172107 - }, - "ud": { - "u": 0.9229269243894116, - "t": 0.9236179908856746, - "punct": 0.9480958536342846 - } - }, - "cy": { - "opus100": { - "u": 0.7069065583284968, - "t": 0.75, - "punct": 0.8209157409916013 - }, - "ud": { - "u": 0.9900575614861329, - "t": 0.9846316905140434, - "punct": 0.9947478991596639 - } - }, - "da": { - "opus100": { - "u": 0.8970114942528735, - "t": 0.9105423808405729, - "punct": 0.9456415279138101 - }, - "ud": { - "u": 0.9496527777777778, - "t": 0.9471830985915494, - "punct": 0.983050847457627 - } - }, - "de": { - "ersatz": { - "u": 0.9505291656411519, - "t": 0.9494640122511485, - "punct": 0.9902813299232738 - }, - "opus100": { - "u": 0.7847177505902554, - "t": 0.8410329305851694, - "punct": 0.8890015205271161 - }, - "ud": { - "u": 0.9589457678661937, - "t": 0.9634585692228512, - "punct": 0.9666319082377477 - } - }, - "el": { - "opus100": { - "u": 0.9129213483146068, - "t": 0.9206349206349207, - "punct": 0.964180569185476 - }, - "ud": { - "u": 0.9804347826086957, - "t": 0.9690265486725664, - "punct": 0.9901853871319519 - } - }, - "en": { - "ersatz": { - "u": 0.9721938775510204, - "t": 0.9758585076168345, - "punct": 0.9859889214727924 - }, - "opus100": { - "u": 0.9175257731958764, - "t": 0.9081364829396325, - "punct": 0.9467745891586952 - }, - "ud": { - "u": 0.9438706780422093, - "t": 0.9437696806117859, - "punct": 0.9622119815668203 - } - }, - "eo": { - "opus100": { - "u": 0.9138014527845036, - "t": 0.9102256361017763, - "punct": 0.9537173806006892 - } - }, - "es": { - "ersatz": { - "u": 0.985855958380751, - "t": 0.9839080459770115, - "punct": 0.9949420786425193 - }, - "opus100": { - "u": 0.9188165680473372, - "t": 0.9259525521207764, - "punct": 0.9546574667323804 - }, - "ud": { - "u": 0.9640328518833191, - "t": 0.9702398150823461, - "punct": 0.9953569355774812 - } - }, - "et": { - "ersatz": { - "u": 0.9539136795903438, - "t": 0.9541927409261577, - "punct": 0.9817545613596601 - }, - "opus100": { - "u": 0.8535003354954147, - "t": 0.8789659224441833, - "punct": 0.9494799405646359 - }, - "ud": { - "u": 0.9103909934580862, - "t": 0.9140829862717877, - "punct": 0.9715274500550574 - } - }, - "eu": { - "opus100": { - "u": 0.8647409445208619, - "t": 0.8731673260414242, - "punct": 0.91796875 - }, - "ud": { - "u": 0.9699945444626297, - "t": 0.9735537190082645, - "punct": 0.999166435120867 - } - }, - "fa": { - "opus100": { - "u": 0.6019235280319024, - "t": 0.6049966239027684, - "punct": 0.7139376218323586 - }, - "ud": { - "u": 0.9706275033377837, - "t": 0.9816700610997964, - "punct": 1.0 - } - }, - "fi": { - "ersatz": { - "u": 0.9709645669291339, - "t": 0.9716678984971668, - "punct": 0.992496248124062 - }, - "opus100": { - "u": 0.9145259224661373, - "t": 0.9301990885104342, - "punct": 0.9606453189929113 - }, - "ud": { - "u": 0.9281288723667905, - "t": 0.9293252318516149, - "punct": 0.9751211631663974 - } - }, - "fr": { - "ersatz": { - "u": 0.9668953176260066, - "t": 0.9659574468085106, - "punct": 0.9894610057211684 - }, - "opus100": { - "u": 0.8863897176956621, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9660023446658851, - "t": 0.9704142011834319, - "punct": 0.9805825242718447 - } - }, - "fy": { - "opus100": { - "u": 0.5708206686930092, - "t": 0.6727456940222897, - "punct": 0.8719723183391004 - } - }, - "ga": { - "opus100": { - "u": 0.8113671962407697, - "t": 0.8084350721420643, - "punct": 0.892245720040282 - }, - "ud": { - "u": 0.8832188420019628, - "t": 0.9167523124357656, - "punct": 0.986784140969163 - } - }, - "gd": { - "opus100": { - "u": 0.823216659282233, - "t": 0.8395270270270271, - "punct": 0.9250693802035153 - }, - "ud": { - "u": 0.6980728051391863, - "t": 0.7088830255057167, - "punct": 0.8056603773584906 - } - }, - "gl": { - "opus100": { - "u": 0.8891498356035699, - "t": 0.8920320603488922, - "punct": 0.9407443682664055 - }, - "ud": { - "u": 0.9788819875776398, - "t": 0.9849246231155778, - "punct": 0.9800498753117208 - } - }, - "gu": { - "ersatz": { - "u": 0.8846153846153847, - "t": 0.8836987607244995, - "punct": 0.9581256231306081 - }, - "opus100": { - "u": 0.70703125, - "t": 0.7075030750307503, - "punct": 0.7653319817536748 - } - }, - "ha": { - "opus100": { - "u": 0.8395273899033299, - "t": 0.8765730880929332, - "punct": 0.9144641999038924 - } - }, - "he": { - "opus100": { - "u": 0.9105999525729191, - "t": 0.9007955077211044, - "punct": 0.9437810945273631 - }, - "ud": { - "u": 0.9529702970297029, - "t": 0.9514066496163683, - "punct": 0.9664082687338501 - } - }, - "hi": { - "ersatz": { - "u": 0.9329789253844694, - "t": 0.9425874950845458, - "punct": 0.9639877924720244 - }, - "opus100": { - "u": 0.6672748004561003, - "t": 0.6463286345466022, - "punct": 0.7571022727272727 - }, - "ud": { - "u": 0.9494430162810625, - "t": 0.9501871580765909, - "punct": 0.9985158800831107 - } - }, - "hu": { - "opus100": { - "u": 0.9187793427230048, - "t": 0.9283341243474134, - "punct": 0.9637610186092066 - }, - "ud": { - "u": 0.9640130861504908, - "t": 0.9614512471655328, - "punct": 0.9921436588103255 - } - }, - "hy": { - "opus100": { - "u": 0.8674359318395276, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9461847389558233, - "t": 0.9548810500410173, - "punct": 0.9839119390347164 - } - }, - "id": { - "opus100": { - "u": 0.8858513189448441, - "t": 0.8977690610443735, - "punct": 0.9436970602889885 - }, - "ud": { - "u": 0.9826818406729342, - "t": null, - "punct": null - } - }, - "ig": { - "opus100": { - "u": 0.8112522686025408, - "t": 0.8237716975581052, - "punct": 0.9050961245041195 - } - }, - "is": { - "opus100": { - "u": 0.9439884113954611, - "t": 0.9482129832239241, - "punct": 0.9674418604651163 - }, - "ud": { - "u": 0.8910681155579113, - "t": 0.9038496294957528, - "punct": 0.9645484949832775 - } - }, - "it": { - "opus100": { - "u": 0.8680679785330948, - "t": 0.8866312629877627, - "punct": 0.938319572608062 - }, - "ud": { - "u": 0.9554013875123885, - "t": 0.964964964964965, - "punct": 0.9968976215098242 - } - }, - "ja": { - "ersatz": { - "u": 0.7773820124666074, - "t": 0.7816901408450704, - "punct": 0.9419047619047619 - }, - "opus100": { - "u": 0.39716312056737585, - "t": 0.8076099881093936, - "punct": 0.8682092555331992 - }, - "ud": { - "u": 0.959780621572212, - "t": 0.9589552238805971, - "punct": 0.9812734082397003 - } - }, - "jv": { - "ud": { - "u": 0.9803921568627451, - "t": null, - "punct": null - } - }, - "ka": { - "opus100": { - "u": 0.9109720885466795, - "t": 0.9109720885466795, - "punct": 0.9339901477832512 - } - }, - "kk": { - "ersatz": { - "u": 0.96575682382134, - "t": 0.9651741293532339, - "punct": 0.9960079840319361 - }, - "opus100": { - "u": 0.7331362536952432, - "t": 0.7604485219164118, - "punct": 0.9143452225982817 - }, - "ud": { - "u": 0.9588122605363985, - "t": 0.8973697782362042, - "punct": 0.9686444766039556 - } - }, - "km": { - "ersatz": { - "u": 0.8365727885759618, - "t": 0.9129540781357094, - "punct": 0.9145548329221798 - }, - "opus100": { - "u": 0.6985555292446128, - "t": 0.6936887921653971, - "punct": 0.7782305005820722 - } - }, - "kn": { - "opus100": { - "u": 0.630221763795771, - "t": 0.619047619047619, - "punct": 0.7423631123919306 - } - }, - "ko": { - "opus100": { - "u": 0.5872367180132034, - "t": 0.718957345971564, - "punct": 0.8185053380782917 - }, - "ud": { - "u": 0.9923830250272035, - "t": 0.9923664122137404, - "punct": 0.999343975508419 - } - }, - "ku": { - "opus100": { - "u": 0.7883442511859047, - "t": 0.6757634827810266, - "punct": 0.8596004439511654 - } - }, - "ky": { - "opus100": { - "u": 0.8481471161883534, - "t": 0.846929950994523, - "punct": 0.9011715229798739 - } - }, - "la": { - "ud": { - "u": 0.8895768833849329, - "t": 0.9005442850074221, - "punct": 0.9637010676156584 - } - }, - "lt": { - "ersatz": { - "u": 0.9542547958681751, - "t": 0.9616548940464178, - "punct": 0.9880239520958083 - }, - "opus100": { - "u": 0.8007630351844001, - "t": 0.852879459081371, - "punct": 0.9073146292585171 - }, - "ud": { - "u": 0.9691756272401434, - "t": 0.9742457689477556, - "punct": 0.9867060561299852 - } - }, - "lv": { - "ersatz": { - "u": 0.9660322813779813, - "t": 0.9711182424092817, - "punct": 0.991861898890259 - }, - "opus100": { - "u": 0.8035065212743212, - "t": 0.8612485276796231, - "punct": 0.9110127826941986 - }, - "ud": { - "u": 0.9685426920607747, - "t": 0.9675338636852291, - "punct": 0.9896818572656921 - } - }, - "mg": { - "opus100": { - "u": 0.8945538818076477, - "t": 0.9123649459783912, - "punct": 0.9544436146377894 - } - }, - "mk": { - "opus100": { - "u": 0.9199255121042831, - "t": 0.9237605238540691, - "punct": 0.9552529182879378 - } - }, - "ml": { - "opus100": { - "u": 0.8050056882821388, - "t": 0.8173913043478263, - "punct": 0.8637905962190984 - } - }, - "mn": { - "opus100": { - "u": 0.8910990809517814, - "t": null, - "punct": null - } - }, - "mr": { - "opus100": { - "u": 0.8853410740203193, - "t": 0.8879668049792531, - "punct": 0.9265986563821846 - }, - "ud": { - "u": 0.9375000000000001, - "t": 0.9484536082474226, - "punct": 0.967032967032967 - } - }, - "ms": { - "opus100": { - "u": 0.8697967086156824, - "t": 0.8790720631786771, - "punct": 0.9389505549949545 - } - }, - "mt": { - "opus100": { - "u": 0.6812080536912751, - "t": 0.8060208788540908, - "punct": 0.8860510805500983 - }, - "ud": { - "u": 0.8943396226415093, - "t": 0.888671875, - "punct": 0.9268292682926829 - } - }, - "my": { - "opus100": { - "u": 0.6952129995608257, - "t": 0.7625099285146942, - "punct": 0.8162722680138261 - } - }, - "ne": { - "opus100": { - "u": 0.7056128293241695, - "t": 0.7156374501992032, - "punct": 0.7552742616033755 - } - }, - "nl": { - "opus100": { - "u": 0.9158878504672898, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9261410788381743, - "t": 0.9095315024232633, - "punct": 0.959731543624161 - } - }, - "no": { - "opus100": { - "u": 0.9466827503015681, - "t": 0.9480048367593711, - "punct": 0.9666503187837175 - }, - "ud": { - "u": 0.986060918946825, - "t": 0.9865841073271413, - "punct": 0.9930286599535244 - } - }, - "pa": { - "opus100": { - "u": 0.6444677503932879, - "t": 0.6651917404129793, - "punct": 0.7445652173913044 - } - }, - "pl": { - "ersatz": { - "u": 0.9324191968658179, - "t": 0.9334006054490415, - "punct": 0.9835082458770614 - }, - "opus100": { - "u": 0.9196768060836503, - "t": 0.9248499399759904, - "punct": 0.9605295415543025 - }, - "ud": { - "u": 0.9507984462667242, - "t": 0.961386573058359, - "punct": 0.9925356254241121 - } - }, - "ps": { - "ersatz": { - "u": 0.8551477752517052, - "t": 0.9231605654222546, - "punct": 0.9651722860318638 - }, - "opus100": { - "u": 0.6923990498812351, - "t": 0.7177963813124493, - "punct": 0.7615819209039547 - } - }, - "pt": { - "opus100": { - "u": 0.9125849543004453, - "t": 0.9237389433420989, - "punct": 0.9562454167685163 - }, - "ud": { - "u": 0.9558198810535259, - "t": 0.9543320529236022, - "punct": 0.9810834049871022 - } - }, - "ro": { - "ersatz": { - "u": 0.975202553400442, - "t": 0.9683417085427135, - "punct": 0.9927770859277708 - }, - "opus100": { - "u": 0.8944099378881988, - "t": 0.8976668976668978, - "punct": 0.9687344913151364 - }, - "ud": { - "u": 0.8205928237129485, - "t": 0.902360018509949, - "punct": 0.9933269780743567 - } - }, - "ru": { - "ersatz": { - "u": 0.9704852426213106, - "t": 0.970126582278481, - "punct": 0.9878172588832487 - }, - "opus100": { - "u": 0.8193450444589025, - "t": null, - "punct": null - }, - "ud": { - "u": 0.8481675392670156, - "t": 0.8569868995633187, - "punct": 0.920045045045045 - } - }, - "si": { - "opus100": { - "u": 0.7944234404536862, - "t": 0.8042669312825601, - "punct": 0.8565022421524664 - } - }, - "sk": { - "opus100": { - "u": 0.9057481964161043, - "t": 0.9260247392675236, - "punct": 0.9619940769990128 - }, - "ud": { - "u": 0.9639126305792972, - "t": 0.9610511496949787, - "punct": 0.9733079122974261 - } - }, - "sl": { - "opus100": { - "u": 0.9158091674462114, - "t": 0.9274327861051629, - "punct": 0.9534658264663112 - }, - "ud": { - "u": 0.9596958174904943, - "t": 0.9631067961165048, - "punct": 0.9937353171495693 - } - }, - "sq": { - "opus100": { - "u": 0.8980446927374303, - "t": 0.91, - "punct": 0.9563501849568434 - }, - "ud": { - "u": 1.0, - "t": null, - "punct": null - } - }, - "sr": { - "opus100": { - "u": 0.9437664513041397, - "t": 0.9466089466089466, - "punct": 0.9656357388316151 - }, - "ud": { - "u": 0.9657142857142856, - "t": 0.9720347155255544, - "punct": 0.9961538461538462 - } - }, - "sv": { - "opus100": { - "u": 0.9149382428338384, - "t": 0.929845422116528, - "punct": 0.9575402635431918 - }, - "ud": { - "u": 0.9486940298507464, - "t": 0.9532357109116675, - "punct": 0.9679272379128769 - } - }, - "ta": { - "ersatz": { - "u": 0.939667458432304, - "t": 0.9552683896620278, - "punct": 0.9812807881773399 - }, - "opus100": { - "u": 0.6387655886704714, - "t": 0.632155573815393, - "punct": 0.7176079734219268 - }, - "ud": { - "u": 0.975609756097561, - "t": 0.9752066115702478, - "punct": 0.975 - } - }, - "te": { - "opus100": { - "u": 0.7686472819216182, - "t": 0.7740932642487047, - "punct": 0.8409212640599893 - } - }, - "tg": { - "opus100": { - "u": 0.810386209906175, - "t": 0.8406296114117068, - "punct": 0.9097187962506166 - } - }, - "th": { - "opus100": { - "u": 0.691788126555279, - "t": 0.7067812798471823, - "punct": 0.7247459033395561 - }, - "ud": { - "u": 0.6891939457937346, - "t": null, - "punct": null - } - }, - "tr": { - "ersatz": { - "u": 0.9204453767952236, - "t": 0.9173676397822861, - "punct": 0.981113320079523 - }, - "opus100": { - "u": 0.9301088270858525, - "t": 0.9315201554152501, - "punct": 0.9592693162182178 - }, - "ud": { - "u": 0.9344422700587085, - "t": 0.9364919354838711, - "punct": 0.9918616480162767 - } - }, - "uk": { - "opus100": { - "u": 0.8924581005586593, - "t": 0.895278167367929, - "punct": 0.9435721295387636 - }, - "ud": { - "u": 0.9228206551915603, - "t": 0.9207419898819562, - "punct": 0.9842873176206509 - } - }, - "ur": { - "opus100": { - "u": 0.529923830250272, - "t": 0.50790413054564, - "punct": 0.6368869445084043 - }, - "ud": { - "u": 0.9058219178082192, - "t": 0.9517743403093722, - "punct": 0.9924953095684803 - } - }, - "uz": { - "opus100": { - "u": 0.7745688759609392, - "t": 0.7889528193325662, - "punct": 0.8643984220907297 - } - }, - "vi": { - "opus100": { - "u": 0.906210711591652, - "t": 0.9101552557902773, - "punct": 0.9494241362043064 - }, - "ud": { - "u": 0.8260416666666667, - "t": 0.9340659340659341, - "punct": 0.9757311761045426 - } - }, - "xh": { - "opus100": { - "u": 0.798978644382544, - "t": 0.8280159521435693, - "punct": 0.9070841889117043 - } - }, - "yi": { - "opus100": { - "u": 0.7341115434500648, - "t": 0.7322368421052632, - "punct": 0.7883271681052199 - } - }, - "yo": { - "opus100": { - "u": 0.7457842018511475, - "t": null, - "punct": null - }, - "ud": { - "u": 0.8339973439575034, - "t": null, - "punct": null - } - }, - "zh": { - "ersatz": { - "u": 0.8518105849582172, - "t": 0.90963554667998, - "punct": 0.9715852442671984 - }, - "opus100": { - "u": 0.7765714285714286, - "t": 0.7954492686324588, - "punct": 0.8854457708915418 - }, - "ud": { - "u": 0.8847006651884702, - "t": 0.9681908548707753, - "punct": 0.9979959919839679 - } - }, - "zu": { - "opus100": { - "u": 0.7584795321637428, - "t": 0.8374281233664402, - "punct": 0.8971506105834465 - } - } -} \ No newline at end of file diff --git a/wtpsplit/evaluation/evaluation_results/wtp-canine-s-3l-no-adapters_intrinsic_results.json b/wtpsplit/evaluation/evaluation_results/wtp-canine-s-3l-no-adapters_intrinsic_results.json deleted file mode 100644 index 2dbcb95c..00000000 --- a/wtpsplit/evaluation/evaluation_results/wtp-canine-s-3l-no-adapters_intrinsic_results.json +++ /dev/null @@ -1,977 +0,0 @@ -{ - "af": { - "opus100": { - "u": 0.7701346389228886, - "t": 0.7708779443254818, - "punct": 0.8685626441199078 - }, - "ud": { - "u": 0.9747706422018348, - "t": 0.985981308411215, - "punct": 1.0 - } - }, - "am": { - "opus100": { - "u": 0.5893962201036963, - "t": 0.6090457021075064, - "punct": 0.6816969696969697 - } - }, - "ar": { - "ersatz": { - "u": 0.8697866921362624, - "t": 0.890282131661442, - "punct": 0.9275660830758667 - }, - "opus100": { - "u": 0.6372828623282344, - "t": 0.6391184573002755, - "punct": 0.7223320158102767 - }, - "ud": { - "u": 0.7742316784869976, - "t": 0.8473177441540577, - "punct": 0.8597475455820476 - } - }, - "az": { - "opus100": { - "u": 0.7387944358578052, - "t": 0.7722955756312253, - "punct": 0.8295640009326184 - } - }, - "be": { - "opus100": { - "u": 0.6746268656716418, - "t": 0.7104464672735153, - "punct": 0.880161127895267 - }, - "ud": { - "u": 0.8811002661934338, - "t": 0.874067573497148, - "punct": 0.9076848719188013 - } - }, - "bg": { - "opus100": { - "u": 0.9296577946768061, - "t": 0.9182242990654205, - "punct": 0.9632352941176471 - }, - "ud": { - "u": 0.9745876058849755, - "t": 0.973142345568487, - "punct": 0.9901610017889089 - } - }, - "bn": { - "opus100": { - "u": 0.776029316663074, - "t": 0.8268129770992367, - "punct": 0.8563078421821724 - }, - "ud": { - "u": 0.9911504424778761, - "t": null, - "punct": null - } - }, - "ca": { - "opus100": { - "u": 0.8885780885780886, - "t": 0.8973203699312308, - "punct": 0.9386954389406572 - }, - "ud": { - "u": 0.9727729315358181, - "t": 0.9804551539491297, - "punct": 0.9989171629669735 - } - }, - "ceb": { - "ud": { - "u": 0.9973474801061007, - "t": null, - "punct": null - } - }, - "cs": { - "ersatz": { - "u": 0.9497822931785196, - "t": 0.9433186490455213, - "punct": 0.988070992144312 - }, - "opus100": { - "u": 0.8970792767732962, - "t": 0.9050751879699247, - "punct": 0.9456710493674026 - }, - "ud": { - "u": 0.920630344960123, - "t": 0.9218773280357586, - "punct": 0.9454995054401583 - } - }, - "cy": { - "opus100": { - "u": 0.661896243291592, - "t": 0.7173857544038805, - "punct": 0.7998918334234723 - }, - "ud": { - "u": 0.9864442127215851, - "t": 0.9792442788717403, - "punct": 0.995260663507109 - } - }, - "da": { - "opus100": { - "u": 0.891555145362252, - "t": 0.909952606635071, - "punct": 0.9470051762385998 - }, - "ud": { - "u": 0.9414802065404476, - "t": 0.937112488928255, - "punct": 0.9857397504456328 - } - }, - "de": { - "ersatz": { - "u": 0.9457743038593063, - "t": 0.9530776992936428, - "punct": 0.9890334098444274 - }, - "opus100": { - "u": 0.7821229050279329, - "t": 0.8537954600927508, - "punct": 0.8872987477638641 - }, - "ud": { - "u": 0.9562154001006543, - "t": 0.9631551634665283, - "punct": 0.9608879492600424 - } - }, - "el": { - "opus100": { - "u": 0.9134705332086063, - "t": 0.9295774647887324, - "punct": 0.9621993127147765 - }, - "ud": { - "u": 0.9725576289791438, - "t": 0.9767955801104973, - "punct": 0.9779735682819383 - } - }, - "en": { - "ersatz": { - "u": 0.9634370445472403, - "t": 0.9707096774193549, - "punct": 0.9843414982782145 - }, - "opus100": { - "u": 0.9110127826941987, - "t": 0.9054151624548736, - "punct": 0.9452054794520548 - }, - "ud": { - "u": 0.9384822631342613, - "t": 0.938093086308179, - "punct": 0.9613970588235294 - } - }, - "eo": { - "opus100": { - "u": 0.9091352009744215, - "t": 0.9090909090909092, - "punct": 0.9538613372810264 - } - }, - "es": { - "ersatz": { - "u": 0.9824902723735409, - "t": 0.9846605744125326, - "punct": 0.9921285667431944 - }, - "opus100": { - "u": 0.9048288795124237, - "t": 0.9200191570881225, - "punct": 0.9515375153751536 - }, - "ud": { - "u": 0.9611923509561304, - "t": 0.9742103738046943, - "punct": 0.9956483899042645 - } - }, - "et": { - "ersatz": { - "u": 0.9511432009626957, - "t": 0.9578736208625879, - "punct": 0.9755489021956087 - }, - "opus100": { - "u": 0.853019538188277, - "t": 0.8939539347408828, - "punct": 0.944059405940594 - }, - "ud": { - "u": 0.9089266897486075, - "t": 0.9218362282878413, - "punct": 0.9733270940570894 - } - }, - "eu": { - "opus100": { - "u": 0.8698471514590088, - "t": 0.8734827264239028, - "punct": 0.9236399121737008 - }, - "ud": { - "u": 0.9670804101457097, - "t": 0.9828634604754007, - "punct": 0.9994444444444445 - } - }, - "fa": { - "opus100": { - "u": 0.5678776290630975, - "t": 0.5673146148308135, - "punct": 0.6987437781464803 - }, - "ud": { - "u": 0.9635036496350364, - "t": 0.9786946229286438, - "punct": 0.9989701338825953 - } - }, - "fi": { - "ersatz": { - "u": 0.9624206930209858, - "t": 0.9717993511355129, - "punct": 0.9879336349924586 - }, - "opus100": { - "u": 0.9130028063610852, - "t": 0.93125, - "punct": 0.9590786571918647 - }, - "ud": { - "u": 0.9316846986089645, - "t": 0.943579766536965, - "punct": 0.9748224661071659 - } - }, - "fr": { - "ersatz": { - "u": 0.968564650059312, - "t": 0.9532073132940813, - "punct": 0.9803089972735534 - }, - "opus100": { - "u": 0.8812243033348561, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9602803738317758, - "t": 0.9704142011834319, - "punct": 0.9854368932038836 - } - }, - "fy": { - "opus100": { - "u": 0.5774735532047294, - "t": 0.6527173913043479, - "punct": 0.864353312302839 - } - }, - "ga": { - "opus100": { - "u": 0.7839422643211547, - "t": 0.787685290763968, - "punct": 0.8676907829534193 - }, - "ud": { - "u": 0.8814887365328109, - "t": 0.9118852459016393, - "punct": 0.9867549668874173 - } - }, - "gd": { - "opus100": { - "u": 0.7910787437414656, - "t": 0.8248355263157895, - "punct": 0.910174152153987 - }, - "ud": { - "u": 0.6651270207852193, - "t": 0.7023346303501946, - "punct": 0.7568627450980393 - } - }, - "gl": { - "opus100": { - "u": 0.8875878220140515, - "t": 0.891463703003074, - "punct": 0.9388154674498288 - }, - "ud": { - "u": 0.9656019656019655, - "t": 0.9774436090225564, - "punct": 0.978776529338327 - } - }, - "gu": { - "ersatz": { - "u": 0.8834467120181406, - "t": 0.8734852157052836, - "punct": 0.9563492063492064 - }, - "opus100": { - "u": 0.685401286020481, - "t": 0.6860209807270067, - "punct": 0.7549094618719714 - } - }, - "ha": { - "opus100": { - "u": 0.8144286905754795, - "t": 0.8667301285102332, - "punct": 0.9117994797824545 - } - }, - "he": { - "opus100": { - "u": 0.9115065243179121, - "t": 0.9014282369468508, - "punct": 0.9440120512176752 - }, - "ud": { - "u": 0.9258809234507898, - "t": 0.9538461538461538, - "punct": 0.9608355091383812 - } - }, - "hi": { - "ersatz": { - "u": 0.9303030303030302, - "t": 0.9424290220820191, - "punct": 0.9502151198524893 - }, - "opus100": { - "u": 0.6432616081540204, - "t": 0.6172890447699607, - "punct": 0.7322156773901224 - }, - "ud": { - "u": 0.9461756373937679, - "t": 0.9477633477633477, - "punct": 0.9991084695393759 - } - }, - "hu": { - "opus100": { - "u": 0.9239516702203269, - "t": 0.9296446458383019, - "punct": 0.9624170965364776 - }, - "ud": { - "u": 0.9664864864864865, - "t": 0.9800884955752213, - "punct": 0.9933333333333333 - } - }, - "hy": { - "opus100": { - "u": 0.8618930913426508, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9476228847703466, - "t": 0.9551227773073667, - "punct": 0.9786871270247229 - } - }, - "id": { - "opus100": { - "u": 0.8884078884078884, - "t": 0.900518390520859, - "punct": 0.9399152331089504 - }, - "ud": { - "u": 0.976401179941003, - "t": null, - "punct": null - } - }, - "ig": { - "opus100": { - "u": 0.8144391408114559, - "t": 0.8223645894001166, - "punct": 0.8930370370370371 - } - }, - "is": { - "opus100": { - "u": 0.9448091417456844, - "t": 0.9481915933528837, - "punct": 0.964972866304884 - }, - "ud": { - "u": 0.8985738329347152, - "t": 0.9147201612164514, - "punct": 0.9667300380228137 - } - }, - "it": { - "opus100": { - "u": 0.8579684374305402, - "t": 0.8924581005586593, - "punct": 0.9356866537717602 - }, - "ud": { - "u": 0.9432485322896282, - "t": 0.9460255152109912, - "punct": 0.9948293691830403 - } - }, - "ja": { - "ersatz": { - "u": 0.8029969149405025, - "t": 0.8023360287511231, - "punct": 0.9341029341029342 - }, - "opus100": { - "u": 0.3294987674609696, - "t": 0.778698224852071, - "punct": 0.8663291139240507 - }, - "ud": { - "u": 0.9651376146788991, - "t": 0.9612842304060436, - "punct": 0.9850467289719625 - } - }, - "jv": { - "ud": { - "u": 0.9727626459143969, - "t": null, - "punct": null - } - }, - "ka": { - "opus100": { - "u": 0.9047146401985112, - "t": 0.9100935027571326, - "punct": 0.9345794392523363 - } - }, - "kk": { - "ersatz": { - "u": 0.9699062654168722, - "t": 0.9633718013045659, - "punct": 0.999001996007984 - }, - "opus100": { - "u": 0.7373599344083083, - "t": 0.7534391534391534, - "punct": 0.8953125 - }, - "ud": { - "u": 0.9686311787072244, - "t": 0.8829396325459317, - "punct": 0.9827751196172249 - } - }, - "km": { - "ersatz": { - "u": 0.8258013139558033, - "t": 0.9085872576177286, - "punct": 0.9106940990278092 - }, - "opus100": { - "u": 0.6660386245878475, - "t": 0.6687402799377916, - "punct": 0.7521222410865874 - } - }, - "kn": { - "opus100": { - "u": 0.6121521862578081, - "t": 0.6131221719457014, - "punct": 0.7581920903954802 - } - }, - "ko": { - "opus100": { - "u": 0.5417354008578028, - "t": 0.6965930018416205, - "punct": 0.7963224893917963 - }, - "ud": { - "u": 0.9917534722222222, - "t": 0.9956331877729259, - "punct": 0.9995627459554001 - } - }, - "ku": { - "opus100": { - "u": 0.7855203619909502, - "t": 0.6082229018008835, - "punct": 0.7933734939759035 - } - }, - "ky": { - "opus100": { - "u": 0.8078962875662936, - "t": 0.8171428571428572, - "punct": 0.8984560570071259 - } - }, - "la": { - "ud": { - "u": 0.9217512808570099, - "t": 0.9191652674502279, - "punct": 0.9619834710743801 - } - }, - "lt": { - "ersatz": { - "u": 0.9547004383828543, - "t": 0.9541099344427634, - "punct": 0.9813036887316827 - }, - "opus100": { - "u": 0.7984726347051336, - "t": 0.8463138433094523, - "punct": 0.9045705675539929 - }, - "ud": { - "u": 0.9635974304068522, - "t": 0.9772893772893771, - "punct": 0.9853157121879589 - } - }, - "lv": { - "ersatz": { - "u": 0.9620739318290926, - "t": 0.9721880383952745, - "punct": 0.9916049382716049 - }, - "opus100": { - "u": 0.8126361655773421, - "t": 0.852328574771823, - "punct": 0.9088657751362059 - }, - "ud": { - "u": 0.9633886760323541, - "t": 0.9628363947031183, - "punct": 0.9896774193548387 - } - }, - "mg": { - "opus100": { - "u": 0.8471001757469244, - "t": 0.8986197049024274, - "punct": 0.9498011928429423 - } - }, - "mk": { - "opus100": { - "u": 0.9210037174721191, - "t": 0.9295973628443608, - "punct": 0.9560999272374485 - } - }, - "ml": { - "opus100": { - "u": 0.8013714285714285, - "t": 0.8031901209158734, - "punct": 0.8584344515822032 - } - }, - "mn": { - "opus100": { - "u": 0.8857689853222718, - "t": null, - "punct": null - } - }, - "mr": { - "opus100": { - "u": 0.8993055555555556, - "t": 0.9010715175679043, - "punct": 0.9312089023773394 - }, - "ud": { - "u": 0.9591836734693878, - "t": 0.9591836734693878, - "punct": 0.9072164948453608 - } - }, - "ms": { - "opus100": { - "u": 0.8704914064391188, - "t": 0.8816936488169366, - "punct": 0.9371714643304129 - } - }, - "mt": { - "opus100": { - "u": 0.7165712028441636, - "t": 0.8082553395728342, - "punct": 0.8763440860215054 - }, - "ud": { - "u": 0.8949343339587242, - "t": 0.888888888888889, - "punct": 0.9297725024727992 - } - }, - "my": { - "opus100": { - "u": 0.6458238247375627, - "t": 0.684446791655664, - "punct": 0.782016348773842 - } - }, - "ne": { - "opus100": { - "u": 0.6947096774193549, - "t": 0.6931137724550899, - "punct": 0.7301333333333333 - } - }, - "nl": { - "opus100": { - "u": 0.913953488372093, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9240816326530612, - "t": 0.9117174959871589, - "punct": 0.9633333333333334 - } - }, - "no": { - "opus100": { - "u": 0.9477503628447024, - "t": 0.9515697249939159, - "punct": 0.9650934119960668 - }, - "ud": { - "u": 0.9832344596337376, - "t": 0.9832084732627228, - "punct": 0.990735975295934 - } - }, - "pa": { - "opus100": { - "u": 0.6385772913816689, - "t": 0.6572366759282647, - "punct": 0.7567975830815711 - } - }, - "pl": { - "ersatz": { - "u": 0.9323529411764706, - "t": 0.9299232736572889, - "punct": 0.9789789789789789 - }, - "opus100": { - "u": 0.9179548156956004, - "t": 0.9296931625996617, - "punct": 0.9601564409679785 - }, - "ud": { - "u": 0.9520311149524633, - "t": 0.9672167216721673, - "punct": 0.9918256130790192 - } - }, - "ps": { - "ersatz": { - "u": 0.8642833498841443, - "t": 0.9276100400145507, - "punct": 0.9518977827884254 - }, - "opus100": { - "u": 0.5157048940832725, - "t": 0.6554149085794656, - "punct": 0.7363353157745681 - } - }, - "pt": { - "opus100": { - "u": 0.9082847141190199, - "t": 0.9231505865453674, - "punct": 0.9548670407416443 - }, - "ud": { - "u": 0.9543918918918918, - "t": 0.9570815450643777, - "punct": 0.9780267126238691 - } - }, - "ro": { - "ersatz": { - "u": 0.9725490196078431, - "t": 0.9699570815450644, - "punct": 0.9937640309304066 - }, - "opus100": { - "u": 0.8811568007230005, - "t": 0.897761366258943, - "punct": 0.9694258016405668 - }, - "ud": { - "u": 0.8193146417445483, - "t": 0.9202965708989805, - "punct": 0.9942748091603053 - } - }, - "ru": { - "ersatz": { - "u": 0.971201588877855, - "t": 0.9711099847947289, - "punct": 0.9913924050632911 - }, - "opus100": { - "u": 0.8155799440499248, - "t": null, - "punct": null - }, - "ud": { - "u": 0.8481078278900985, - "t": 0.8542825361512792, - "punct": 0.9260523321956768 - } - }, - "si": { - "opus100": { - "u": 0.8037518037518039, - "t": 0.8187075685189842, - "punct": 0.854235563469855 - } - }, - "sk": { - "opus100": { - "u": 0.9109557109557108, - "t": 0.9373177842565598, - "punct": 0.9625615763546799 - }, - "ud": { - "u": 0.9527856468366382, - "t": 0.9534662867996202, - "punct": 0.9738965353583293 - } - }, - "sl": { - "opus100": { - "u": 0.9202830188679246, - "t": 0.9307120594581635, - "punct": 0.9554852833860374 - }, - "ud": { - "u": 0.9609700644183402, - "t": 0.9692070823710548, - "punct": 0.9925810230378759 - } - }, - "sq": { - "opus100": { - "u": 0.8940520446096654, - "t": 0.9089609151572927, - "punct": 0.9535403726708075 - }, - "ud": { - "u": 1.0, - "t": null, - "punct": null - } - }, - "sr": { - "opus100": { - "u": 0.9409514702366722, - "t": 0.9454720153735288, - "punct": 0.9630905511811024 - }, - "ud": { - "u": 0.9690721649484535, - "t": 0.9691714836223506, - "punct": 0.9961612284069097 - } - }, - "sv": { - "opus100": { - "u": 0.9176580359225566, - "t": 0.9351519502273271, - "punct": 0.9597855227882036 - }, - "ud": { - "u": 0.9483480688692414, - "t": 0.9554924242424242, - "punct": 0.9699857074797522 - } - }, - "ta": { - "ersatz": { - "u": 0.9317004239284032, - "t": 0.957436154231347, - "punct": 0.9748644652538195 - }, - "opus100": { - "u": 0.612141652613828, - "t": 0.5931558935361216, - "punct": 0.7004936917169501 - }, - "ud": { - "u": 0.9795918367346939, - "t": 0.9745762711864409, - "punct": 0.995850622406639 - } - }, - "te": { - "opus100": { - "u": 0.7651533742331288, - "t": 0.7784743991640543, - "punct": 0.83663631494376 - } - }, - "tg": { - "opus100": { - "u": 0.782608695652174, - "t": 0.8194375925012333, - "punct": 0.9059454191033139 - } - }, - "th": { - "opus100": { - "u": 0.6879432624113475, - "t": 0.7043227665706052, - "punct": 0.7179281268821521 - }, - "ud": { - "u": 0.6533066132264529, - "t": null, - "punct": null - } - }, - "tr": { - "ersatz": { - "u": 0.9249680715197958, - "t": 0.9239612415831828, - "punct": 0.9788579990011653 - }, - "opus100": { - "u": 0.9300867888138862, - "t": 0.9349890430971513, - "punct": 0.9579624134520277 - }, - "ud": { - "u": 0.9383195726080622, - "t": 0.9455984174085065, - "punct": 0.9928861788617886 - } - }, - "uk": { - "opus100": { - "u": 0.8846776057314537, - "t": 0.892422825070159, - "punct": 0.9431929480901078 - }, - "ud": { - "u": 0.9097826086956523, - "t": 0.9246344206974128, - "punct": 0.9837716843872413 - } - }, - "ur": { - "opus100": { - "u": 0.47888888888888886, - "t": 0.48346636259977194, - "punct": 0.6070947462954648 - }, - "ud": { - "u": 0.9036658141517475, - "t": 0.9640552995391705, - "punct": 0.9934272300469483 - } - }, - "uz": { - "opus100": { - "u": 0.7796469366562824, - "t": 0.7956541840036986, - "punct": 0.8458646616541353 - } - }, - "vi": { - "opus100": { - "u": 0.9076768690416457, - "t": 0.9106317411402157, - "punct": 0.9501627848735286 - }, - "ud": { - "u": 0.8187919463087249, - "t": 0.9281210592686003, - "punct": 0.9743589743589743 - } - }, - "xh": { - "opus100": { - "u": 0.7868852459016394, - "t": 0.7881254488867608, - "punct": 0.891835673426937 - } - }, - "yi": { - "opus100": { - "u": 0.7192362093352193, - "t": 0.7173702868192071, - "punct": 0.7881388997268824 - } - }, - "yo": { - "opus100": { - "u": 0.7203116883116885, - "t": null, - "punct": null - }, - "ud": { - "u": 0.8770949720670391, - "t": null, - "punct": null - } - }, - "zh": { - "ersatz": { - "u": 0.8640350877192983, - "t": 0.9168968114486569, - "punct": 0.9507232164746261 - }, - "opus100": { - "u": 0.6613238349658652, - "t": 0.7077137546468402, - "punct": 0.8541140837960626 - }, - "ud": { - "u": 0.9270386266094421, - "t": 0.979020979020979, - "punct": 0.9979959919839679 - } - }, - "zu": { - "opus100": { - "u": 0.7799097065462754, - "t": 0.8419974391805378, - "punct": 0.897705802968961 - } - } -} \ No newline at end of file diff --git a/wtpsplit/evaluation/evaluation_results/wtp-canine-s-3l_intrinsic_results.json b/wtpsplit/evaluation/evaluation_results/wtp-canine-s-3l_intrinsic_results.json deleted file mode 100644 index 07263e1c..00000000 --- a/wtpsplit/evaluation/evaluation_results/wtp-canine-s-3l_intrinsic_results.json +++ /dev/null @@ -1,977 +0,0 @@ -{ - "af": { - "opus100": { - "u": 0.7576292559899118, - "t": 0.7782764811490125, - "punct": 0.8854139964111767 - }, - "ud": { - "u": 0.9803921568627451, - "t": 0.990632318501171, - "punct": 0.9988249118683901 - } - }, - "am": { - "opus100": { - "u": 0.6037735849056605, - "t": 0.651238139319602, - "punct": 0.7191977077363898 - } - }, - "ar": { - "ersatz": { - "u": 0.8783610755441741, - "t": 0.8893513701005896, - "punct": 0.929299796057104 - }, - "opus100": { - "u": 0.6617790811339198, - "t": 0.6642616642616642, - "punct": 0.771964461994077 - }, - "ud": { - "u": 0.8208955223880596, - "t": 0.8752556237218814, - "punct": 0.8822695035460993 - } - }, - "az": { - "opus100": { - "u": 0.7658971243418388, - "t": 0.760575858250277, - "punct": 0.8387096774193549 - } - }, - "be": { - "opus100": { - "u": 0.7314519799726901, - "t": 0.7421383647798743, - "punct": 0.8979489744872436 - }, - "ud": { - "u": 0.8978518193774659, - "t": 0.896431679721497, - "punct": 0.9214758751182591 - } - }, - "bg": { - "opus100": { - "u": 0.9368421052631579, - "t": 0.9333333333333333, - "punct": 0.9651474530831099 - }, - "ud": { - "u": 0.9815895823978447, - "t": 0.9811659192825112, - "punct": 0.9959695476936856 - } - }, - "bn": { - "opus100": { - "u": 0.7949826989619376, - "t": 0.8308966861598441, - "punct": 0.874031007751938 - }, - "ud": { - "u": 0.9411764705882353, - "t": null, - "punct": null - } - }, - "ca": { - "opus100": { - "u": 0.8868799258229022, - "t": 0.895662368112544, - "punct": 0.9450332758195712 - }, - "ud": { - "u": 0.984467059453669, - "t": 0.9851712051765974, - "punct": 0.9975669099756691 - } - }, - "ceb": { - "ud": { - "u": 0.9973474801061007, - "t": null, - "punct": null - } - }, - "cs": { - "ersatz": { - "u": 0.9371866705986435, - "t": 0.9411420204978037, - "punct": 0.9892722528269063 - }, - "opus100": { - "u": 0.8854333409558655, - "t": 0.9077177248052868, - "punct": 0.9523573200992557 - }, - "ud": { - "u": 0.9247068098945501, - "t": 0.9256609024050884, - "punct": 0.9550707312213589 - } - }, - "cy": { - "opus100": { - "u": 0.6982248520710058, - "t": 0.7564385577630611, - "punct": 0.8263244128891317 - }, - "ud": { - "u": 0.9921259842519685, - "t": 0.9889182058047494, - "punct": 0.9947478991596639 - } - }, - "da": { - "opus100": { - "u": 0.8922443376801648, - "t": 0.9088774072334428, - "punct": 0.9497672139181573 - }, - "ud": { - "u": 0.9521322889469105, - "t": 0.9456812110418521, - "punct": 0.9857142857142857 - } - }, - "de": { - "ersatz": { - "u": 0.9569093610698365, - "t": 0.9598586215602121, - "punct": 0.9933639612046963 - }, - "opus100": { - "u": 0.7846746575342467, - "t": 0.8578210301447899, - "punct": 0.9006893030380393 - }, - "ud": { - "u": 0.9565217391304347, - "t": 0.9576446280991735, - "punct": 0.9668595476065229 - } - }, - "el": { - "opus100": { - "u": 0.9173708920187793, - "t": 0.9256355428842956, - "punct": 0.9623103279490944 - }, - "ud": { - "u": 0.9736263736263737, - "t": 0.9767955801104973, - "punct": 0.9767955801104973 - } - }, - "en": { - "ersatz": { - "u": 0.9679229640966776, - "t": 0.9691804959313128, - "punct": 0.9870787611194078 - }, - "opus100": { - "u": 0.9132976132489041, - "t": 0.9031035299692016, - "punct": 0.9503335804299481 - }, - "ud": { - "u": 0.950316169828365, - "t": 0.9499323410013532, - "punct": 0.9668202764976959 - } - }, - "eo": { - "opus100": { - "u": 0.9157384987893463, - "t": 0.9122302158273381, - "punct": 0.9567367119901112 - } - }, - "es": { - "ersatz": { - "u": 0.9878068606730613, - "t": 0.9776156524622783, - "punct": 0.9952622120568535 - }, - "opus100": { - "u": 0.9080648953679754, - "t": 0.9211657907310081, - "punct": 0.9541554302525128 - }, - "ud": { - "u": 0.9673202614379085, - "t": 0.9711649365628605, - "punct": 0.9970980847359256 - } - }, - "et": { - "ersatz": { - "u": 0.9623430962343096, - "t": 0.9580629056415376, - "punct": 0.9799196787148594 - }, - "opus100": { - "u": 0.8395222584147666, - "t": 0.8861712135465661, - "punct": 0.9491358024691359 - }, - "ud": { - "u": 0.9297895222000307, - "t": 0.9316345556246116, - "punct": 0.9820564830706818 - } - }, - "eu": { - "opus100": { - "u": 0.8571428571428572, - "t": 0.8704088704088704, - "punct": 0.9221351616062683 - }, - "ud": { - "u": 0.9737274220032841, - "t": 0.9755292823755843, - "punct": 0.9988876529477196 - } - }, - "fa": { - "opus100": { - "u": 0.6115702479338843, - "t": 0.6123931623931623, - "punct": 0.736245572609209 - }, - "ud": { - "u": 0.9696767744085304, - "t": 0.9803122878479295, - "punct": 0.9989701338825953 - } - }, - "fi": { - "ersatz": { - "u": 0.9753208292201382, - "t": 0.9731493099121707, - "punct": 0.9939909864797195 - }, - "opus100": { - "u": 0.9153810191678354, - "t": 0.9309683604985617, - "punct": 0.961038961038961 - }, - "ud": { - "u": 0.9275541795665635, - "t": 0.9304403318442885, - "punct": 0.9820166987797045 - } - }, - "fr": { - "ersatz": { - "u": 0.9738251041046995, - "t": 0.9684848484848485, - "punct": 0.9861529199277544 - }, - "opus100": { - "u": 0.8793298618972153, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9672131147540984, - "t": 0.9705535924617198, - "punct": 0.9879518072289156 - } - }, - "fy": { - "opus100": { - "u": 0.4505531344284278, - "t": 0.6180443003781738, - "punct": 0.8858204992033988 - } - }, - "ga": { - "opus100": { - "u": 0.7910863509749304, - "t": 0.7861541728604798, - "punct": 0.8798208509579497 - }, - "ud": { - "u": 0.8566001899335233, - "t": 0.9133537206931701, - "punct": 0.9813800657174152 - } - }, - "gd": { - "opus100": { - "u": 0.8461873638344227, - "t": 0.8487603305785123, - "punct": 0.9292076887013596 - }, - "ud": { - "u": 0.7183406113537117, - "t": 0.7197943444730076, - "punct": 0.8119349005424955 - } - }, - "gl": { - "opus100": { - "u": 0.8943165805542509, - "t": 0.8980651250589902, - "punct": 0.9437377690802348 - }, - "ud": { - "u": 0.9777227722772278, - "t": 0.9887359198998749, - "punct": 0.9863354037267081 - } - }, - "gu": { - "ersatz": { - "u": 0.9050070191857744, - "t": 0.8962172647914647, - "punct": 0.9688581314878892 - }, - "opus100": { - "u": 0.7088068181818181, - "t": 0.7101935483870968, - "punct": 0.7777185501066096 - } - }, - "ha": { - "opus100": { - "u": 0.8412017167381975, - "t": 0.8953121204760748, - "punct": 0.9209770114942529 - } - }, - "he": { - "opus100": { - "u": 0.9083570750237417, - "t": 0.9014479215319944, - "punct": 0.9410007468259896 - }, - "ud": { - "u": 0.9554455445544554, - "t": 0.9633375474083439, - "punct": 0.9717948717948718 - } - }, - "hi": { - "ersatz": { - "u": 0.943570057581574, - "t": 0.9474102004365946, - "punct": 0.9643070787637088 - }, - "opus100": { - "u": 0.6668127053669223, - "t": 0.6614736842105263, - "punct": 0.7742243436754176 - }, - "ud": { - "u": 0.9646043165467627, - "t": 0.9705796679289251, - "punct": 0.9991095280498665 - } - }, - "hu": { - "opus100": { - "u": 0.9220291216533583, - "t": 0.9269330811066445, - "punct": 0.9649595687331537 - }, - "ud": { - "u": 0.9613259668508287, - "t": 0.9641255605381165, - "punct": 0.9933333333333333 - } - }, - "hy": { - "opus100": { - "u": 0.8625008196183855, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9633251833740832, - "t": 0.9630872483221476, - "punct": 0.980688497061293 - } - }, - "id": { - "opus100": { - "u": 0.8983463035019454, - "t": 0.9037037037037038, - "punct": 0.9440000000000001 - }, - "ud": { - "u": 0.9816922315685305, - "t": null, - "punct": null - } - }, - "ig": { - "opus100": { - "u": 0.7912087912087912, - "t": 0.8286805759623862, - "punct": 0.9076136021667168 - } - }, - "is": { - "opus100": { - "u": 0.944108395838374, - "t": 0.9483933787731257, - "punct": 0.9686428221460068 - }, - "ud": { - "u": 0.8689475963620615, - "t": 0.8974057669709844, - "punct": 0.9667875548768848 - } - }, - "it": { - "opus100": { - "u": 0.8585925435693801, - "t": 0.8928406466512703, - "punct": 0.9399806389157792 - }, - "ud": { - "u": 0.9432485322896282, - "t": 0.9432485322896282, - "punct": 0.9948400412796699 - } - }, - "ja": { - "ersatz": { - "u": 0.8153250107619457, - "t": 0.8259072117593018, - "punct": 0.9481132075471699 - }, - "opus100": { - "u": 0.4490106544901066, - "t": 0.8053498925244805, - "punct": 0.8741556167125344 - }, - "ud": { - "u": 0.9387755102040817, - "t": 0.9575645756457565, - "punct": 0.9803554724041159 - } - }, - "jv": { - "ud": { - "u": 0.9727626459143969, - "t": null, - "punct": null - } - }, - "ka": { - "opus100": { - "u": 0.9186159844054581, - "t": 0.9168680521991301, - "punct": 0.9319862000985708 - } - }, - "kk": { - "ersatz": { - "u": 0.9638916750250751, - "t": 0.957932083122149, - "punct": 0.9975062344139651 - }, - "opus100": { - "u": 0.7443946188340809, - "t": 0.7595262615859938, - "punct": 0.9244992295839752 - }, - "ud": { - "u": 0.9761677788369876, - "t": 0.8302094818081588, - "punct": 0.9724170172978027 - } - }, - "km": { - "ersatz": { - "u": 0.721262950172669, - "t": 0.9146118721461187, - "punct": 0.9199909848997071 - }, - "opus100": { - "u": 0.7292401787814632, - "t": 0.7204328781241948, - "punct": 0.7934352009054896 - } - }, - "kn": { - "opus100": { - "u": 0.6603245663122552, - "t": 0.6128029832193909, - "punct": 0.7856723429242514 - } - }, - "ko": { - "opus100": { - "u": 0.575307862330281, - "t": 0.7180198935924127, - "punct": 0.8258493703967689 - }, - "ud": { - "u": 0.9928245270711025, - "t": 0.9943231441048035, - "punct": 0.999343975508419 - } - }, - "ku": { - "opus100": { - "u": 0.7955575702629193, - "t": 0.6701030927835051, - "punct": 0.8486748361356512 - } - }, - "ky": { - "opus100": { - "u": 0.8532212885154062, - "t": 0.8516388729154686, - "punct": 0.9094117647058825 - } - }, - "la": { - "ud": { - "u": 0.8946171341925702, - "t": 0.9050144648023144, - "punct": 0.9727294285036756 - } - }, - "lt": { - "ersatz": { - "u": 0.9687344913151364, - "t": 0.966144517433047, - "punct": 0.9915127309036444 - }, - "opus100": { - "u": 0.7825199101490709, - "t": 0.8525271174705746, - "punct": 0.9072063178677195 - }, - "ud": { - "u": 0.9833694866232827, - "t": 0.981021897810219, - "punct": 0.9948717948717949 - } - }, - "lv": { - "ersatz": { - "u": 0.9718172983479106, - "t": 0.9725759059745348, - "punct": 0.9938164729161514 - }, - "opus100": { - "u": 0.7959824231010671, - "t": 0.8654254065519681, - "punct": 0.9191743347426015 - }, - "ud": { - "u": 0.9652807543934848, - "t": 0.9641485275288092, - "punct": 0.9907784687969119 - } - }, - "mg": { - "opus100": { - "u": 0.8997214484679665, - "t": 0.9208253358925144, - "punct": 0.9546693088927423 - } - }, - "mk": { - "opus100": { - "u": 0.9296105114969498, - "t": 0.9304837952090184, - "punct": 0.9604656803298569 - } - }, - "ml": { - "opus100": { - "u": 0.812856165944837, - "t": 0.8242939265183704, - "punct": 0.8743117069667226 - } - }, - "mn": { - "opus100": { - "u": 0.8096339638045993, - "t": null, - "punct": null - } - }, - "mr": { - "opus100": { - "u": 0.890926640926641, - "t": 0.8903836571998008, - "punct": 0.9373134328358209 - }, - "ud": { - "u": 0.9052631578947369, - "t": 0.9278350515463918, - "punct": 0.9787234042553191 - } - }, - "ms": { - "opus100": { - "u": 0.8773035887487876, - "t": 0.8847290640394089, - "punct": 0.9424533064109036 - } - }, - "mt": { - "opus100": { - "u": 0.6299797434166104, - "t": 0.8113067195636002, - "punct": 0.8907891522110922 - }, - "ud": { - "u": 0.9063097514340344, - "t": 0.8795180722891567, - "punct": 0.9443902439024391 - } - }, - "my": { - "opus100": { - "u": 0.7054915854738707, - "t": 0.7547921334329101, - "punct": 0.827622561939905 - } - }, - "ne": { - "opus100": { - "u": 0.7063770147161879, - "t": 0.7022514545914496, - "punct": 0.760419467598817 - } - }, - "nl": { - "opus100": { - "u": 0.9224642773483251, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9444444444444444, - "t": 0.9360655737704918, - "punct": 0.9721518987341773 - } - }, - "no": { - "opus100": { - "u": 0.947139753801593, - "t": 0.9480048367593711, - "punct": 0.9639617553321892 - }, - "ud": { - "u": 0.9823468328141226, - "t": 0.9850283944243676, - "punct": 0.9945862335653519 - } - }, - "pa": { - "opus100": { - "u": 0.56289592760181, - "t": 0.6331945889698232, - "punct": 0.786707882534776 - } - }, - "pl": { - "ersatz": { - "u": 0.9479424888448189, - "t": 0.9280910501810656, - "punct": 0.9795307039440839 - }, - "opus100": { - "u": 0.9239182120779839, - "t": 0.9282964388835419, - "punct": 0.9601564409679785 - }, - "ud": { - "u": 0.949255020513928, - "t": 0.9582331073693418, - "punct": 0.9938872537921667 - } - }, - "ps": { - "ersatz": { - "u": 0.8496520472568376, - "t": 0.915911111111111, - "punct": 0.960179472798654 - }, - "opus100": { - "u": 0.6421959782955634, - "t": 0.7160426345996175, - "punct": 0.7672883787661405 - } - }, - "pt": { - "opus100": { - "u": 0.9071877180739706, - "t": 0.9212410501193318, - "punct": 0.9582213535304177 - }, - "ud": { - "u": 0.960817717206133, - "t": 0.9574924860455132, - "punct": 0.9828620394173093 - } - }, - "ro": { - "ersatz": { - "u": 0.9778978388998036, - "t": 0.9706398996235884, - "punct": 0.9940209267563528 - }, - "opus100": { - "u": 0.8951465201465203, - "t": 0.8995854444956242, - "punct": 0.9692765113974231 - }, - "ud": { - "u": 0.8247745981967856, - "t": 0.939565627950897, - "punct": 0.9938006676204101 - } - }, - "ru": { - "ersatz": { - "u": 0.9763462506290891, - "t": 0.9773755656108597, - "punct": 0.9939271255060729 - }, - "opus100": { - "u": 0.8212290502793297, - "t": null, - "punct": null - }, - "ud": { - "u": 0.8607329842931937, - "t": 0.8773635872501351, - "punct": 0.9341657207718501 - } - }, - "si": { - "opus100": { - "u": 0.8039867109634552, - "t": 0.8084147257700975, - "punct": 0.8599556977602756 - } - }, - "sk": { - "opus100": { - "u": 0.9065615580802225, - "t": 0.9309178743961353, - "punct": 0.9621380846325167 - }, - "ud": { - "u": 0.9618104667609618, - "t": 0.9603024574669187, - "punct": 0.9805595068752964 - } - }, - "sl": { - "opus100": { - "u": 0.9203747072599532, - "t": 0.9325708839647368, - "punct": 0.9544573643410853 - }, - "ud": { - "u": 0.9603658536585367, - "t": 0.964769647696477, - "punct": 0.9914196567862714 - } - }, - "sq": { - "opus100": { - "u": 0.8967114404817045, - "t": 0.9078885214926784, - "punct": 0.9583126550868487 - }, - "ud": { - "u": 1.0, - "t": null, - "punct": null - } - }, - "sr": { - "opus100": { - "u": 0.9431166347992352, - "t": 0.9481018740989908, - "punct": 0.9669036528560921 - }, - "ud": { - "u": 0.9788461538461538, - "t": 0.9818529130850048, - "punct": 0.9980806142034548 - } - }, - "sv": { - "opus100": { - "u": 0.912630359212051, - "t": 0.9307307783860985, - "punct": 0.96142578125 - }, - "ud": { - "u": 0.9509116409537167, - "t": 0.9528301886792453, - "punct": 0.9685114503816793 - } - }, - "ta": { - "ersatz": { - "u": 0.945765937202664, - "t": 0.9504659146640511, - "punct": 0.9821782178217822 - }, - "opus100": { - "u": 0.6605014385532264, - "t": 0.6645817044566067, - "punct": 0.7512328767123289 - }, - "ud": { - "u": 0.97165991902834, - "t": 0.979253112033195, - "punct": 0.9876543209876543 - } - }, - "te": { - "opus100": { - "u": 0.7869339272457312, - "t": 0.7865853658536585, - "punct": 0.8442816016749543 - } - }, - "tg": { - "opus100": { - "u": 0.8148309705561614, - "t": 0.8378119001919386, - "punct": 0.9189189189189189 - } - }, - "th": { - "opus100": { - "u": 0.6884083658277207, - "t": 0.7151862464183382, - "punct": 0.7326396262476109 - }, - "ud": { - "u": 0.6954689146469969, - "t": null, - "punct": null - } - }, - "tr": { - "ersatz": { - "u": 0.9337844369260512, - "t": 0.9339265117811831, - "punct": 0.9842689186951481 - }, - "opus100": { - "u": 0.934010152284264, - "t": 0.9359223300970874, - "punct": 0.957037037037037 - }, - "ud": { - "u": 0.9425511197663096, - "t": 0.9433771486349848, - "punct": 0.9913749365804161 - } - }, - "uk": { - "opus100": { - "u": 0.8909426987060999, - "t": 0.8984283368519822, - "punct": 0.9472910026967393 - }, - "ud": { - "u": 0.9261521377012771, - "t": 0.9310151430173864, - "punct": 0.9837535014005602 - } - }, - "ur": { - "opus100": { - "u": 0.5378430828943721, - "t": 0.52972791400739, - "punct": 0.6809015421115066 - }, - "ud": { - "u": 0.9240174672489082, - "t": 0.9614678899082569, - "punct": 0.9943609022556391 - } - }, - "uz": { - "opus100": { - "u": 0.7815996684624946, - "t": 0.8042402826855124, - "punct": 0.8589928057553957 - } - }, - "vi": { - "opus100": { - "u": 0.9074912016088487, - "t": 0.9091831557584982, - "punct": 0.9492462311557789 - }, - "ud": { - "u": 0.6783425886373345, - "t": 0.9139318885448916, - "punct": 0.9794135995009358 - } - }, - "xh": { - "opus100": { - "u": 0.7868852459016392, - "t": 0.8194511314395764, - "punct": 0.9041450777202072 - } - }, - "yi": { - "opus100": { - "u": 0.7468634686346862, - "t": 0.7569965870307167, - "punct": 0.8179631114675221 - } - }, - "yo": { - "opus100": { - "u": 0.7693839677725978, - "t": null, - "punct": null - }, - "ud": { - "u": 0.8446866485013624, - "t": null, - "punct": null - } - }, - "zh": { - "ersatz": { - "u": 0.9354177344951307, - "t": 0.9372873116188625, - "punct": 0.9785963165754107 - }, - "opus100": { - "u": 0.8096644295302012, - "t": 0.7784926470588236, - "punct": 0.8917576961271102 - }, - "ud": { - "u": 0.9810568295114657, - "t": 0.982178217821782, - "punct": 0.9979959919839679 - } - }, - "zu": { - "opus100": { - "u": 0.7315010570824524, - "t": 0.8391281785158278, - "punct": 0.9082666666666666 - } - } -} \ No newline at end of file diff --git a/wtpsplit/evaluation/evaluation_results/wtp-canine-s-6l-no-adapters_intrinsic_results.json b/wtpsplit/evaluation/evaluation_results/wtp-canine-s-6l-no-adapters_intrinsic_results.json deleted file mode 100644 index 7e140efd..00000000 --- a/wtpsplit/evaluation/evaluation_results/wtp-canine-s-6l-no-adapters_intrinsic_results.json +++ /dev/null @@ -1,977 +0,0 @@ -{ - "af": { - "opus100": { - "u": 0.7786370227162882, - "t": 0.7836393214036719, - "punct": 0.8710174717368961 - }, - "ud": { - "u": 0.9770114942528736, - "t": 0.9859154929577465, - "punct": 0.9976525821596244 - } - }, - "am": { - "opus100": { - "u": 0.6030716723549487, - "t": 0.6284530386740331, - "punct": 0.6870748299319728 - } - }, - "ar": { - "ersatz": { - "u": 0.8728786423310919, - "t": 0.8858921161825726, - "punct": 0.9207886544448288 - }, - "opus100": { - "u": 0.6535723054259759, - "t": 0.6587383660806617, - "punct": 0.7393341553637485 - }, - "ud": { - "u": 0.788171394085697, - "t": 0.8537931034482757, - "punct": 0.8793718772305497 - } - }, - "az": { - "opus100": { - "u": 0.7528487229862476, - "t": 0.7784915536521533, - "punct": 0.8375780874579529 - } - }, - "be": { - "opus100": { - "u": 0.6675018792282635, - "t": 0.7104913678618858, - "punct": 0.8687022900763358 - }, - "ud": { - "u": 0.8901531728665207, - "t": 0.8875379939209727, - "punct": 0.911976911976912 - } - }, - "bg": { - "opus100": { - "u": 0.9387266634753471, - "t": 0.9224988257397839, - "punct": 0.9644520715861731 - }, - "ud": { - "u": 0.9807949977668603, - "t": 0.9794091316025068, - "punct": 0.9968623935454953 - } - }, - "bn": { - "opus100": { - "u": 0.7769017685915193, - "t": 0.8273328434974284, - "punct": 0.8669423497932377 - }, - "ud": { - "u": 1.0, - "t": null, - "punct": null - } - }, - "ca": { - "opus100": { - "u": 0.8899721448467967, - "t": 0.9025082820634169, - "punct": 0.943276283618582 - }, - "ud": { - "u": 0.982652788897785, - "t": 0.9844337090713902, - "punct": 0.9989171629669735 - } - }, - "ceb": { - "ud": { - "u": 0.9973474801061007, - "t": null, - "punct": null - } - }, - "cs": { - "ersatz": { - "u": 0.9498834498834497, - "t": 0.9451974071891573, - "punct": 0.9912739965095986 - }, - "opus100": { - "u": 0.8957564575645756, - "t": 0.9046397761716016, - "punct": 0.9482288828337875 - }, - "ud": { - "u": 0.9245200486026731, - "t": 0.9222600681226206, - "punct": 0.9580512871189288 - } - }, - "cy": { - "opus100": { - "u": 0.7051792828685259, - "t": 0.7417746759720838, - "punct": 0.8232378854625552 - }, - "ud": { - "u": 0.9900471451021478, - "t": 0.9915611814345991, - "punct": 0.9957939011566773 - } - }, - "da": { - "opus100": { - "u": 0.8968217411331184, - "t": 0.9138790035587188, - "punct": 0.9481699828052075 - }, - "ud": { - "u": 0.9541918755401901, - "t": 0.9473684210526315, - "punct": 0.9884238646482635 - } - }, - "de": { - "ersatz": { - "u": 0.9507874015748032, - "t": 0.9515640766902118, - "punct": 0.9900281257990283 - }, - "opus100": { - "u": 0.7884408022428293, - "t": 0.8570739937334297, - "punct": 0.8910191725529768 - }, - "ud": { - "u": 0.9616935483870966, - "t": 0.9634961439588688, - "punct": 0.9647924330005254 - } - }, - "el": { - "opus100": { - "u": 0.9182567726737337, - "t": 0.9239904988123515, - "punct": 0.9628901449987712 - }, - "ud": { - "u": 0.972617743702081, - "t": 0.975609756097561, - "punct": 0.9834254143646409 - } - }, - "en": { - "ersatz": { - "u": 0.9676599529830359, - "t": 0.9721187202878068, - "punct": 0.9872594903796152 - }, - "opus100": { - "u": 0.914804812177756, - "t": 0.9073941134242642, - "punct": 0.9479115479115479 - }, - "ud": { - "u": 0.9499323410013532, - "t": 0.9495495495495496, - "punct": 0.9695290858725761 - } - }, - "eo": { - "opus100": { - "u": 0.9114481409001957, - "t": 0.9066795273691823, - "punct": 0.9590786571918648 - } - }, - "es": { - "ersatz": { - "u": 0.9853990914990266, - "t": 0.9777998674618954, - "punct": 0.9937786509495744 - }, - "opus100": { - "u": 0.9042056074766355, - "t": 0.9174399238639068, - "punct": 0.950284442245857 - }, - "ud": { - "u": 0.9602256699576869, - "t": 0.9721739130434782, - "punct": 0.9956534337873081 - } - }, - "et": { - "ersatz": { - "u": 0.9552964042759962, - "t": 0.9547839120659505, - "punct": 0.985319731276437 - }, - "opus100": { - "u": 0.8363321049208757, - "t": 0.8831048105972577, - "punct": 0.9501726689689196 - }, - "ud": { - "u": 0.9225640248522503, - "t": 0.9314435372053613, - "punct": 0.9792479325947886 - } - }, - "eu": { - "opus100": { - "u": 0.866073465662788, - "t": 0.8768908540842449, - "punct": 0.921277114306605 - }, - "ud": { - "u": 0.9638424177010254, - "t": 0.9790518191841237, - "punct": 0.9994441356309061 - } - }, - "fa": { - "opus100": { - "u": 0.5752354697909488, - "t": 0.5800762631077215, - "punct": 0.7071099247755399 - }, - "ud": { - "u": 0.9622516556291391, - "t": 0.9816949152542372, - "punct": 0.9996564754379939 - } - }, - "fi": { - "ersatz": { - "u": 0.9744094488188976, - "t": 0.9772502472799209, - "punct": 0.9919959979989995 - }, - "opus100": { - "u": 0.9142322972657162, - "t": 0.9271809661139149, - "punct": 0.9601181683899557 - }, - "ud": { - "u": 0.9321351100092966, - "t": 0.943359375, - "punct": 0.9791063966570235 - } - }, - "fr": { - "ersatz": { - "u": 0.9693907875185735, - "t": 0.9642424242424243, - "punct": 0.987410071942446 - }, - "opus100": { - "u": 0.8702118071203245, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9614935822637106, - "t": 0.9704142011834319, - "punct": 0.9855769230769231 - } - }, - "fy": { - "opus100": { - "u": 0.6272973787285326, - "t": 0.6732007840940912, - "punct": 0.866252364225885 - } - }, - "ga": { - "opus100": { - "u": 0.7979528259902092, - "t": 0.7838716670279645, - "punct": 0.880648330058939 - }, - "ud": { - "u": 0.8538899430740039, - "t": 0.9141675284384695, - "punct": 0.988962472406181 - } - }, - "gd": { - "opus100": { - "u": 0.7943637164244827, - "t": 0.8178835110746513, - "punct": 0.9274826789838336 - }, - "ud": { - "u": 0.7027027027027026, - "t": 0.7263157894736844, - "punct": 0.7842003853564549 - } - }, - "gl": { - "opus100": { - "u": 0.8935771214252226, - "t": 0.8982007575757575, - "punct": 0.9400785854616897 - }, - "ud": { - "u": 0.9788819875776398, - "t": 0.9848101265822785, - "punct": 0.9800995024875623 - } - }, - "gu": { - "ersatz": { - "u": 0.8996793403573065, - "t": 0.8984526112185687, - "punct": 0.9612326043737575 - }, - "opus100": { - "u": 0.7112274024738345, - "t": 0.7096617181796057, - "punct": 0.769271383315734 - } - }, - "ha": { - "opus100": { - "u": 0.8238024899767885, - "t": 0.8666345690900337, - "punct": 0.9120192307692307 - } - }, - "he": { - "opus100": { - "u": 0.9086186540731995, - "t": 0.8969527797162129, - "punct": 0.9397650587353161 - }, - "ud": { - "u": 0.9394313967861558, - "t": 0.9419354838709678, - "punct": 0.9611398963730571 - } - }, - "hi": { - "ersatz": { - "u": 0.9320204506722211, - "t": 0.9443012884043608, - "punct": 0.96797583081571 - }, - "opus100": { - "u": 0.6525981393238031, - "t": 0.6191051995163241, - "punct": 0.7593123209169055 - }, - "ud": { - "u": 0.9516770892552585, - "t": 0.9564716056500433, - "punct": 0.9991095280498665 - } - }, - "hu": { - "opus100": { - "u": 0.9246753246753248, - "t": 0.93044306812768, - "punct": 0.9622918707149853 - }, - "ud": { - "u": 0.976897689768977, - "t": 0.9742441209406495, - "punct": 0.9933035714285714 - } - }, - "hy": { - "opus100": { - "u": 0.8578230392805185, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9585702680747361, - "t": 0.9734219269102992, - "punct": 0.9772535804549284 - } - }, - "id": { - "opus100": { - "u": 0.8901762008206614, - "t": 0.8997050147492626, - "punct": 0.9436689930209372 - }, - "ud": { - "u": 0.9783037475345167, - "t": null, - "punct": null - } - }, - "ig": { - "opus100": { - "u": 0.8290522586359611, - "t": 0.831427733802404, - "punct": 0.9031866464339908 - } - }, - "is": { - "opus100": { - "u": 0.9450389105058367, - "t": 0.9498409591387327, - "punct": 0.9699803149606299 - }, - "ud": { - "u": 0.8865148861646235, - "t": 0.915802038380314, - "punct": 0.967470340604669 - } - }, - "it": { - "opus100": { - "u": 0.8514373491332016, - "t": 0.8848540565387267, - "punct": 0.9415204678362573 - }, - "ud": { - "u": 0.9359223300970874, - "t": 0.962962962962963, - "punct": 0.9968976215098242 - } - }, - "ja": { - "ersatz": { - "u": 0.8135451505016721, - "t": 0.8141906873614191, - "punct": 0.9479606188466948 - }, - "opus100": { - "u": 0.37010391686650684, - "t": 0.792864222001982, - "punct": 0.8675311467073481 - }, - "ud": { - "u": 0.9445438282647584, - "t": 0.9588785046728971, - "punct": 0.9879294336118849 - } - }, - "jv": { - "ud": { - "u": 0.9652509652509652, - "t": null, - "punct": null - } - }, - "ka": { - "opus100": { - "u": 0.9129040217123119, - "t": 0.9052380952380953, - "punct": 0.9355077835433655 - } - }, - "kk": { - "ersatz": { - "u": 0.9708641975308643, - "t": 0.9644822411205604, - "punct": 0.9970089730807578 - }, - "opus100": { - "u": 0.7235794674718639, - "t": 0.7549805950840879, - "punct": 0.9023883696780892 - }, - "ud": { - "u": 0.9710763394973921, - "t": 0.9103024090210147, - "punct": 0.9791565681047019 - } - }, - "km": { - "ersatz": { - "u": 0.8393574297188755, - "t": 0.9045362220717671, - "punct": 0.9056768558951965 - }, - "opus100": { - "u": 0.6933520599250936, - "t": 0.695740882427952, - "punct": 0.7787810383747179 - } - }, - "kn": { - "opus100": { - "u": 0.6319910514541387, - "t": 0.6070588235294118, - "punct": 0.7400000000000001 - } - }, - "ko": { - "opus100": { - "u": 0.5524180460889321, - "t": 0.7012138188608777, - "punct": 0.7996223743214539 - }, - "ud": { - "u": 0.9934754240974336, - "t": 0.9965034965034965, - "punct": 0.9984712819392881 - } - }, - "ku": { - "opus100": { - "u": 0.7784217565760143, - "t": 0.5999316706525454, - "punct": 0.8060141509433963 - } - }, - "ky": { - "opus100": { - "u": 0.8228334812185742, - "t": 0.826945412311266, - "punct": 0.9062961868164351 - } - }, - "la": { - "ud": { - "u": 0.9087788131436979, - "t": 0.9181232750689972, - "punct": 0.9688466111771701 - } - }, - "lt": { - "ersatz": { - "u": 0.9669135802469135, - "t": 0.9619095987811072, - "punct": 0.9889224572004028 - }, - "opus100": { - "u": 0.79856267173959, - "t": 0.8512628624883068, - "punct": 0.9069593414816662 - }, - "ud": { - "u": 0.9664045746962115, - "t": 0.9736456808199121, - "punct": 0.9831006612784718 - } - }, - "lv": { - "ersatz": { - "u": 0.9669162038155035, - "t": 0.9698159509202454, - "punct": 0.9938195302843016 - }, - "opus100": { - "u": 0.8083170890188434, - "t": 0.8592347661785544, - "punct": 0.9073339940535182 - }, - "ud": { - "u": 0.9655172413793104, - "t": 0.9659747485555317, - "punct": 0.9901160292221745 - } - }, - "mg": { - "opus100": { - "u": 0.8712960868581768, - "t": 0.9043519846963175, - "punct": 0.9533730158730158 - } - }, - "mk": { - "opus100": { - "u": 0.9280524958987579, - "t": 0.9312941176470588, - "punct": 0.9582726831635129 - } - }, - "ml": { - "opus100": { - "u": 0.7975432211101, - "t": 0.8153187200806249, - "punct": 0.8537927733907634 - } - }, - "mn": { - "opus100": { - "u": 0.7327021121631464, - "t": null, - "punct": null - } - }, - "mr": { - "opus100": { - "u": 0.8914956011730205, - "t": 0.9030333167578319, - "punct": 0.9318692288495133 - }, - "ud": { - "u": 0.9387755102040817, - "t": 0.9387755102040817, - "punct": 0.9484536082474226 - } - }, - "ms": { - "opus100": { - "u": 0.8775857872961793, - "t": 0.883399209486166, - "punct": 0.9454910826425521 - } - }, - "mt": { - "opus100": { - "u": 0.6951871657754011, - "t": 0.8067346454825706, - "punct": 0.8829184126201626 - }, - "ud": { - "u": 0.9092627599243857, - "t": 0.9099616858237548, - "punct": 0.9545014520813164 - } - }, - "my": { - "opus100": { - "u": 0.6724910394265233, - "t": 0.7197404542051411, - "punct": 0.7995660428532682 - } - }, - "ne": { - "opus100": { - "u": 0.6877987421383648, - "t": 0.6866782436120069, - "punct": 0.7445217839649393 - } - }, - "nl": { - "opus100": { - "u": 0.9223915592028137, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9347646573080098, - "t": 0.9225806451612903, - "punct": 0.9608006672226855 - } - }, - "no": { - "opus100": { - "u": 0.9492017416545718, - "t": 0.9507401116233924, - "punct": 0.966887417218543 - }, - "ud": { - "u": 0.9852675109847505, - "t": 0.9853432759064027, - "punct": 0.9924967658473479 - } - }, - "pa": { - "opus100": { - "u": 0.6266999722453511, - "t": 0.6592909228061353, - "punct": 0.7526041666666666 - } - }, - "pl": { - "ersatz": { - "u": 0.9381494309747649, - "t": 0.9363867684478373, - "punct": 0.9880478087649402 - }, - "opus100": { - "u": 0.9173003802281369, - "t": 0.9325272067714631, - "punct": 0.960548885077187 - }, - "ud": { - "u": 0.9471876341777588, - "t": 0.964881474978051, - "punct": 0.9915889974994317 - } - }, - "ps": { - "ersatz": { - "u": 0.8682813536828136, - "t": 0.9250496658840527, - "punct": 0.9650946899368733 - }, - "opus100": { - "u": 0.5173716012084593, - "t": 0.6792351198491785, - "punct": 0.7463442069741283 - } - }, - "pt": { - "opus100": { - "u": 0.9101123595505617, - "t": 0.9204761904761904, - "punct": 0.9565853658536586 - }, - "ud": { - "u": 0.9612270984235194, - "t": 0.9597257926306769, - "punct": 0.9805783340526544 - } - }, - "ro": { - "ersatz": { - "u": 0.975442043222004, - "t": 0.9654650869674818, - "punct": 0.9945246391239422 - }, - "opus100": { - "u": 0.8973768982972848, - "t": 0.9089210649229331, - "punct": 0.9701640974639483 - }, - "ud": { - "u": 0.8458752515090543, - "t": 0.9359698681732581, - "punct": 0.9937888198757764 - } - }, - "ru": { - "ersatz": { - "u": 0.9778894472361809, - "t": 0.9776876267748479, - "punct": 0.994436014162873 - }, - "opus100": { - "u": 0.8295973884657235, - "t": null, - "punct": null - }, - "ud": { - "u": 0.8587926509186352, - "t": 0.8705625341343528, - "punct": 0.9217391304347826 - } - }, - "si": { - "opus100": { - "u": 0.7974927675988428, - "t": 0.8107160741111668, - "punct": 0.8524096385542168 - } - }, - "sk": { - "opus100": { - "u": 0.9129927688360159, - "t": 0.9388149939540508, - "punct": 0.964135542913678 - }, - "ud": { - "u": 0.9627534181989628, - "t": 0.9615567157095396, - "punct": 0.968779564806055 - } - }, - "sl": { - "opus100": { - "u": 0.9183243622747483, - "t": 0.9324743497971845, - "punct": 0.9530038759689922 - }, - "ud": { - "u": 0.963159893657425, - "t": 0.9698608964451314, - "punct": 0.9925810230378759 - } - }, - "sq": { - "opus100": { - "u": 0.9027291812456263, - "t": 0.912742051159455, - "punct": 0.9560493827160493 - }, - "ud": { - "u": 1.0, - "t": null, - "punct": null - } - }, - "sr": { - "opus100": { - "u": 0.9421329507412721, - "t": 0.944337811900192, - "punct": 0.9652131308182265 - }, - "ud": { - "u": 0.9808061420345489, - "t": 0.9808061420345489, - "punct": 0.9980769230769231 - } - }, - "sv": { - "opus100": { - "u": 0.9182242990654206, - "t": 0.9328233325364571, - "punct": 0.9581712062256807 - }, - "ud": { - "u": 0.951048951048951, - "t": 0.9553656220322887, - "punct": 0.9668428640076886 - } - }, - "ta": { - "ersatz": { - "u": 0.9378852536747273, - "t": 0.9530469530469531, - "punct": 0.9806835066864783 - }, - "opus100": { - "u": 0.6262711864406779, - "t": 0.6094859414938938, - "punct": 0.6996681415929203 - }, - "ud": { - "u": 0.97165991902834, - "t": 0.9663865546218487, - "punct": 1.0 - } - }, - "te": { - "opus100": { - "u": 0.7635299853729889, - "t": 0.7660026560424966, - "punct": 0.8259290304554344 - } - }, - "tg": { - "opus100": { - "u": 0.7911802853437095, - "t": 0.8259351620947631, - "punct": 0.9095330739299612 - } - }, - "th": { - "opus100": { - "u": 0.6843678565148584, - "t": 0.7079265911753221, - "punct": 0.7166500498504486 - }, - "ud": { - "u": 0.6747237569060774, - "t": null, - "punct": null - } - }, - "tr": { - "ersatz": { - "u": 0.9273974788575076, - "t": 0.9260237780713343, - "punct": 0.9823215476984656 - }, - "opus100": { - "u": 0.9319826338639653, - "t": 0.9343704423918328, - "punct": 0.9588162762022194 - }, - "ud": { - "u": 0.9375907111756169, - "t": 0.9455445544554455, - "punct": 0.9918616480162767 - } - }, - "uk": { - "opus100": { - "u": 0.8891471066697653, - "t": 0.8987911827447261, - "punct": 0.9438092921430309 - }, - "ud": { - "u": 0.9223140495867769, - "t": 0.9316770186335404, - "punct": 0.9865470852017937 - } - }, - "ur": { - "opus100": { - "u": 0.5076086956521739, - "t": 0.5075368560543316, - "punct": 0.6364053186611648 - }, - "ud": { - "u": 0.9192006950477846, - "t": 0.9739776951672863, - "punct": 0.9934518241347053 - } - }, - "uz": { - "opus100": { - "u": 0.7701554404145078, - "t": 0.8004662004662004, - "punct": 0.8523878437047757 - } - }, - "vi": { - "opus100": { - "u": 0.9121741331308528, - "t": 0.9139344262295082, - "punct": 0.9498243853487205 - }, - "ud": { - "u": 0.7291954022988506, - "t": 0.9226932668329176, - "punct": 0.976889444097439 - } - }, - "xh": { - "opus100": { - "u": 0.7934040047114251, - "t": 0.7941523225654327, - "punct": 0.895169578622816 - } - }, - "yi": { - "opus100": { - "u": 0.729144479943202, - "t": 0.7197164948453608, - "punct": 0.8077879038939519 - } - }, - "yo": { - "opus100": { - "u": 0.7610900761931473, - "t": null, - "punct": null - }, - "ud": { - "u": 0.8835227272727272, - "t": null, - "punct": null - } - }, - "zh": { - "ersatz": { - "u": 0.8872839824606655, - "t": 0.9009523809523808, - "punct": 0.9733499377334994 - }, - "opus100": { - "u": 0.6192236598890942, - "t": 0.7037366548042704, - "punct": 0.8682926829268292 - }, - "ud": { - "u": 0.9714285714285714, - "t": 0.9831181727904666, - "punct": 0.9979959919839679 - } - }, - "zu": { - "opus100": { - "u": 0.7851263128015896, - "t": 0.8391895357784047, - "punct": 0.8989981045220689 - } - } -} \ No newline at end of file diff --git a/wtpsplit/evaluation/evaluation_results/wtp-canine-s-6l_intrinsic_results.json b/wtpsplit/evaluation/evaluation_results/wtp-canine-s-6l_intrinsic_results.json deleted file mode 100644 index 29f95068..00000000 --- a/wtpsplit/evaluation/evaluation_results/wtp-canine-s-6l_intrinsic_results.json +++ /dev/null @@ -1,977 +0,0 @@ -{ - "af": { - "opus100": { - "u": 0.7210626185958254, - "t": 0.7779316712834718, - "punct": 0.8891186353062807 - }, - "ud": { - "u": 0.9883720930232558, - "t": 0.9976525821596244, - "punct": 1.0 - } - }, - "am": { - "opus100": { - "u": 0.621072730428745, - "t": 0.6529175050301811, - "punct": 0.718105423987777 - } - }, - "ar": { - "ersatz": { - "u": 0.881975625400898, - "t": 0.8865417376490631, - "punct": 0.9275163174166953 - }, - "opus100": { - "u": 0.6858124693176239, - "t": 0.6840269704719832, - "punct": 0.7828471702817253 - }, - "ud": { - "u": 0.8386277001270648, - "t": 0.8706365503080082, - "punct": 0.8956884561891516 - } - }, - "az": { - "opus100": { - "u": 0.7702647657841141, - "t": 0.765639589169001, - "punct": 0.8422843256379102 - } - }, - "be": { - "opus100": { - "u": 0.7182569496619083, - "t": 0.7464199162811191, - "punct": 0.8934588701684835 - }, - "ud": { - "u": 0.9002671415850401, - "t": 0.9029126213592232, - "punct": 0.9337094499294781 - } - }, - "bg": { - "opus100": { - "u": 0.937005988023952, - "t": 0.928284023668639, - "punct": 0.9643206256109481 - }, - "ud": { - "u": 0.9838854073410923, - "t": 0.9830053667262969, - "punct": 0.9964125560538117 - } - }, - "bn": { - "opus100": { - "u": 0.8027151302824612, - "t": 0.8357418111753373, - "punct": 0.8806584362139918 - }, - "ud": { - "u": 1.0, - "t": null, - "punct": null - } - }, - "ca": { - "opus100": { - "u": 0.8968588842006564, - "t": 0.9040835707502374, - "punct": 0.9473941766576951 - }, - "ud": { - "u": 0.9833512352309346, - "t": 0.9840927473712591, - "punct": 0.9989165763813651 - } - }, - "ceb": { - "ud": { - "u": 0.9973474801061007, - "t": null, - "punct": null - } - }, - "cs": { - "ersatz": { - "u": 0.9350493864112541, - "t": 0.9479227618490345, - "punct": 0.9927641099855282 - }, - "opus100": { - "u": 0.8922722029988466, - "t": 0.9080568720379146, - "punct": 0.9514419521814148 - }, - "ud": { - "u": 0.925398406374502, - "t": 0.9258873169520973, - "punct": 0.9648390941597139 - } - }, - "cy": { - "opus100": { - "u": 0.6793363499245852, - "t": 0.7520355292376018, - "punct": 0.8356239692138538 - }, - "ud": { - "u": 0.9926470588235294, - "t": 0.992639327024185, - "punct": 0.9968487394957984 - } - }, - "da": { - "opus100": { - "u": 0.9001614018907078, - "t": 0.9079439252336448, - "punct": 0.948743223262691 - }, - "ud": { - "u": 0.9574283231972198, - "t": 0.9499561018437226, - "punct": 0.9884444444444446 - } - }, - "de": { - "ersatz": { - "u": 0.9645995480793371, - "t": 0.9664463650228775, - "punct": 0.9931104873692269 - }, - "opus100": { - "u": 0.7999129109514478, - "t": 0.8564570331253001, - "punct": 0.903358113304281 - }, - "ud": { - "u": 0.9633401221995925, - "t": 0.9604113110539846, - "punct": 0.9717277486910993 - } - }, - "el": { - "opus100": { - "u": 0.9201037002121141, - "t": 0.9309438470728794, - "punct": 0.9663142365379886 - }, - "ud": { - "u": 0.9768467475192943, - "t": 0.9778761061946903, - "punct": 0.9789590254706534 - } - }, - "en": { - "ersatz": { - "u": 0.9694334313850876, - "t": 0.970098385955887, - "punct": 0.9881864208749838 - }, - "opus100": { - "u": 0.914117934915586, - "t": 0.8976525821596244, - "punct": 0.9519607843137254 - }, - "ud": { - "u": 0.9522944116310768, - "t": 0.9498175182481751, - "punct": 0.9729729729729729 - } - }, - "eo": { - "opus100": { - "u": 0.9206271435570799, - "t": 0.9131786653910547, - "punct": 0.9572271386430679 - } - }, - "es": { - "ersatz": { - "u": 0.9884157285038342, - "t": 0.9843827059016933, - "punct": 0.996079712512251 - }, - "opus100": { - "u": 0.9171793658305726, - "t": 0.9196386115073704, - "punct": 0.9557697059550284 - }, - "ud": { - "u": 0.9639102017618641, - "t": 0.9673504767408264, - "punct": 0.9985477781004937 - } - }, - "et": { - "ersatz": { - "u": 0.9644092931290164, - "t": 0.9589937106918239, - "punct": 0.995795201582983 - }, - "opus100": { - "u": 0.8490857016964087, - "t": 0.8985507246376813, - "punct": 0.9581786686463747 - }, - "ud": { - "u": 0.932077801790676, - "t": 0.9336426914153133, - "punct": 0.9774506733479487 - } - }, - "eu": { - "opus100": { - "u": 0.8619986403806935, - "t": 0.8808071328015016, - "punct": 0.928921568627451 - }, - "ud": { - "u": 0.9694635488308115, - "t": 0.970678581401843, - "punct": 0.9988876529477196 - } - }, - "fa": { - "opus100": { - "u": 0.6354142492457647, - "t": 0.627931544474963, - "punct": 0.7555238774055595 - }, - "ud": { - "u": 0.9689896632210737, - "t": 0.9777177582714383, - "punct": 1.0 - } - }, - "fi": { - "ersatz": { - "u": 0.9794794794794794, - "t": 0.9777999501122475, - "punct": 0.9964982491245622 - }, - "opus100": { - "u": 0.9206498704968212, - "t": 0.9295977011494252, - "punct": 0.963235294117647 - }, - "ud": { - "u": 0.9306683322923173, - "t": 0.9296046287367405, - "punct": 0.9829417444480206 - } - }, - "fr": { - "ersatz": { - "u": 0.9738406658739595, - "t": 0.9678007290400973, - "punct": 0.9878934624697335 - }, - "opus100": { - "u": 0.8758495695514271, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9694117647058823, - "t": 0.976303317535545, - "punct": 0.9806763285024154 - } - }, - "fy": { - "opus100": { - "u": 0.5050916496945009, - "t": 0.6705324196099104, - "punct": 0.8878779769868879 - } - }, - "ga": { - "opus100": { - "u": 0.8207295373665481, - "t": 0.8196357174589071, - "punct": 0.894175960346964 - }, - "ud": { - "u": 0.8673076923076922, - "t": 0.9116751269035532, - "punct": 0.9792802617230099 - } - }, - "gd": { - "opus100": { - "u": 0.8217122683142101, - "t": 0.8358700123406005, - "punct": 0.9342592592592592 - }, - "ud": { - "u": 0.7042553191489362, - "t": 0.7307001795332135, - "punct": 0.8171846435100549 - } - }, - "gl": { - "opus100": { - "u": 0.9007344231224828, - "t": 0.8994334277620395, - "punct": 0.9436172809372712 - }, - "ud": { - "u": 0.9912390488110137, - "t": 0.9899244332493703, - "punct": 0.9875311720698254 - } - }, - "gu": { - "ersatz": { - "u": 0.9122974261201144, - "t": 0.9051808406647116, - "punct": 0.9742574257425742 - }, - "opus100": { - "u": 0.7262464722483538, - "t": 0.7225362496927993, - "punct": 0.79768177028451 - } - }, - "ha": { - "opus100": { - "u": 0.8551906546175887, - "t": 0.8943405392275929, - "punct": 0.9214079074252652 - } - }, - "he": { - "opus100": { - "u": 0.9107355391573435, - "t": 0.9060229669557066, - "punct": 0.9460333250435214 - }, - "ud": { - "u": 0.9688667496886674, - "t": 0.9672544080604534, - "punct": 0.9820512820512821 - } - }, - "hi": { - "ersatz": { - "u": 0.9535063928709803, - "t": 0.9535297607277041, - "punct": 0.9728744939271255 - }, - "opus100": { - "u": 0.6779584462511293, - "t": 0.6621535857837952, - "punct": 0.7715426156374735 - }, - "ud": { - "u": 0.9746577337605592, - "t": 0.9752114319043453, - "punct": 0.9991095280498665 - } - }, - "hu": { - "opus100": { - "u": 0.9280983916745507, - "t": 0.9392557022809124, - "punct": 0.9657198824681684 - }, - "ud": { - "u": 0.9665924276169265, - "t": 0.9666666666666667, - "punct": 0.9899888765294772 - } - }, - "hy": { - "opus100": { - "u": 0.8656834342902291, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9720394736842105, - "t": 0.9792874896437448, - "punct": 0.979933110367893 - } - }, - "id": { - "opus100": { - "u": 0.9022298456260721, - "t": 0.9045622688039457, - "punct": 0.9450658712403679 - }, - "ud": { - "u": 0.9846306395637084, - "t": null, - "punct": null - } - }, - "ig": { - "opus100": { - "u": 0.7906542056074767, - "t": 0.8234260614934114, - "punct": 0.903632542779946 - } - }, - "is": { - "opus100": { - "u": 0.9469069654164637, - "t": 0.9501222493887531, - "punct": 0.9686735193343123 - }, - "ud": { - "u": 0.8853677967602017, - "t": 0.9058295964125561, - "punct": 0.9612462006079028 - } - }, - "it": { - "opus100": { - "u": 0.8687318921328281, - "t": 0.902484763244257, - "punct": 0.948443579766537 - }, - "ud": { - "u": 0.9488188976377953, - "t": 0.9423264907135874, - "punct": 0.9948293691830403 - } - }, - "ja": { - "ersatz": { - "u": 0.8438308886971527, - "t": 0.8403908794788274, - "punct": 0.9546971864568432 - }, - "opus100": { - "u": 0.5210810810810811, - "t": 0.8080614203454894, - "punct": 0.880484114977307 - }, - "ud": { - "u": 0.9574660633484162, - "t": 0.9713228492136909, - "punct": 0.9813432835820896 - } - }, - "jv": { - "ud": { - "u": 0.9649805447470818, - "t": null, - "punct": null - } - }, - "ka": { - "opus100": { - "u": 0.9188665536449504, - "t": 0.9219858156028369, - "punct": 0.9351418002466091 - } - }, - "kk": { - "ersatz": { - "u": 0.96113074204947, - "t": 0.9605263157894738, - "punct": 0.9944917376064095 - }, - "opus100": { - "u": 0.6826036866359446, - "t": 0.7316813216093792, - "punct": 0.9194337194337194 - }, - "ud": { - "u": 0.972195589645254, - "t": 0.9075025693730729, - "punct": 0.9658489658489658 - } - }, - "km": { - "ersatz": { - "u": 0.776150996648439, - "t": 0.9208988764044943, - "punct": 0.9180848649875197 - }, - "opus100": { - "u": 0.7192608386638237, - "t": 0.7065527065527065, - "punct": 0.7947247706422018 - } - }, - "kn": { - "opus100": { - "u": 0.7138397502601457, - "t": 0.6803887935963407, - "punct": 0.7865818392134182 - } - }, - "ko": { - "opus100": { - "u": 0.5354645354645355, - "t": 0.7252336448598131, - "punct": 0.8188865398167724 - }, - "ud": { - "u": 0.9947757945145842, - "t": 0.9952048823016565, - "punct": 0.9991254919108001 - } - }, - "ku": { - "opus100": { - "u": 0.7974769092137869, - "t": 0.6985616010006255, - "punct": 0.8739310344827586 - } - }, - "ky": { - "opus100": { - "u": 0.8321917808219179, - "t": 0.8282652765710975, - "punct": 0.9154553544942153 - } - }, - "la": { - "ud": { - "u": 0.8911300749935351, - "t": 0.9175377468060396, - "punct": 0.9741399762752077 - } - }, - "lt": { - "ersatz": { - "u": 0.9745889387144993, - "t": 0.9681978798586572, - "punct": 0.992 - }, - "opus100": { - "u": 0.7968425425841297, - "t": 0.8498519024834814, - "punct": 0.9177294323580895 - }, - "ud": { - "u": 0.9847936278059377, - "t": 0.9840579710144928, - "punct": 0.9955947136563876 - } - }, - "lv": { - "ersatz": { - "u": 0.9732750242954324, - "t": 0.9747239263803681, - "punct": 0.9967829745112595 - }, - "opus100": { - "u": 0.7997491638795986, - "t": 0.8744650499286734, - "punct": 0.9287493771798704 - }, - "ud": { - "u": 0.967269595176572, - "t": 0.969270166453265, - "punct": 0.990149892933619 - } - }, - "mg": { - "opus100": { - "u": 0.9129821260583254, - "t": 0.9251080172827653, - "punct": 0.9648115797354629 - } - }, - "mk": { - "opus100": { - "u": 0.9311969839773798, - "t": 0.9327036599763873, - "punct": 0.961594555177443 - } - }, - "ml": { - "opus100": { - "u": 0.810126582278481, - "t": 0.803129074315515, - "punct": 0.8790361445783132 - } - }, - "mn": { - "opus100": { - "u": 0.7744031642887413, - "t": null, - "punct": null - } - }, - "mr": { - "opus100": { - "u": 0.9082862060486846, - "t": 0.9118888061553737, - "punct": 0.9294949987801904 - }, - "ud": { - "u": 0.8958333333333333, - "t": 0.9199999999999999, - "punct": 0.989247311827957 - } - }, - "ms": { - "opus100": { - "u": 0.8809231524674687, - "t": 0.8851401637310841, - "punct": 0.9405028628329599 - } - }, - "mt": { - "opus100": { - "u": 0.6342530282637955, - "t": 0.8271515151515152, - "punct": 0.8968742308638937 - }, - "ud": { - "u": 0.9078822412155745, - "t": 0.8999028182701652, - "punct": 0.9631782945736435 - } - }, - "my": { - "opus100": { - "u": 0.7012817727568977, - "t": 0.7745490981963927, - "punct": 0.8344406058995483 - } - }, - "ne": { - "opus100": { - "u": 0.7179245283018867, - "t": 0.7065806451612903, - "punct": 0.7465865647187329 - } - }, - "nl": { - "opus100": { - "u": 0.926921263554927, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9398998330550918, - "t": 0.9358552631578948, - "punct": 0.9632721202003339 - } - }, - "no": { - "opus100": { - "u": 0.949636803874092, - "t": 0.9517107498180053, - "punct": 0.9671084928816888 - }, - "ud": { - "u": 0.9804432855280313, - "t": 0.9868590569440865, - "punct": 0.9938144329896907 - } - }, - "pa": { - "opus100": { - "u": 0.5963330327622482, - "t": 0.647374062165059, - "punct": 0.7737148913619502 - } - }, - "pl": { - "ersatz": { - "u": 0.9488977955911824, - "t": 0.93573264781491, - "punct": 0.9363722697056029 - }, - "opus100": { - "u": 0.9273422562141492, - "t": 0.9338822959554373, - "punct": 0.9614534740977168 - }, - "ud": { - "u": 0.9537860707311782, - "t": 0.9563890100305277, - "punct": 0.9941149841557265 - } - }, - "ps": { - "ersatz": { - "u": 0.8459807073954985, - "t": 0.9180560482440581, - "punct": 0.9702048417132217 - }, - "opus100": { - "u": 0.44329896907216493, - "t": 0.6832513740237199, - "punct": 0.7700688073394495 - } - }, - "pt": { - "opus100": { - "u": 0.9096864763687411, - "t": 0.9237753882915173, - "punct": 0.9587301587301588 - }, - "ud": { - "u": 0.9635974304068522, - "t": 0.9641025641025641, - "punct": 0.9793991416309014 - } - }, - "ro": { - "ersatz": { - "u": 0.9794909809735607, - "t": 0.968149646107179, - "punct": 0.995260663507109 - }, - "opus100": { - "u": 0.9015711645101664, - "t": 0.903899721448468, - "punct": 0.9715135001238544 - }, - "ud": { - "u": 0.8177225029148854, - "t": 0.9056776556776556, - "punct": 0.9947494033412888 - } - }, - "ru": { - "ersatz": { - "u": 0.9796954314720813, - "t": 0.9823499747856783, - "punct": 0.9954522486104093 - }, - "opus100": { - "u": 0.8330787693650448, - "t": null, - "punct": null - }, - "ud": { - "u": 0.8675703858185609, - "t": 0.8815013404825737, - "punct": 0.937923250564334 - } - }, - "si": { - "opus100": { - "u": 0.8081692195477752, - "t": 0.8110414052697615, - "punct": 0.869198312236287 - } - }, - "sk": { - "opus100": { - "u": 0.9168033747363487, - "t": 0.934108527131783, - "punct": 0.9602763385146805 - }, - "ud": { - "u": 0.9632422243166824, - "t": 0.959047619047619, - "punct": 0.9858088930936614 - } - }, - "sl": { - "opus100": { - "u": 0.9257250648431974, - "t": 0.9340659340659342, - "punct": 0.957070094591317 - }, - "ud": { - "u": 0.9633911368015414, - "t": 0.9644238205723125, - "punct": 0.9953161592505854 - } - }, - "sq": { - "opus100": { - "u": 0.9033317691224777, - "t": 0.9175332527206771, - "punct": 0.9605717102020701 - }, - "ud": { - "u": 1.0, - "t": null, - "punct": null - } - }, - "sr": { - "opus100": { - "u": 0.9465356029729082, - "t": 0.949699157641396, - "punct": 0.9654665686994857 - }, - "ud": { - "u": 0.9854227405247813, - "t": 0.9913710450623201, - "punct": 1.0 - } - }, - "sv": { - "opus100": { - "u": 0.9204119850187266, - "t": 0.9335246293639407, - "punct": 0.9611911154503294 - }, - "ud": { - "u": 0.956766917293233, - "t": 0.959280303030303, - "punct": 0.962607861936721 - } - }, - "ta": { - "ersatz": { - "u": 0.9489747257987601, - "t": 0.9530400395452298, - "punct": 0.983184965380811 - }, - "opus100": { - "u": 0.6819571865443425, - "t": 0.683289124668435, - "punct": 0.75 - }, - "ud": { - "u": 0.975609756097561, - "t": 0.975609756097561, - "punct": 0.995850622406639 - } - }, - "te": { - "opus100": { - "u": 0.7861570247933883, - "t": 0.7850753897265526, - "punct": 0.8511309836927932 - } - }, - "tg": { - "opus100": { - "u": 0.8174482006543076, - "t": 0.8390488110137672, - "punct": 0.9120715350223547 - } - }, - "th": { - "opus100": { - "u": 0.6923625981441828, - "t": 0.7218372599485251, - "punct": 0.7325776658270361 - }, - "ud": { - "u": 0.709585121602289, - "t": null, - "punct": null - } - }, - "tr": { - "ersatz": { - "u": 0.9451899757477769, - "t": 0.9429831726842018, - "punct": 0.9852478037460634 - }, - "opus100": { - "u": 0.9361185329123148, - "t": 0.936542669584245, - "punct": 0.9576771653543307 - }, - "ud": { - "u": 0.9496332518337408, - "t": 0.955955955955956, - "punct": 0.990872210953347 - } - }, - "uk": { - "opus100": { - "u": 0.8953623863901187, - "t": 0.8973277074542898, - "punct": 0.9461426491994178 - }, - "ud": { - "u": 0.9284916201117319, - "t": 0.9313835770528683, - "punct": 0.9849414389291691 - } - }, - "ur": { - "opus100": { - "u": 0.5455534403045961, - "t": 0.5305591677503252, - "punct": 0.695066185318893 - }, - "ud": { - "u": 0.948787061994609, - "t": 0.9704251386321627, - "punct": 0.9915492957746478 - } - }, - "uz": { - "opus100": { - "u": 0.7950257289879931, - "t": 0.8070257611241218, - "punct": 0.86562123039807 - } - }, - "vi": { - "opus100": { - "u": 0.9101947887680243, - "t": 0.9131882202304737, - "punct": 0.9523096129837703 - }, - "ud": { - "u": 0.760536398467433, - "t": 0.9278728606356969, - "punct": 0.9780839073262366 - } - }, - "xh": { - "opus100": { - "u": 0.8107983387171205, - "t": 0.820849964780465, - "punct": 0.8993808049535604 - } - }, - "yi": { - "opus100": { - "u": 0.6078173034694774, - "t": 0.6465408805031446, - "punct": 0.8139145012573344 - } - }, - "yo": { - "opus100": { - "u": 0.7700611730838431, - "t": null, - "punct": null - }, - "ud": { - "u": 0.8634482758620688, - "t": null, - "punct": null - } - }, - "zh": { - "ersatz": { - "u": 0.9224426694150992, - "t": 0.9333661175313499, - "punct": 0.9735998026153466 - }, - "opus100": { - "u": 0.8208677136012776, - "t": 0.7907300115874856, - "punct": 0.8939929328621908 - }, - "ud": { - "u": 0.9790628115653042, - "t": 0.9811320754716981, - "punct": 0.997002997002997 - } - }, - "zu": { - "opus100": { - "u": 0.5591246028944582, - "t": 0.8276939094221759, - "punct": 0.9085481682496608 - } - } -} \ No newline at end of file diff --git a/wtpsplit/evaluation/evaluation_results/wtp-canine-s-9l-no-adapters_intrinsic_results.json b/wtpsplit/evaluation/evaluation_results/wtp-canine-s-9l-no-adapters_intrinsic_results.json deleted file mode 100644 index f0e559de..00000000 --- a/wtpsplit/evaluation/evaluation_results/wtp-canine-s-9l-no-adapters_intrinsic_results.json +++ /dev/null @@ -1,977 +0,0 @@ -{ - "af": { - "opus100": { - "u": 0.7706283118849356, - "t": 0.7890083632019116, - "punct": 0.8745882949075247 - }, - "ud": { - "u": 0.9803921568627451, - "t": 0.982373678025852, - "punct": 0.9988249118683901 - } - }, - "am": { - "opus100": { - "u": 0.5865531914893617, - "t": 0.6183574879227053, - "punct": 0.6907039489820946 - } - }, - "ar": { - "ersatz": { - "u": 0.8861736334405145, - "t": 0.8854274835583247, - "punct": 0.9151740452855696 - }, - "opus100": { - "u": 0.6842105263157894, - "t": 0.6847389558232931, - "punct": 0.7635752042287363 - }, - "ud": { - "u": 0.8189172370877412, - "t": 0.8728755948334466, - "punct": 0.8868841082581541 - } - }, - "az": { - "opus100": { - "u": 0.7589479928811549, - "t": 0.7741935483870966, - "punct": 0.840356368889959 - } - }, - "be": { - "opus100": { - "u": 0.6703853955375253, - "t": 0.7112311501237902, - "punct": 0.8834012219959266 - }, - "ud": { - "u": 0.89937106918239, - "t": 0.8890845070422535, - "punct": 0.9144486692015209 - } - }, - "bg": { - "opus100": { - "u": 0.9320665083135392, - "t": 0.9259346343757348, - "punct": 0.9627815866797258 - }, - "ud": { - "u": 0.9815895823978447, - "t": 0.9808463251670377, - "punct": 0.9968623935454953 - } - }, - "bn": { - "opus100": { - "u": 0.7977381470204437, - "t": 0.8337705699976151, - "punct": 0.8670187229956793 - }, - "ud": { - "u": 0.9911504424778761, - "t": null, - "punct": null - } - }, - "ca": { - "opus100": { - "u": 0.8925464378086057, - "t": 0.9014684983420179, - "punct": 0.9463892288861688 - }, - "ud": { - "u": 0.9828693790149893, - "t": 0.9837662337662338, - "punct": 0.9981024667931689 - } - }, - "ceb": { - "ud": { - "u": 0.9973474801061007, - "t": null, - "punct": null - } - }, - "cs": { - "ersatz": { - "u": 0.9463126843657816, - "t": 0.9467490438364224, - "punct": 0.9915917657291968 - }, - "opus100": { - "u": 0.8932532347504621, - "t": 0.9030743956817647, - "punct": 0.9489846458642892 - }, - "ud": { - "u": 0.9266873736626732, - "t": 0.9245547328397039, - "punct": 0.9599645442458266 - } - }, - "cy": { - "opus100": { - "u": 0.631680618158403, - "t": 0.7404778802933523, - "punct": 0.8313120176405733 - }, - "ud": { - "u": 0.9916142557651991, - "t": 0.9915700737618546, - "punct": 0.9958027282266527 - } - }, - "da": { - "opus100": { - "u": 0.8974418068679418, - "t": 0.9091762497066418, - "punct": 0.950354609929078 - }, - "ud": { - "u": 0.9522983521248916, - "t": 0.9454225352112676, - "punct": 0.987544483985765 - } - }, - "de": { - "ersatz": { - "u": 0.9575155279503105, - "t": 0.9596139192278385, - "punct": 0.9943991853360489 - }, - "opus100": { - "u": 0.7963446475195821, - "t": 0.8495617152333571, - "punct": 0.898801937292888 - }, - "ud": { - "u": 0.9598781107160996, - "t": 0.9665121071612571, - "punct": 0.9712192569335426 - } - }, - "el": { - "opus100": { - "u": 0.919415645617342, - "t": 0.9278055754110078, - "punct": 0.9620876415558838 - }, - "ud": { - "u": 0.9704918032786886, - "t": 0.9778270509977827, - "punct": 0.9725576289791438 - } - }, - "en": { - "ersatz": { - "u": 0.9687738975274024, - "t": 0.9715533294805112, - "punct": 0.9888635623575383 - }, - "opus100": { - "u": 0.9110893991206643, - "t": 0.901375059269796, - "punct": 0.9486107696090484 - }, - "ud": { - "u": 0.9497054825555052, - "t": 0.9508344609833107, - "punct": 0.9696969696969697 - } - }, - "eo": { - "opus100": { - "u": 0.9145236345824148, - "t": 0.9154204461877911, - "punct": 0.9575876440303995 - } - }, - "es": { - "ersatz": { - "u": 0.9874654077812144, - "t": 0.9825197889182058, - "punct": 0.9962473486702562 - }, - "opus100": { - "u": 0.9175746091899574, - "t": 0.9228932919551205, - "punct": 0.9535802469135802 - }, - "ud": { - "u": 0.9704209328782708, - "t": 0.9754548079699682, - "punct": 0.9959420289855072 - } - }, - "et": { - "ersatz": { - "u": 0.9636005902606986, - "t": 0.9537060460409815, - "punct": 0.988328780730072 - }, - "opus100": { - "u": 0.831359035106612, - "t": 0.8805412972468502, - "punct": 0.9503090234857849 - }, - "ud": { - "u": 0.9271665642286416, - "t": 0.92868169188021, - "punct": 0.9758922980588604 - } - }, - "eu": { - "opus100": { - "u": 0.8784313725490196, - "t": 0.8823802882380288, - "punct": 0.9276574803149606 - }, - "ud": { - "u": 0.9769230769230769, - "t": 0.979490022172949, - "punct": 0.9994441356309061 - } - }, - "fa": { - "opus100": { - "u": 0.5960709151892669, - "t": 0.5953510436432637, - "punct": 0.7200962695547533 - }, - "ud": { - "u": 0.9742216270505525, - "t": 0.9847094801223242, - "punct": 0.9993131868131867 - } - }, - "fi": { - "ersatz": { - "u": 0.9765142150803462, - "t": 0.9764326469858596, - "punct": 0.9950024987506246 - }, - "opus100": { - "u": 0.9225335530963034, - "t": 0.9340052795776337, - "punct": 0.9622178606476939 - }, - "ud": { - "u": 0.9496994621955076, - "t": 0.9490094186424164, - "punct": 0.981958762886598 - } - }, - "fr": { - "ersatz": { - "u": 0.9734882335418529, - "t": 0.9696417729204615, - "punct": 0.9877428998505232 - }, - "opus100": { - "u": 0.8817498291182502, - "t": null, - "punct": null - }, - "ud": { - "u": 0.971764705882353, - "t": 0.9726516052318668, - "punct": 0.9878934624697335 - } - }, - "fy": { - "opus100": { - "u": 0.571336966976595, - "t": 0.6380510440835266, - "punct": 0.8887134964483031 - } - }, - "ga": { - "opus100": { - "u": 0.7986813186813186, - "t": 0.7625750362243842, - "punct": 0.8808139534883721 - }, - "ud": { - "u": 0.8645533141210375, - "t": 0.9109730848861284, - "punct": 0.9867549668874173 - } - }, - "gd": { - "opus100": { - "u": 0.7777777777777778, - "t": 0.8303608461219412, - "punct": 0.9330254041570438 - }, - "ud": { - "u": 0.7145969498910676, - "t": 0.7523892267593397, - "punct": 0.8025247971145175 - } - }, - "gl": { - "opus100": { - "u": 0.9037914691943127, - "t": 0.9062276306370795, - "punct": 0.9441176470588236 - }, - "ud": { - "u": 0.9875, - "t": 0.9899749373433584, - "punct": 0.9851116625310173 - } - }, - "gu": { - "ersatz": { - "u": 0.9165103189493433, - "t": 0.9069767441860466, - "punct": 0.9705014749262537 - }, - "opus100": { - "u": 0.702166897187644, - "t": 0.7032598274209013, - "punct": 0.7702349869451697 - } - }, - "ha": { - "opus100": { - "u": 0.8282700421940928, - "t": 0.8853166986564299, - "punct": 0.9162038155035016 - } - }, - "he": { - "opus100": { - "u": 0.9102564102564102, - "t": 0.9050647820965843, - "punct": 0.9406800694961529 - }, - "ud": { - "u": 0.9627791563275434, - "t": 0.9579617834394903, - "punct": 0.9731800766283524 - } - }, - "hi": { - "ersatz": { - "u": 0.945510360706063, - "t": 0.9423307969563476, - "punct": 0.9722110350382601 - }, - "opus100": { - "u": 0.6538637402834934, - "t": 0.6493955094991365, - "punct": 0.7571084337349397 - }, - "ud": { - "u": 0.9758369723435225, - "t": 0.9756526840715753, - "punct": 0.9991095280498665 - } - }, - "hu": { - "opus100": { - "u": 0.9207688701359589, - "t": 0.9309852451213707, - "punct": 0.9644520715861731 - }, - "ud": { - "u": 0.9713656387665198, - "t": 0.9688888888888889, - "punct": 0.9944506104328524 - } - }, - "hy": { - "opus100": { - "u": 0.8686219365206911, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9711934156378601, - "t": 0.9735099337748345, - "punct": 0.9805579036348266 - } - }, - "id": { - "opus100": { - "u": 0.8941517641372643, - "t": 0.9031941031941032, - "punct": 0.9425056490082852 - }, - "ud": { - "u": 0.9797730636408486, - "t": null, - "punct": null - } - }, - "ig": { - "opus100": { - "u": 0.8289199642963405, - "t": 0.8382568002339866, - "punct": 0.9083181542197936 - } - }, - "is": { - "opus100": { - "u": 0.9435151515151515, - "t": 0.9480614484272128, - "punct": 0.9693251533742331 - }, - "ud": { - "u": 0.8807307525010873, - "t": 0.9015301530153016, - "punct": 0.9651826484018265 - } - }, - "it": { - "opus100": { - "u": 0.8690396239086636, - "t": 0.8913597405605745, - "punct": 0.9383313180169286 - }, - "ud": { - "u": 0.9611166500498505, - "t": 0.9630369630369631, - "punct": 0.9958592132505175 - } - }, - "ja": { - "ersatz": { - "u": 0.8166519043401239, - "t": 0.814404432132964, - "punct": 0.9504761904761905 - }, - "opus100": { - "u": 0.3958990536277603, - "t": 0.7920887602508443, - "punct": 0.85578367551327 - }, - "ud": { - "u": 0.9661482159194876, - "t": 0.9685185185185186, - "punct": 0.9841269841269841 - } - }, - "jv": { - "ud": { - "u": 0.9615384615384615, - "t": null, - "punct": null - } - }, - "ka": { - "opus100": { - "u": 0.9111166130230255, - "t": 0.9110734189483887, - "punct": 0.93553963941714 - } - }, - "kk": { - "ersatz": { - "u": 0.9720837487537387, - "t": 0.9662468513853905, - "punct": 0.999001996007984 - }, - "opus100": { - "u": 0.70223752151463, - "t": 0.7317744154057771, - "punct": 0.9178117708063261 - }, - "ud": { - "u": 0.9790874524714829, - "t": 0.9010875194199897, - "punct": 0.9725759059745347 - } - }, - "km": { - "ersatz": { - "u": 0.8385155466399198, - "t": 0.907604994324631, - "punct": 0.9140170174652933 - }, - "opus100": { - "u": 0.6823529411764706, - "t": 0.689974293059126, - "punct": 0.7785007072135784 - } - }, - "kn": { - "opus100": { - "u": 0.6185804962492787, - "t": 0.6036406341749854, - "punct": 0.7552941176470587 - } - }, - "ko": { - "opus100": { - "u": 0.5300564596479574, - "t": 0.676056338028169, - "punct": 0.8004807692307694 - }, - "ud": { - "u": 0.9934782608695651, - "t": 0.9938944614042738, - "punct": 0.9995627459554001 - } - }, - "ku": { - "opus100": { - "u": 0.7825696316262354, - "t": 0.6381450032658393, - "punct": 0.753441802252816 - } - }, - "ky": { - "opus100": { - "u": 0.8018154311649017, - "t": 0.8296967979597619, - "punct": 0.9150249926492208 - } - }, - "la": { - "ud": { - "u": 0.8448862150143867, - "t": 0.8992822412595509, - "punct": 0.9638895444890252 - } - }, - "lt": { - "ersatz": { - "u": 0.9690927218344965, - "t": 0.966144517433047, - "punct": 0.9939698492462311 - }, - "opus100": { - "u": 0.8024457094665823, - "t": 0.8585343228200372, - "punct": 0.9071253071253071 - }, - "ud": { - "u": 0.9754689754689754, - "t": 0.9730909090909091, - "punct": 0.9904481998530493 - } - }, - "lv": { - "ersatz": { - "u": 0.9699029126213592, - "t": 0.9733727810650888, - "punct": 0.9970297029702969 - }, - "opus100": { - "u": 0.8087012156110044, - "t": 0.8719789423307012, - "punct": 0.9230007427581084 - }, - "ud": { - "u": 0.9622479622479622, - "t": 0.9625212947189097, - "punct": 0.9903412749517064 - } - }, - "mg": { - "opus100": { - "u": 0.8779491833030852, - "t": 0.9071770334928229, - "punct": 0.9548834903321765 - } - }, - "mk": { - "opus100": { - "u": 0.9245855708615456, - "t": 0.930562116202173, - "punct": 0.9582726831635129 - } - }, - "ml": { - "opus100": { - "u": 0.8089579524680073, - "t": 0.8171443063657113, - "punct": 0.8585690515806987 - } - }, - "mn": { - "opus100": { - "u": 0.49153140437544107, - "t": null, - "punct": null - } - }, - "mr": { - "opus100": { - "u": 0.9043736100815418, - "t": 0.9066400399400899, - "punct": 0.9320099255583127 - }, - "ud": { - "u": 0.9591836734693878, - "t": 0.9591836734693878, - "punct": 0.9278350515463918 - } - }, - "ms": { - "opus100": { - "u": 0.8829578844270323, - "t": 0.8845208845208845, - "punct": 0.9398852581691195 - } - }, - "mt": { - "opus100": { - "u": 0.6946308724832216, - "t": 0.8259642521166509, - "punct": 0.8906326630701324 - }, - "ud": { - "u": 0.9102927289896129, - "t": 0.8956692913385829, - "punct": 0.950533462657614 - } - }, - "my": { - "opus100": { - "u": 0.6822682268226822, - "t": 0.7147192716236721, - "punct": 0.7964649169791109 - } - }, - "ne": { - "opus100": { - "u": 0.6960061053167133, - "t": 0.6967840735068913, - "punct": 0.7452552793370757 - } - }, - "nl": { - "opus100": { - "u": 0.9234022556390977, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9324437030859049, - "t": 0.9283387622149837, - "punct": 0.9673093042749371 - } - }, - "no": { - "opus100": { - "u": 0.9505574406204557, - "t": 0.9526354141365072, - "punct": 0.967156862745098 - }, - "ud": { - "u": 0.9831124967524032, - "t": 0.986060918946825, - "punct": 0.9938112429087158 - } - }, - "pa": { - "opus100": { - "u": 0.5778849050918952, - "t": 0.6543370669445168, - "punct": 0.7706855791962176 - } - }, - "pl": { - "ersatz": { - "u": 0.9465267366316842, - "t": 0.9393166751657318, - "punct": 0.9474689589302769 - }, - "opus100": { - "u": 0.921353670162059, - "t": 0.926923076923077, - "punct": 0.9608470819995075 - }, - "ud": { - "u": 0.9471649484536082, - "t": 0.9582696089141359, - "punct": 0.99389278443791 - } - }, - "ps": { - "ersatz": { - "u": 0.8680991735537189, - "t": 0.9250589729631645, - "punct": 0.9568168960965491 - }, - "opus100": { - "u": 0.6168853010427179, - "t": 0.6971046770601337, - "punct": 0.7366809552969994 - } - }, - "pt": { - "opus100": { - "u": 0.9085807809212064, - "t": 0.9217183770883055, - "punct": 0.9560975609756096 - }, - "ud": { - "u": 0.9589041095890412, - "t": 0.9597257926306769, - "punct": 0.9866551872578562 - } - }, - "ro": { - "ersatz": { - "u": 0.976343026121242, - "t": 0.9675471698113207, - "punct": 0.9955089820359282 - }, - "opus100": { - "u": 0.9041222788327928, - "t": 0.9119437939110069, - "punct": 0.9699378881987578 - }, - "ud": { - "u": 0.8701406120760959, - "t": 0.92, - "punct": 0.9947494033412888 - } - }, - "ru": { - "ersatz": { - "u": 0.9788519637462236, - "t": 0.9770759042282221, - "punct": 0.992936427850656 - }, - "opus100": { - "u": 0.834896401308615, - "t": null, - "punct": null - }, - "ud": { - "u": 0.863849765258216, - "t": 0.8775176918889493, - "punct": 0.933870040253019 - } - }, - "si": { - "opus100": { - "u": 0.8039075530140576, - "t": 0.8087215064420218, - "punct": 0.8630517023959646 - } - }, - "sk": { - "opus100": { - "u": 0.9132867132867134, - "t": 0.9353980159690297, - "punct": 0.9608470819995075 - }, - "ud": { - "u": 0.9646393210749646, - "t": 0.9634898055950688, - "punct": 0.9886363636363636 - } - }, - "sl": { - "opus100": { - "u": 0.9191588785046729, - "t": 0.9323809523809523, - "punct": 0.9529326574945691 - }, - "ud": { - "u": 0.9659395331037123, - "t": 0.9675675675675676, - "punct": 0.9925694172858818 - } - }, - "sq": { - "opus100": { - "u": 0.9055542535739396, - "t": 0.9129916567342073, - "punct": 0.9604743083003953 - }, - "ud": { - "u": 1.0, - "t": null, - "punct": null - } - }, - "sr": { - "opus100": { - "u": 0.9460043196544277, - "t": 0.9463813416686703, - "punct": 0.9641025641025641 - }, - "ud": { - "u": 0.9816779170684666, - "t": 0.9846449136276392, - "punct": 0.9990375360923965 - } - }, - "sv": { - "opus100": { - "u": 0.9182624941616068, - "t": 0.9336515513126492, - "punct": 0.9575963169372427 - }, - "ud": { - "u": 0.9532273152478952, - "t": 0.9534992954438704, - "punct": 0.9717568214456678 - } - }, - "ta": { - "ersatz": { - "u": 0.9442591710338257, - "t": 0.9541561712846347, - "punct": 0.9811881188118811 - }, - "opus100": { - "u": 0.6329223447977504, - "t": 0.6214819069500288, - "punct": 0.7178631051752922 - }, - "ud": { - "u": 0.9836065573770492, - "t": 0.975, - "punct": 1.0 - } - }, - "te": { - "opus100": { - "u": 0.778990104034509, - "t": 0.7778355879292405, - "punct": 0.8427267847557702 - } - }, - "tg": { - "opus100": { - "u": 0.8075240594925635, - "t": 0.8382676780034255, - "punct": 0.915362035225049 - } - }, - "th": { - "opus100": { - "u": 0.6914950760966876, - "t": 0.7093405530845098, - "punct": 0.729114971050455 - }, - "ud": { - "u": 0.6843946815955213, - "t": null, - "punct": null - } - }, - "tr": { - "ersatz": { - "u": 0.9383794274623968, - "t": 0.9359605911330049, - "punct": 0.9843905679176355 - }, - "opus100": { - "u": 0.9317907929621596, - "t": 0.9343385214007782, - "punct": 0.9564575645756458 - }, - "ud": { - "u": 0.9411764705882353, - "t": 0.9498031496062992, - "punct": 0.9923896499238964 - } - }, - "uk": { - "opus100": { - "u": 0.8894565722248027, - "t": 0.8961466165413533, - "punct": 0.9445794846864365 - }, - "ud": { - "u": 0.9175824175824177, - "t": 0.9276500280426249, - "punct": 0.9826135726303982 - } - }, - "ur": { - "opus100": { - "u": 0.5175487465181058, - "t": 0.5174244938599403, - "punct": 0.64049955396967 - }, - "ud": { - "u": 0.9472743521000894, - "t": 0.9768303985171455, - "punct": 0.9943714821763602 - } - }, - "uz": { - "opus100": { - "u": 0.7887205387205387, - "t": 0.803689687795648, - "punct": 0.855132249454016 - } - }, - "vi": { - "opus100": { - "u": 0.9129994941831058, - "t": 0.9157678479712377, - "punct": 0.951329653788259 - }, - "ud": { - "u": 0.7474081055607916, - "t": 0.9346733668341709, - "punct": 0.9769757311761046 - } - }, - "xh": { - "opus100": { - "u": 0.7912243453644727, - "t": 0.7911466917824348, - "punct": 0.9008455034588778 - } - }, - "yi": { - "opus100": { - "u": 0.7369872457773182, - "t": 0.722509899482181, - "punct": 0.8082644628099174 - } - }, - "yo": { - "opus100": { - "u": 0.76493060592872, - "t": null, - "punct": null - }, - "ud": { - "u": 0.8575342465753424, - "t": null, - "punct": null - } - }, - "zh": { - "ersatz": { - "u": 0.8674121405750799, - "t": 0.9067713444553485, - "punct": 0.9694258016405666 - }, - "opus100": { - "u": 0.6529873264936632, - "t": 0.7129197242606181, - "punct": 0.8749367088607596 - }, - "ud": { - "u": 0.9755102040816326, - "t": 0.989010989010989, - "punct": 0.996 - } - }, - "zu": { - "opus100": { - "u": 0.7922840369024322, - "t": 0.8506444275966641, - "punct": 0.8975054229934925 - } - } -} \ No newline at end of file diff --git a/wtpsplit/evaluation/evaluation_results/wtp-canine-s-9l_intrinsic_results.json b/wtpsplit/evaluation/evaluation_results/wtp-canine-s-9l_intrinsic_results.json deleted file mode 100644 index ab9c6bd9..00000000 --- a/wtpsplit/evaluation/evaluation_results/wtp-canine-s-9l_intrinsic_results.json +++ /dev/null @@ -1,977 +0,0 @@ -{ - "af": { - "opus100": { - "u": 0.7588595790034638, - "t": 0.8030054003287156, - "punct": 0.8931548370800706 - }, - "ud": { - "u": 0.9941520467836257, - "t": 0.9917550058892816, - "punct": 0.9976470588235294 - } - }, - "am": { - "opus100": { - "u": 0.6770488612030368, - "t": 0.6951663832888025, - "punct": 0.7405775819872736 - } - }, - "ar": { - "ersatz": { - "u": 0.8910505836575875, - "t": 0.9011432414256894, - "punct": 0.9258876249569113 - }, - "opus100": { - "u": 0.7012609117361784, - "t": 0.6966452533904355, - "punct": 0.8080357142857142 - }, - "ud": { - "u": 0.851255634256278, - "t": 0.87292817679558, - "punct": 0.8969359331476323 - } - }, - "az": { - "opus100": { - "u": 0.7819611088125776, - "t": 0.7668368587405786, - "punct": 0.8520884520884521 - } - }, - "be": { - "opus100": { - "u": 0.7593913456966239, - "t": 0.7644485378770435, - "punct": 0.895484525621512 - }, - "ud": { - "u": 0.9, - "t": 0.9014844804318488, - "punct": 0.9367327667610954 - } - }, - "bg": { - "opus100": { - "u": 0.9381591562799617, - "t": 0.9245593419506464, - "punct": 0.9656013661868749 - }, - "ud": { - "u": 0.9851685393258426, - "t": 0.9835042353990191, - "punct": 0.9973142345568488 - } - }, - "bn": { - "opus100": { - "u": 0.8097624754848551, - "t": 0.8495228774161977, - "punct": 0.8939207482156042 - }, - "ud": { - "u": 1.0, - "t": null, - "punct": null - } - }, - "ca": { - "opus100": { - "u": 0.8938156359393232, - "t": 0.9010158280179542, - "punct": 0.9524283935242839 - }, - "ud": { - "u": 0.9861900893582453, - "t": 0.9851471779638131, - "punct": 0.999729070712544 - } - }, - "ceb": { - "ud": { - "u": 0.9973474801061007, - "t": null, - "punct": null - } - }, - "cs": { - "ersatz": { - "u": 0.9331727874774232, - "t": 0.9475534720187518, - "punct": 0.9936342592592592 - }, - "opus100": { - "u": 0.8941068139963166, - "t": 0.9080244016893476, - "punct": 0.9531598513011152 - }, - "ud": { - "u": 0.9244332493702772, - "t": 0.9245937924442879, - "punct": 0.9711028500619577 - } - }, - "cy": { - "opus100": { - "u": 0.7078955268688083, - "t": 0.7689576464620848, - "punct": 0.8349301943607993 - }, - "ud": { - "u": 0.9910761154855643, - "t": 0.9936842105263157, - "punct": 0.9957939011566773 - } - }, - "da": { - "opus100": { - "u": 0.9058358521274122, - "t": 0.9098993681254388, - "punct": 0.9550342130987292 - }, - "ud": { - "u": 0.9507908611599297, - "t": 0.9507908611599297, - "punct": 0.9838420107719928 - } - }, - "de": { - "ersatz": { - "u": 0.9667170953101362, - "t": 0.9657099314198628, - "punct": 0.9949005609382967 - }, - "opus100": { - "u": 0.8083989501312336, - "t": 0.8601997146932953, - "punct": 0.9085334695963209 - }, - "ud": { - "u": 0.9627740948495664, - "t": 0.9644513137557958, - "punct": 0.9608879492600424 - } - }, - "el": { - "opus100": { - "u": 0.924056603773585, - "t": 0.9342294767162745, - "punct": 0.9638198375584544 - }, - "ud": { - "u": 0.9724972497249724, - "t": 0.9735099337748344, - "punct": 0.9766925638179799 - } - }, - "en": { - "ersatz": { - "u": 0.9751205400192864, - "t": 0.974869177595452, - "punct": 0.9891248058001035 - }, - "opus100": { - "u": 0.9191572758451738, - "t": 0.9060434372049102, - "punct": 0.9502572898799314 - }, - "ud": { - "u": 0.9495875343721357, - "t": 0.9506398537477148, - "punct": 0.9717984281091077 - } - }, - "eo": { - "opus100": { - "u": 0.9215066828675577, - "t": 0.9192965550469766, - "punct": 0.9594460929772501 - } - }, - "es": { - "ersatz": { - "u": 0.9875816993464053, - "t": 0.9795176742649487, - "punct": 0.995591836734694 - }, - "opus100": { - "u": 0.9216938727229713, - "t": 0.9231863442389758, - "punct": 0.9575518262586378 - }, - "ud": { - "u": 0.968372627947096, - "t": 0.9691019347386659, - "punct": 0.995937318630296 - } - }, - "et": { - "ersatz": { - "u": 0.9607990012484394, - "t": 0.9491611591255719, - "punct": 0.9849774661992989 - }, - "opus100": { - "u": 0.8641534686593799, - "t": 0.9034564958283672, - "punct": 0.9541420118343193 - }, - "ud": { - "u": 0.9425287356321838, - "t": 0.9422689986070267, - "punct": 0.9813975300922307 - } - }, - "eu": { - "opus100": { - "u": 0.8570145903479237, - "t": 0.8643604915794265, - "punct": 0.9316406250000001 - }, - "ud": { - "u": 0.965592572364828, - "t": 0.967991169977925, - "punct": 0.9986091794158554 - } - }, - "fa": { - "opus100": { - "u": 0.6538112522686026, - "t": 0.6497484139138044, - "punct": 0.7698315840859166 - }, - "ud": { - "u": 0.9709321750751754, - "t": 0.98110661268556, - "punct": 0.998627316403569 - } - }, - "fi": { - "ersatz": { - "u": 0.9796226415094339, - "t": 0.9807932152656523, - "punct": 0.9962453066332917 - }, - "opus100": { - "u": 0.9227863046044864, - "t": 0.9324389075227599, - "punct": 0.963107744930369 - }, - "ud": { - "u": 0.9339593114241002, - "t": 0.9339652448657189, - "punct": 0.9842291599613775 - } - }, - "fr": { - "ersatz": { - "u": 0.9744047619047619, - "t": 0.9707037148897614, - "punct": 0.9915915915915916 - }, - "opus100": { - "u": 0.8919043238270469, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9694835680751175, - "t": 0.9773539928486292, - "punct": 0.9866989117291415 - } - }, - "fy": { - "opus100": { - "u": 0.5809735921094495, - "t": 0.6807576173483393, - "punct": 0.876847290640394 - } - }, - "ga": { - "opus100": { - "u": 0.8220150909897914, - "t": 0.8349647165945823, - "punct": 0.9072011878247958 - }, - "ud": { - "u": 0.8598665395614872, - "t": 0.9050505050505051, - "punct": 0.98787210584344 - } - }, - "gd": { - "opus100": { - "u": 0.860616587060356, - "t": 0.8685762426284751, - "punct": 0.9408450704225352 - }, - "ud": { - "u": 0.7101769911504425, - "t": 0.7439446366782007, - "punct": 0.8306306306306306 - } - }, - "gl": { - "opus100": { - "u": 0.9071090047393364, - "t": 0.9117929050814957, - "punct": 0.942969518190757 - }, - "ud": { - "u": 0.9899749373433584, - "t": 0.9886792452830189, - "punct": 0.9925373134328358 - } - }, - "gu": { - "ersatz": { - "u": 0.9251700680272108, - "t": 0.9131513647642681, - "punct": 0.9783677482792528 - }, - "opus100": { - "u": 0.7474370922646785, - "t": 0.746879680479281, - "punct": 0.7906976744186045 - } - }, - "ha": { - "opus100": { - "u": 0.8791506663654844, - "t": 0.9042089985486211, - "punct": 0.9183374083129584 - } - }, - "he": { - "opus100": { - "u": 0.91412213740458, - "t": 0.9086617890016521, - "punct": 0.9417018109650211 - }, - "ud": { - "u": 0.9612983770287141, - "t": 0.961104140526976, - "punct": 0.989821882951654 - } - }, - "hi": { - "ersatz": { - "u": 0.9547758284600389, - "t": 0.9496893164962918, - "punct": 0.9745324313569439 - }, - "opus100": { - "u": 0.6857646540326134, - "t": 0.6840180684018069, - "punct": 0.7901468498342018 - }, - "ud": { - "u": 0.9817109144542772, - "t": 0.9832303618711384, - "punct": 0.9991095280498665 - } - }, - "hu": { - "opus100": { - "u": 0.9264253607759642, - "t": 0.9326968973747016, - "punct": 0.9635607728050869 - }, - "ud": { - "u": 0.9653631284916202, - "t": 0.9632925472747497, - "punct": 0.991130820399113 - } - }, - "hy": { - "opus100": { - "u": 0.8740435158693194, - "t": null, - "punct": null - }, - "ud": { - "u": 0.967266775777414, - "t": 0.9660314830157416, - "punct": 0.980849292256453 - } - }, - "id": { - "opus100": { - "u": 0.9024031387935262, - "t": 0.9048678033110946, - "punct": 0.9458717884759291 - }, - "ud": { - "u": 0.9865871833084948, - "t": null, - "punct": null - } - }, - "ig": { - "opus100": { - "u": 0.7557480839720093, - "t": 0.8528649582493522, - "punct": 0.9136712749615975 - } - }, - "is": { - "opus100": { - "u": 0.943579766536965, - "t": 0.9453658536585366, - "punct": 0.9698455503799951 - }, - "ud": { - "u": 0.8831214689265537, - "t": 0.8917927145797906, - "punct": 0.9699176087373059 - } - }, - "it": { - "opus100": { - "u": 0.8759550561797754, - "t": 0.9035294117647059, - "punct": 0.9428431851489985 - }, - "ud": { - "u": 0.9554013875123885, - "t": 0.9404878048780487, - "punct": 0.9948293691830403 - } - }, - "ja": { - "ersatz": { - "u": 0.8428949691085613, - "t": 0.8548463356973994, - "punct": 0.96548463356974 - }, - "opus100": { - "u": 0.5768435561681599, - "t": 0.8266918179674606, - "punct": 0.8858148893360162 - }, - "ud": { - "u": 0.9603603603603604, - "t": 0.9676225716928769, - "punct": 0.9860205032618825 - } - }, - "jv": { - "ud": { - "u": 0.9575289575289575, - "t": null, - "punct": null - } - }, - "ka": { - "opus100": { - "u": 0.9069600574025353, - "t": 0.9068767908309456, - "punct": 0.9344181459566073 - } - }, - "kk": { - "ersatz": { - "u": 0.9512069851052901, - "t": 0.9479112944816916, - "punct": 0.9965017491254372 - }, - "opus100": { - "u": 0.7036496350364964, - "t": 0.7466373867691464, - "punct": 0.9261186264308012 - }, - "ud": { - "u": 0.9701923076923077, - "t": 0.9178990311065783, - "punct": 0.9856870229007634 - } - }, - "km": { - "ersatz": { - "u": 0.751590224305323, - "t": 0.9269827005167378, - "punct": 0.9322071571460325 - }, - "opus100": { - "u": 0.7332139659803043, - "t": 0.7349609866599547, - "punct": 0.8132413793103448 - } - }, - "kn": { - "opus100": { - "u": 0.717741935483871, - "t": 0.6535714285714286, - "punct": 0.7923854848304581 - } - }, - "ko": { - "opus100": { - "u": 0.6097635861221984, - "t": 0.7293544457978076, - "punct": 0.8260147163541419 - }, - "ud": { - "u": 0.992595818815331, - "t": 0.9925990422289944, - "punct": 0.999343975508419 - } - }, - "ku": { - "opus100": { - "u": 0.7962752668635021, - "t": 0.721331689272503, - "punct": 0.8354503464203233 - } - }, - "ky": { - "opus100": { - "u": 0.8592178770949721, - "t": 0.8534046981861433, - "punct": 0.9181008902077151 - } - }, - "la": { - "ud": { - "u": 0.8504398826979472, - "t": 0.9034945614441102, - "punct": 0.9744812783210112 - } - }, - "lt": { - "ersatz": { - "u": 0.9753892516323456, - "t": 0.9722362443210499, - "punct": 0.9934967483741871 - }, - "opus100": { - "u": 0.8150800336983993, - "t": 0.8669905158454776, - "punct": 0.926829268292683 - }, - "ud": { - "u": 0.9773887673231219, - "t": 0.9765739385065886, - "punct": 0.9948792977322602 - } - }, - "lv": { - "ersatz": { - "u": 0.9770955165692008, - "t": 0.9781487846795974, - "punct": 0.9975222993062439 - }, - "opus100": { - "u": 0.8343824614716735, - "t": 0.8857278330962541, - "punct": 0.9319812020776651 - }, - "ud": { - "u": 0.9656061908856407, - "t": 0.9702127659574469, - "punct": 0.9941847943140212 - } - }, - "mg": { - "opus100": { - "u": 0.9239904988123516, - "t": 0.9318892900120337, - "punct": 0.962889658584859 - } - }, - "mk": { - "opus100": { - "u": 0.9330819981149859, - "t": 0.9320159962361798, - "punct": 0.9592825981580222 - } - }, - "ml": { - "opus100": { - "u": 0.8146620847651774, - "t": 0.8229376257545271, - "punct": 0.8737266050698886 - } - }, - "mn": { - "opus100": { - "u": 0.8186798137496576, - "t": null, - "punct": null - } - }, - "mr": { - "opus100": { - "u": 0.9055867284703586, - "t": 0.9105367793240557, - "punct": 0.9324586977648203 - }, - "ud": { - "u": 0.9375000000000001, - "t": 0.9292929292929293, - "punct": 0.9894736842105264 - } - }, - "ms": { - "opus100": { - "u": 0.8851897184822521, - "t": 0.8895463510848126, - "punct": 0.9461615154536391 - } - }, - "mt": { - "opus100": { - "u": 0.6833514689880306, - "t": 0.8293706293706292, - "punct": 0.8995886765061697 - }, - "ud": { - "u": 0.9048543689320389, - "t": 0.9014634146341464, - "punct": 0.9506292352371731 - } - }, - "my": { - "opus100": { - "u": 0.6614913176710929, - "t": 0.7849546044098573, - "punct": 0.842409892133649 - } - }, - "ne": { - "opus100": { - "u": 0.7364672364672364, - "t": 0.7340134361781538, - "punct": 0.7772435897435898 - } - }, - "nl": { - "opus100": { - "u": 0.9267831837505904, - "t": null, - "punct": null - }, - "ud": { - "u": 0.9493243243243243, - "t": 0.9246963562753037, - "punct": 0.9603305785123966 - } - }, - "no": { - "opus100": { - "u": 0.9491525423728814, - "t": 0.949842500605767, - "punct": 0.9661598822952429 - }, - "ud": { - "u": 0.9791013584117032, - "t": 0.9845995893223819, - "punct": 0.9935550399587523 - } - }, - "pa": { - "opus100": { - "u": 0.5848580441640379, - "t": 0.643453237410072, - "punct": 0.799583007557988 - } - }, - "pl": { - "ersatz": { - "u": 0.9441056910569106, - "t": 0.9435897435897436, - "punct": 0.9493487698986975 - }, - "opus100": { - "u": 0.9293413173652695, - "t": 0.9350084561488282, - "punct": 0.9593933463796478 - }, - "ud": { - "u": 0.9617319046577738, - "t": 0.9645764576457646, - "punct": 0.99479520253451 - } - }, - "ps": { - "ersatz": { - "u": 0.8401403956604977, - "t": 0.9120141342756183, - "punct": 0.9661458333333334 - }, - "opus100": { - "u": 0.653194263363755, - "t": 0.7293274531422271, - "punct": 0.7882517482517484 - } - }, - "pt": { - "opus100": { - "u": 0.9147540983606557, - "t": 0.9249521988527725, - "punct": 0.9578151670324311 - }, - "ud": { - "u": 0.9667099005620406, - "t": 0.967686342093925, - "punct": 0.9836488812392428 - } - }, - "ro": { - "ersatz": { - "u": 0.9789447609611097, - "t": 0.9639593908629441, - "punct": 0.9955112219451371 - }, - "opus100": { - "u": 0.9104895104895104, - "t": 0.9143393569584604, - "punct": 0.9724907063197027 - }, - "ud": { - "u": 0.8342585249801745, - "t": 0.9223702342673403, - "punct": 0.9947494033412888 - } - }, - "ru": { - "ersatz": { - "u": 0.9797160243407708, - "t": 0.9823499747856783, - "punct": 0.9939455095862765 - }, - "opus100": { - "u": 0.8411174159755566, - "t": null, - "punct": null - }, - "ud": { - "u": 0.8675703858185609, - "t": 0.8689295039164491, - "punct": 0.9313669880884856 - } - }, - "si": { - "opus100": { - "u": 0.8043581241117953, - "t": 0.8105705112373425, - "punct": 0.8682323856613101 - } - }, - "sk": { - "opus100": { - "u": 0.9199623352165724, - "t": 0.936678032148076, - "punct": 0.9636767976278724 - }, - "ud": { - "u": 0.9630331753554502, - "t": 0.9585121602288984, - "punct": 0.9896324222431668 - } - }, - "sl": { - "opus100": { - "u": 0.9276114124027351, - "t": 0.9351454458750595, - "punct": 0.9568589432864759 - }, - "ud": { - "u": 0.9638183217859891, - "t": 0.9631925610228594, - "punct": 0.99375 - } - }, - "sq": { - "opus100": { - "u": 0.9118755890669179, - "t": 0.9217391304347826, - "punct": 0.9610261470152937 - }, - "ud": { - "u": 1.0, - "t": null, - "punct": null - } - }, - "sr": { - "opus100": { - "u": 0.9464585834333734, - "t": 0.9484585741811176, - "punct": 0.9676787463271304 - }, - "ud": { - "u": 0.9776046738072055, - "t": 0.985645933014354, - "punct": 0.9990375360923965 - } - }, - "sv": { - "opus100": { - "u": 0.925064599483204, - "t": 0.9357841967056577, - "punct": 0.9600778967867575 - }, - "ud": { - "u": 0.960489181561618, - "t": 0.9591740966682308, - "punct": 0.9741379310344828 - } - }, - "ta": { - "ersatz": { - "u": 0.9557266602502407, - "t": 0.9511465603190429, - "punct": 0.9797930014785609 - }, - "opus100": { - "u": 0.6882303839732888, - "t": 0.6838674633351439, - "punct": 0.7681867535287731 - }, - "ud": { - "u": 0.9876543209876543, - "t": 0.9833333333333333, - "punct": 0.9917355371900827 - } - }, - "te": { - "opus100": { - "u": 0.7778936392075078, - "t": 0.7825436408977557, - "punct": 0.8459709379128136 - } - }, - "tg": { - "opus100": { - "u": 0.8056027164685908, - "t": 0.8613393079412496, - "punct": 0.905284656624968 - } - }, - "th": { - "opus100": { - "u": 0.6984069514844315, - "t": 0.7222884386174016, - "punct": 0.7515409139213602 - }, - "ud": { - "u": 0.7521755580779418, - "t": null, - "punct": null - } - }, - "tr": { - "ersatz": { - "u": 0.9504145667371159, - "t": 0.9491691104594331, - "punct": 0.9873879853966147 - }, - "opus100": { - "u": 0.9374999999999999, - "t": 0.9366902957712051, - "punct": 0.9569945625308947 - }, - "ud": { - "u": 0.9439844130540672, - "t": 0.9490000000000001, - "punct": 0.9913749365804161 - } - }, - "uk": { - "opus100": { - "u": 0.9006529850746269, - "t": 0.9051886792452831, - "punct": 0.949514563106796 - }, - "ud": { - "u": 0.9270482603815937, - "t": 0.9279279279279279, - "punct": 0.9876819708846584 - } - }, - "ur": { - "opus100": { - "u": 0.6002858504049547, - "t": 0.5684727336489568, - "punct": 0.6860493525531395 - }, - "ud": { - "u": 0.9566003616636527, - "t": 0.9658986175115206, - "punct": 0.9934272300469483 - } - }, - "uz": { - "opus100": { - "u": 0.8007720351704911, - "t": 0.8186602870813398, - "punct": 0.8712178044511127 - } - }, - "vi": { - "opus100": { - "u": 0.9117572692793932, - "t": 0.9129327902240325, - "punct": 0.9562609347663084 - }, - "ud": { - "u": 0.7814504193389245, - "t": 0.9123867069486404, - "punct": 0.9837702871410736 - } - }, - "xh": { - "opus100": { - "u": 0.793065125583463, - "t": 0.8298536117110632, - "punct": 0.9095593919093017 - } - }, - "yi": { - "opus100": { - "u": 0.5842490842490843, - "t": 0.6279982555604011, - "punct": 0.7867549668874172 - } - }, - "yo": { - "opus100": { - "u": 0.7815097784825332, - "t": null, - "punct": null - }, - "ud": { - "u": 0.8831908831908832, - "t": null, - "punct": null - } - }, - "zh": { - "ersatz": { - "u": 0.8890670943597969, - "t": 0.9194029850746269, - "punct": 0.9758382642998028 - }, - "opus100": { - "u": 0.8357732233164759, - "t": 0.811303129378795, - "punct": 0.9091821374811841 - }, - "ud": { - "u": 0.968335035750766, - "t": 0.979187314172448, - "punct": 0.9979959919839679 - } - }, - "zu": { - "opus100": { - "u": 0.6777281429483089, - "t": 0.8616417139907072, - "punct": 0.9095773140716961 - } - } -} \ No newline at end of file diff --git a/wtpsplit/evaluation/intrinsic.py b/wtpsplit/evaluation/intrinsic.py index 10e132bc..18b6b60a 100644 --- a/wtpsplit/evaluation/intrinsic.py +++ b/wtpsplit/evaluation/intrinsic.py @@ -1,25 +1,26 @@ import copy import json -from dataclasses import dataclass -from typing import List, Union -import os -import time import logging +import os import sys +import time +from dataclasses import dataclass +from typing import List, Union import h5py +import numpy as np +import skops.io as sio import torch from datasets import load_dataset from tqdm.auto import tqdm from transformers import AutoModelForTokenClassification, HfArgumentParser -import numpy as np -import adapters +import adapters import wtpsplit.models # noqa: F401 -from wtpsplit.models import SubwordXLMConfig, SubwordXLMForTokenClassification -from wtpsplit.evaluation import evaluate_mixture, get_labels, train_mixture, token_to_char_probs +from wtpsplit.evaluation import evaluate_mixture, get_labels, token_to_char_probs, train_mixture from wtpsplit.evaluation.intrinsic_baselines import split_language_data from wtpsplit.extract import PyTorchWrapper, extract +from wtpsplit.models import SubwordXLMConfig, SubwordXLMForTokenClassification from wtpsplit.utils import Constants logger = logging.getLogger() @@ -43,8 +44,7 @@ class Args: # } # } # } - # TODO: for songs/etc., maybe feed in each sample separately? - eval_data_path: str = "data/all_data_11_05-all.pth" + eval_data_path: str = "data/all_data.pth" valid_text_path: str = None # "data/sentence/valid.parquet" device: str = "cpu" block_size: int = 512 @@ -54,7 +54,7 @@ class Args: custom_language_list: str = None threshold: float = 0.01 max_n_train_sentences: int = 10000 - max_n_test_sentences: int = -1 + max_n_test_sentences: int = -1 # -1 is all keep_logits: bool = False skip_adaptation: bool = False skip_punct: bool = True @@ -63,14 +63,14 @@ class Args: return_indices: bool = True exclude_every_k: int = 10 save_suffix: str = "" - num_hidden_layers: Union[int, None] = None + num_hidden_layers: Union[int, None] = None # for original XLM-R def process_logits(text, model, lang_code, args): # Extract necessary data if isinstance(text, list): logits = [] - for short_seq in tqdm(text, desc="Short sequences", disable=False): + for short_seq in tqdm(text, desc="Listwise", disable=False): current_logits, current_offsets_mapping, tokenizer = extract( [short_seq], model, @@ -111,16 +111,11 @@ def process_logits(text, model, lang_code, args): if "xlm" in model.config.model_type: tokens = tokenizer.tokenize(text, verbose=False) - # Use the vectorized function to convert token probabilities to character probabilities for the entire array + # convert token probabilities to character probabilities for the entire array char_probs = token_to_char_probs(text, tokens, logits, tokenizer, offsets_mapping) logits = char_probs - if len(model.model.config.id2label) == 2: - # Igor's old models: take winning logit - logits = np.expand_dims(logits.argmax(axis=1), axis=1) - # we apply sigmoid later; convert to fake logits - logits = np.log((logits + 1e-8) / (1 - logits + 1e-8)) return logits @@ -137,7 +132,7 @@ def load_or_compute_logits(args, model, eval_data, valid_data=None, save_str: st else: use_langs = eval_data.keys() - total_test_time = 0 # Initialize total test processing time + total_test_time = 0 with h5py.File(logits_path, "a") as f, torch.no_grad(): for lang_code in tqdm(use_langs, desc="Languages"): @@ -187,15 +182,7 @@ def load_or_compute_logits(args, model, eval_data, valid_data=None, save_str: st if args.adapter_path: if args.clf_from_scratch: model.model.classifier = torch.nn.Linear(model.model.classifier.in_features, 1) - # if ( - # any(code in lang_code for code in ["ceb", "jv", "mn", "yo"]) - # and "ted2020" not in dataset_name - # ): - # # no ersatz for these either. - # dataset_load_name = "nllb" - # if "corrupted" in dataset_load_name: - # dataset_load_name += "-corrupted" - # else: + dataset_load_name = dataset_name model.model.load_adapter( args.adapter_path + "/" + dataset_load_name + "/" + lang_code, @@ -212,21 +199,9 @@ def load_or_compute_logits(args, model, eval_data, valid_data=None, save_str: st dset_group = lang_group[dataset_name] if "test_logits" not in dset_group: - # logger.warning(f"RUN: {lang_code} {dataset_name}") test_sentences = dataset["data"] if not test_sentences: continue - # if ( - # isinstance(test_sentences[0], list) - # and "lyrics" not in dataset_name - # and "short" not in dataset_name - # ): - # # documents: only 10% of documents. 1000 sentences --> 100 docs - # max_n_sentences = args.max_n_test_sentences // 10 - # # shuffle sentences - # np.random.seed(42) - # test_sentences = np.random.permutation(test_sentences).tolist() - # else: max_n_sentences = args.max_n_test_sentences test_sentences = test_sentences[:max_n_sentences] if isinstance(test_sentences[0], list): @@ -237,10 +212,10 @@ def load_or_compute_logits(args, model, eval_data, valid_data=None, save_str: st else: test_text = Constants.SEPARATORS.get(lang_code, " ").join(test_sentences) - start_time = time.time() # Start timing for test logits processing + start_time = time.time() test_logits = process_logits(test_text, model, lang_code, args) - end_time = time.time() # End timing for test logits processing - total_test_time += end_time - start_time # Accumulate test processing time + end_time = time.time() + total_test_time += end_time - start_time if isinstance(test_sentences[0], list): test_logit_lengths = [] # store start and end indices for each pair, used later to slice the logits @@ -252,7 +227,7 @@ def load_or_compute_logits(args, model, eval_data, valid_data=None, save_str: st test_logits = np.concatenate(test_logits) # NOTE: handled differently than in intrinsic_pairwise.py # here, we keep the label at the end - # in intrinsic_pairwise.py, we only consider the labels in the middle. + # in intrinsic_pairwise.py, we only consider non-ending labels. test_labels = [ get_labels(lang_code, short_seq, after_space=False)[:-1] for short_seq in test_sentences ] @@ -311,7 +286,7 @@ def compute_statistics(values): if not values: # Check for empty values list return {"mean": None, "median": None, "std": None, "min": None, "min_lang": None, "max": None, "max_lang": None} - scores, langs = zip(*values) # Unpack scores and languages + scores, langs = zip(*values) min_index = np.argmin(scores) max_index = np.argmax(scores) return { @@ -342,6 +317,7 @@ def main(args): logger.warning("Loading model...") model_path = args.model_path if args.model_path == "xlm-roberta-base" or args.model_path == "xlm-roberta-large": + # init models here config = SubwordXLMConfig.from_pretrained( args.model_path, num_hidden_layers=args.num_hidden_layers, @@ -359,9 +335,6 @@ def main(args): adapters.init(model.model) # reset model type (used later) model.model.config.model_type = model_type - if "meta-clf" in args.adapter_path: - clf = model.model.classifier - model.model.classifier = torch.nn.Sequential(clf, torch.nn.Linear(clf.out_features, 1)) save_str += f"{args.save_suffix}" if args.max_n_test_sentences < sys.maxsize and args.max_n_test_sentences != -1: @@ -381,7 +354,7 @@ def main(args): clfs = {} if args.return_indices: indices = {} - # Initialize lists to store scores for each metric across all languages + u_scores, t_scores, punct_scores = [], [], [] for lang_code, dsets in tqdm(eval_data.items()): @@ -398,13 +371,6 @@ def main(args): sentences = dataset["data"] if not sentences: continue - # if isinstance(sentences[0], list) and "lyrics" not in dataset_name and "short" not in dataset_name: - # # documents: only 10% of documents. 1000 sentences --> 100 docs - # max_n_sentences = args.max_n_test_sentences // 10 - # # shuffle sentences - # np.random.seed(42) - # sentences = np.random.permutation(sentences).tolist() - # else: max_n_sentences = args.max_n_test_sentences sentences = sentences[:max_n_sentences] if len(sentences) == 0: @@ -412,6 +378,7 @@ def main(args): if lang_code not in f or dataset_name not in f[lang_code]: continue + # to be in line w/ LLM eval; for fair comparison if "lyrics" in dataset_name or "short" in dataset_name: exclude_every_k = 0 else: @@ -589,13 +556,14 @@ def main(args): "include_langs": args.include_langs, } - # sio.dump( - # clfs, - # open( - # Constants.CACHE_DIR / "intrinsic" / f"{save_str}.skops", - # "wb", - # ), - # ) + if not args.skip_adaptation: + sio.dump( + clfs, + open( + Constants.CACHE_DIR / "intrinsic" / f"{save_str}.skops", + "wb", + ), + ) json.dump( results, open( @@ -606,7 +574,6 @@ def main(args): ) print(Constants.CACHE_DIR / "intrinsic" / f"{save_str}.json") - # Write results_avg to JSON json.dump( results_avg, open( diff --git a/wtpsplit/evaluation/intrinsic_baselines.py b/wtpsplit/evaluation/intrinsic_baselines.py index dd6a46fe..97ba0fa3 100644 --- a/wtpsplit/evaluation/intrinsic_baselines.py +++ b/wtpsplit/evaluation/intrinsic_baselines.py @@ -10,7 +10,6 @@ LanguageError, ersatz_sentencize, evaluate_sentences, - preprocess_sentence, punkt_sentencize, pysbd_sentencize, spacy_dp_sentencize, @@ -20,6 +19,7 @@ def split_language_data(eval_data): + # used if 2 language codes given (i.e., code-switching) new_eval_data = {} for lang_code, lang_data in eval_data.items(): @@ -28,7 +28,6 @@ def split_language_data(eval_data): new_lang1 = f"{lang_code}_{lang1.upper()}" new_lang2 = f"{lang_code}_{lang2.upper()}" - # Adding the same content for both new language keys new_eval_data[new_lang1] = lang_data new_eval_data[new_lang2] = lang_data else: @@ -39,7 +38,7 @@ def split_language_data(eval_data): @dataclass class Args: - eval_data_path: str = "data/all_data_11_05-all.pth" + eval_data_path: str = "data/all_data.pth" include_langs: List[str] = None exclude_every_k: int = 10 @@ -62,28 +61,10 @@ class Args: for dataset_name, dataset in lang_data["sentence"].items(): if "nllb" in dataset_name: continue - # if "corrupted" in dataset_name and dataset_name != "ted2020-corrupted-asr": - # print("SKIP: ", lang, dataset_name) - # continue - # if "legal" in dataset_name and not ("laws" in dataset_name or "judgements" in dataset_name): - # print("SKIP: ", lang, dataset_name) - # continue - # if "ted2020-corrupted-asr" not in dataset_name: - # continue if not dataset["data"]: continue results[lang][dataset_name] = {} indices[lang][dataset_name] = {} - if "asr" in dataset_name and not any( - x in dataset_name for x in ["lyrics", "short", "code", "ted2020", "legal"] - ): - continue - if "legal" in dataset_name and not ("laws" in dataset_name or "judgements" in dataset_name): - continue - if "social-media" in dataset_name: - continue - if "nllb" in dataset_name: - continue if "-" in lang: # code-switched data: eval 2x @@ -106,7 +87,6 @@ class Args: exclude_every_k = args.exclude_every_k try: if isinstance(dataset["data"][0], list): - # all_sentences = [[preprocess_sentence(s) for s in doc] for doc in dataset["data"]] all_sentences = dataset["data"] metrics = [] for i, sentences in enumerate(all_sentences): @@ -172,8 +152,7 @@ class Args: indices[lang][dataset_name][name]["length"] = [metrics.pop("length")] results[lang][dataset_name][name] = metrics except LanguageError as e: - # print("Language not supported for", name) - # print(e) + print("Language not supported for", name) results[lang][dataset_name][name] = None json.dump(results, open(Constants.CACHE_DIR / "intrinsic_baselines.json", "w"), indent=4, default=int) diff --git a/wtpsplit/evaluation/intrinsic_baselines_multilingual.py b/wtpsplit/evaluation/intrinsic_baselines_multilingual.py index 9ad420ab..e1091ded 100644 --- a/wtpsplit/evaluation/intrinsic_baselines_multilingual.py +++ b/wtpsplit/evaluation/intrinsic_baselines_multilingual.py @@ -8,18 +8,14 @@ from wtpsplit.evaluation import ( LanguageError, - ersatz_sentencize, evaluate_sentences, - preprocess_sentence, - punkt_sentencize, - pysbd_sentencize, spacy_dp_sentencize, - spacy_sent_sentencize, ) from wtpsplit.utils import Constants def split_language_data(eval_data): + # used if 2 language codes given (i.e., code-switching) new_eval_data = {} for lang_code, lang_data in eval_data.items(): @@ -28,7 +24,6 @@ def split_language_data(eval_data): new_lang1 = f"{lang_code}_{lang1.upper()}" new_lang2 = f"{lang_code}_{lang2.upper()}" - # Adding the same content for both new language keys new_eval_data[new_lang1] = lang_data new_eval_data[new_lang2] = lang_data else: @@ -39,7 +34,7 @@ def split_language_data(eval_data): @dataclass class Args: - eval_data_path: str = "data/all_data_11_05-all.pth" + eval_data_path: str = "data/all_data.pth" include_langs: List[str] = None exclude_every_k: int = 10 @@ -62,27 +57,11 @@ class Args: for dataset_name, dataset in lang_data["sentence"].items(): if "nllb" in dataset_name: continue - # if "corrupted" in dataset_name and dataset_name != "ted2020-corrupted-asr": - # print("SKIP: ", lang, dataset_name) - # continue - # if "legal" in dataset_name and not ("laws" in dataset_name or "judgements" in dataset_name): - # print("SKIP: ", lang, dataset_name) - # continue if not dataset["data"]: continue results[lang][dataset_name] = {} indices[lang][dataset_name] = {} - if "asr" in dataset_name and not any( - x in dataset_name for x in ["lyrics", "short", "code", "ted2020", "legal"] - ): - continue - if "legal" in dataset_name and not ("laws" in dataset_name or "judgements" in dataset_name): - continue - if "social-media" in dataset_name: - continue - if "nllb" in dataset_name: - continue - + if "-" in lang: # code-switched data: eval 2x lang_code = lang.split("_")[1].lower() @@ -95,13 +74,13 @@ class Args: ]: print(f"Running {name} on {dataset_name} in {lang_code}...") indices[lang][dataset_name][name] = {} + # exclude also here for fair comparison with others if "lyrics" in dataset_name or "short" in dataset_name: exclude_every_k = 0 else: exclude_every_k = args.exclude_every_k try: if isinstance(dataset["data"][0], list): - # all_sentences = [[preprocess_sentence(s) for s in doc] for doc in dataset["data"]] all_sentences = dataset["data"] metrics = [] for i, sentences in enumerate(all_sentences): @@ -110,7 +89,7 @@ class Args: doc_metrics = evaluate_sentences( lang_code, sentences, - f("xx", text), + f("xx", text), # xx is multilingual key return_indices=True, exclude_every_k=exclude_every_k, ) @@ -143,11 +122,9 @@ class Args: if avg_results[key]: avg_results[key] = sum(avg_results[key]) / len(avg_results[key]) - # Store the results and indices results[lang][dataset_name][name] = avg_results indices[lang][dataset_name][name] = concat_indices else: - # sentences = [preprocess_sentence(s) for s in dataset["data"]] sentences = dataset["data"] text = Constants.SEPARATORS[lang_code].join(sentences) diff --git a/wtpsplit/evaluation/intrinsic_list.py b/wtpsplit/evaluation/intrinsic_list.py deleted file mode 100644 index fa7be03a..00000000 --- a/wtpsplit/evaluation/intrinsic_list.py +++ /dev/null @@ -1,493 +0,0 @@ -import copy -import json -import logging -import os -import time -from dataclasses import dataclass -from typing import List - -import h5py -import numpy as np -import torch -from datasets import load_dataset -from tqdm.auto import tqdm -from transformers import AutoModelForTokenClassification, HfArgumentParser - -import adapters -import wtpsplit.models # noqa: F401 -from wtpsplit.evaluation import evaluate_mixture, get_labels, token_to_char_probs, train_mixture -from wtpsplit.extract import PyTorchWrapper, extract -from wtpsplit.utils import Constants -from collections import defaultdict - -logger = logging.getLogger() -logger.setLevel(logging.INFO) - - -@dataclass -class Args: - model_path: str - adapter_path: str = None - # eval data in the format: - # { - # "": { - # "sentence": { - # "": { - # "meta": { - # "train_data": ["train sentence 1", "train sentence 2"] - # }, - # "data": ["test sentence 1", "test sentence 2"] - # } - # } - # } - # } - # TODO: for songs/etc., maybe feed in each sample separately? - eval_data_path: str = "data/eval.pth" - valid_text_path: str = None # "data/sentence/valid.parquet" - device: str = "cpu" - block_size: int = 510 - stride: int = 64 - batch_size: int = 1 - include_langs: List[str] = None - custom_language_list: str = None - threshold: float = 0.01 - max_n_train_sentences: int = 10_000 - save_suffix: str = "" - do_lowercase: bool = False - do_remove_punct: bool = False - do_strip: bool = False - tqdm: bool = False - skip_adaptation: bool = False - clf_from_scratch: bool = False - - -def process_logits_list(text, model, lang_code, block_size, stride, batch_size, verbose=True) -> List[np.ndarray]: - logits_list = [] - - for chunk in tqdm(text, disable=not verbose): - merged_chunk = Constants.SEPARATORS[lang_code].join(chunk) - # Extract necessary data - logits, offsets_mapping, tokenizer = extract( - [merged_chunk], - model, - lang_code=lang_code, - stride=args.stride, - block_size=block_size, - batch_size=1, - pad_last_batch=True, - ) - logits = logits[0] - if offsets_mapping is not None: - offsets_mapping = offsets_mapping[0] - - if "xlm" in model.config.model_type: - tokens = tokenizer.tokenize(merged_chunk, verbose=False) - - # padding is also removed here (via offset_mapping) - logits = token_to_char_probs(merged_chunk, tokens, logits, tokenizer, offsets_mapping) - if len(model.model.config.id2label) == 2: - # Igor's models: take winning logit - logits = np.expand_dims(logits.argmax(axis=1), axis=1) - # we apply sigmoid later; convert to fake logits - logits = np.log((logits + 1e-8) / (1 - logits + 1e-8)) - logits_list.append(logits) - else: - raise NotImplementedError("Only XLM models are supported for now") - - return logits_list - - -def corrupt(text: str, do_lowercase: bool, do_remove_punct: bool): - if do_lowercase: - text = text.lower() - if do_remove_punct: - for punct in Constants.PUNCTUATION_CHARS: - text = text.replace(punct, "") - return text - - -def load_or_compute_logits(args, model, eval_data, valid_data=None, save_str: str = None): - logits_path = Constants.CACHE_DIR / "intrinsic_list" / f"{save_str}.h5" - - if not os.path.exists(Constants.CACHE_DIR / "intrinsic_list"): - os.makedirs(Constants.CACHE_DIR / "intrinsic_list") - - if args.custom_language_list is not None: - with open(args.custom_language_list, "r") as f: - # file is a csv: l1,l2,... - use_langs = f.read().strip().split(",") - else: - use_langs = Constants.LANGINFO.index - - total_test_time = 0 # Initialize total test processing time - - # TODO: revert to "a" - with h5py.File(logits_path, "a") as f, torch.no_grad(): - for lang_code in use_langs: - if args.include_langs is not None and lang_code not in args.include_langs: - continue - - print(f"Processing {lang_code}...") - if lang_code not in f: - lang_group = f.create_group(lang_code) - else: - lang_group = f[lang_code] - - # eval data - for dataset_name, dataset in eval_data[lang_code]["sentence"].items(): - # train on all mldb, eval on mldbW - if "mldbW" in args.eval_data_path and ( - "mldbW" not in args.model_path and "mldbW" not in args.adapter_path - ): - dataset_load_name = "unk" - else: - dataset_load_name = dataset_name - try: - if args.adapter_path: - if args.clf_from_scratch: - model.model.classifier = torch.nn.Linear(model.model.classifier.in_features, 1) - model.model.load_adapter( - args.adapter_path + "/" + dataset_load_name + "/" + lang_code, - set_active=True, - with_head=True, - load_as="text", - ) - if hasattr(model.model.config, "unfreeze_ln"): - if model.model.config.unfreeze_ln: - ln_dict = torch.load( - args.adapter_path + "/" + dataset_load_name + "/" + lang_code + "/ln_dict.pth" - ) - for n, p in model.backbone.named_parameters(): - if "LayerNorm" in n: - p.data = ln_dict[n].data - if not os.path.exists(os.path.join(args.model_path, "pytorch_model.bin")) and not os.path.exists( - os.path.join(args.model_path, "model.safetensors") - ): - model_path = os.path.join(args.model_path, dataset_load_name, "en") - print(model_path) - model = PyTorchWrapper( - AutoModelForTokenClassification.from_pretrained(model_path).to(args.device) - ) - except Exception as e: - print(f"Error loading adapter for {dataset_load_name} in {lang_code}: {e}") - continue - print(dataset_name, dataset_load_name) - if dataset_name not in lang_group: - dset_group = lang_group.create_group(dataset_name) - else: - dset_group = lang_group[dataset_name] - - if "test_logits" not in dset_group: - test_sentences = dataset["data"] - if args.do_strip: - test_sentences = [ - [sentence.lstrip("-").strip() for sentence in chunk] for chunk in test_sentences - ] - test_sentences = [ - [ - corrupt(sentence, do_lowercase=args.do_lowercase, do_remove_punct=args.do_remove_punct) - for sentence in chunk - ] - for chunk in test_sentences - ] - - start_time = time.time() # Start timing for test logits processing - test_logits = process_logits_list( - test_sentences, - model, - lang_code, - args.block_size, - args.stride, - args.batch_size, - ) - end_time = time.time() # End timing for test logits processing - total_test_time += end_time - start_time # Accumulate test processing time - test_logit_lengths = [] - # store start and end indices for each pair, used later to slice the logits - all_logit_lengths = np.append(0, np.cumsum([len(logits) for logits in test_logits])) - # append tuple of start and end indices for each pair - for i in range(len(test_logits)): - test_logit_lengths.append((all_logit_lengths[i], all_logit_lengths[i + 1] - 1)) - - test_logits = np.concatenate(test_logits) - test_labels = [ - get_labels(lang_code, test_chunk, after_space=False)[:-1] for test_chunk in test_sentences - ] - test_labels = np.append(np.concatenate(test_labels), 0) - assert len(test_labels) == len(test_logits) + 1 - - dset_group.create_dataset("test_logits", data=test_logits) - dset_group.create_dataset("test_labels", data=test_labels) - dset_group.create_dataset("test_logit_lengths", data=test_logit_lengths) - - train_sentences = dataset["meta"].get("train_data") - if train_sentences is not None and "train_logits" not in dset_group and not args.skip_adaptation: - train_sentences = [ - [ - corrupt(sentence, do_lowercase=args.do_lowercase, do_remove_punct=args.do_remove_punct) - for sentence in chunk - ] - for chunk in train_sentences - ] - if args.do_strip: - train_sentences = [ - [sentence.lstrip("-").strip() for sentence in chunk] for chunk in train_sentences - ] - train_sentences = train_sentences[: args.max_n_train_sentences] - - train_logits = process_logits_list( - train_sentences, - model, - lang_code, - args.block_size, - args.stride, - args.batch_size, - ) - train_logit_lengths = [] - # store start and end indices for each pair, used later to slice the logits - all_logit_lengths = np.append(0, np.cumsum([len(logits) for logits in train_logits])) - # append tuple of start and end indices for each pair - for i in range(len(train_logits)): - train_logit_lengths.append((all_logit_lengths[i], all_logit_lengths[i + 1] - 1)) - - train_logits = np.concatenate(train_logits) - train_labels = [ - get_labels(lang_code, train_chunk, after_space=False)[:-1] for train_chunk in train_sentences - ] - train_labels = np.append(np.concatenate(train_labels), 0) - assert len(train_labels) == len(train_logits) + 1 - - dset_group.create_dataset("train_logits", data=train_logits) - dset_group.create_dataset("train_labels", data=train_labels) - - end_time = time.time() - return h5py.File(logits_path, "r"), total_test_time / 60 # to minutes - - -def compute_statistics(values): - if not values: - return {} - - # Extract all possible keys (metrics) from the first score dictionary, assuming all dicts have the same keys - all_metrics = values[0][0].keys() - - # Prepare a dictionary to store statistics for each metric - stats_dict = {} - - for metric in all_metrics: - scores = [score[metric] for score, lang in values] - langs = [lang for score, lang in values] - - # Calculate statistics for the current metric - min_index = np.argmin(scores) - max_index = np.argmax(scores) - - stats_dict[metric] = { - "mean": np.mean(scores), - "median": np.median(scores), - "std": np.std(scores), - "min": scores[min_index], - "min_lang": langs[min_index], - "max": scores[max_index], - "max_lang": langs[max_index], - } - - return stats_dict - - -def main(args): - save_model_path = args.model_path - if args.adapter_path: - save_model_path = args.adapter_path - save_str = ( - f"{save_model_path.replace('/','_')}_b{args.block_size}_s{args.stride}_u{args.threshold}{args.save_suffix}" - ) - if args.do_lowercase: - save_str += "_lc" - if args.do_remove_punct: - save_str += "_rmp" - - eval_data = torch.load(args.eval_data_path) - if args.valid_text_path is not None: - valid_data = load_dataset("parquet", data_files=args.valid_text_path, split="train") - else: - valid_data = None - - print("Loading model...") - # if model_path does not contain a model, take first subfolder - if not os.path.exists(os.path.join(args.model_path, "pytorch_model.bin")) and not os.path.exists( - os.path.join(args.model_path, "model.safetensors") - ): - model_path = os.path.join(args.model_path, os.listdir(args.model_path)[0], "en") - print("joined") - print(model_path) - else: - model_path = args.model_path - model = PyTorchWrapper(AutoModelForTokenClassification.from_pretrained(model_path).to(args.device)) - if args.adapter_path: - model_type = model.model.config.model_type - # adapters need xlm-roberta as model type. - model.model.config.model_type = "xlm-roberta" - adapters.init(model.model) - # reset model type (used later) - model.model.config.model_type = model_type - if "meta-clf" in args.adapter_path: - clf = model.model.classifier - model.model.classifier = torch.nn.Sequential(clf, torch.nn.Linear(clf.out_features, 1)) - - # first, logits for everything. - f, total_test_time = load_or_compute_logits(args, model, eval_data, valid_data, save_str) - - # now, compute the intrinsic scores. - results = {} - clfs = {} - # Initialize lists to store scores for each metric across all languages - u_scores, t_scores, punct_scores = [], [], [] - - for lang_code, dsets in tqdm(eval_data.items()): - if args.include_langs is not None and lang_code not in args.include_langs: - continue - - print(f"Predicting {lang_code}...") - results[lang_code] = {} - clfs[lang_code] = {} - - for dataset_name, dataset in dsets["sentence"].items(): - sentences = dataset["data"] - if args.do_strip: - sentences = [[sentence.lstrip("-").strip() for sentence in chunk] for chunk in sentences] - sentences = [ - [ - corrupt(sentence, do_lowercase=args.do_lowercase, do_remove_punct=args.do_remove_punct) - for sentence in chunk - ] - for chunk in sentences - ] - # check if f[lang_code][dataset_name] exists - if lang_code not in f or dataset_name not in f[lang_code]: - continue - - if "train_logits" in f[lang_code][dataset_name] and not args.skip_adaptation: - feature_indices = None - clf = train_mixture( - [lang_code], - f[lang_code][dataset_name]["train_logits"][:], - f[lang_code][dataset_name]["train_labels"][:], - features=feature_indices, - ) - if clf[0] is not None: - print(clf) - - score_t = defaultdict(list) - score_punct = defaultdict(list) - for i, chunk in tqdm(enumerate(sentences), total=len(sentences), disable=args.tqdm): - start, end = f[lang_code][dataset_name]["test_logit_lengths"][i] - single_score_t, single_score_punct, info = evaluate_mixture( - lang_code, - f[lang_code][dataset_name]["test_logits"][:][start:end], - list(chunk), - *clf, - ) - score_t["f1"].append(single_score_t) - for key in ["precision", "recall", "correct_pairwise"]: - score_t[key].append(info["info_newline"][key]) - score_punct["f1"].append(single_score_punct if single_score_punct is not None else 0.0) - for key in ["precision", "recall", "correct_pairwise"]: - score_punct[key].append( - info["info_transformed"][key] - if single_score_punct is not None - else info["info_newline"][key] - ) - - clfs[lang_code][dataset_name] = clf - - clf = list(copy.deepcopy(clf)) - clf[-1] = args.threshold - else: - clf = [None, None, None, args.threshold] - score_t = score_punct = None - - score_u = defaultdict(list) - for i, chunk in tqdm(enumerate(sentences), total=len(sentences), disable=args.tqdm): - start, end = f[lang_code][dataset_name]["test_logit_lengths"][i] - single_score_u, _, info = evaluate_mixture( - lang_code, - f[lang_code][dataset_name]["test_logits"][:][start:end], - list(chunk), - *clf, - ) - - score_u["f1"].append(single_score_u) - for key in ["precision", "recall", "correct_pairwise"]: - score_u[key].append(info["info_newline"][key]) - - score_u = {key: np.mean(value) for key, value in score_u.items()} - score_t = {key: np.mean(value) for key, value in score_t.items()} if score_t else None - score_punct = {key: np.mean(value) for key, value in score_punct.items()} if score_punct else None - - results[lang_code][dataset_name] = { - "u": score_u["f1"], - "t": score_t["f1"], - "punct": score_punct["f1"], - "u_precision": score_u["precision"], - "t_precision": score_t["precision"], - "punct_precision": score_punct["precision"], - "u_recall": score_u["recall"], - "t_recall": score_t["recall"], - "punct_recall": score_punct["recall"], - "u_acc": score_u["correct_pairwise"], - "t_acc": score_t["correct_pairwise"], - "punct_acc": score_punct["correct_pairwise"], - } - - # just for printing - score_t = score_t or 0.0 - score_punct = score_punct or 0.0 - - u_scores.append((score_u, lang_code)) - t_scores.append((score_t, lang_code)) - punct_scores.append((score_punct, lang_code)) - - print(f"{lang_code} {dataset_name} {score_u['f1']:.3f} {score_t['f1']:.3f} {score_punct['f1']:.3f}") - - # Compute statistics for each metric across all languages - results_avg = { - "u": compute_statistics(u_scores), - "t": compute_statistics(t_scores), - "punct": compute_statistics(punct_scores), - "include_langs": args.include_langs, - } - - # sio.dump( - # clfs, - # open( - # Constants.CACHE_DIR / "intrinsic_list" / f"{save_str}.skops", - # "wb", - # ), - # ) - json.dump( - results, - open( - Constants.CACHE_DIR / "intrinsic_list" / f"{save_str}.json", - "w", - ), - indent=4, - ) - - # Write results_avg to JSON - json.dump( - results_avg, - open( - Constants.CACHE_DIR / "intrinsic_list" / f"{save_str}_AVG.json", - "w", - ), - indent=4, - ) - os.remove(f.filename) - return results, results_avg, total_test_time - - -if __name__ == "__main__": - (args,) = HfArgumentParser([Args]).parse_args_into_dataclasses() - results, results_avg, total_test_time = main(args) - print(total_test_time) diff --git a/wtpsplit/evaluation/intrinsic_pairwise.py b/wtpsplit/evaluation/intrinsic_pairwise.py index 0b90b095..18845289 100644 --- a/wtpsplit/evaluation/intrinsic_pairwise.py +++ b/wtpsplit/evaluation/intrinsic_pairwise.py @@ -46,7 +46,7 @@ class Args: # } # } # } - eval_data_path: str = "data/all_data_11_05-all.pth" + eval_data_path: str = "data/all_data.pth" valid_text_path: str = None # "data/sentence/valid.parquet" device: str = "cpu" block_size: int = 512 @@ -56,8 +56,6 @@ class Args: max_n_train_sentences: int = 1_000 max_n_test_sentences: int = sys.maxsize save_suffix: str = "" - do_lowercase: bool = False - do_remove_punct: bool = False skip_adaptation: bool = False keep_logits: bool = True skip_corrupted: bool = True @@ -66,47 +64,13 @@ class Args: clf_from_scratch: bool = False # k_mer-specific args + # k=2 means pairwise, k=3 triplets, ... k: int = 2 max_n_samples: int = sys.maxsize sample_pct: float = 0.5 min_k_mer_length: int = 0 - adjust_threshold: bool = False - # threshold - threshold_increase_type: str = "linear" - threshold_min_length: int = 0 - threshold_max_length: int = 256 - threshold_max: float = 0.1 -def calculate_threshold(sequence_length, max_length, min_length, max_threshold, default_threshold=0.01): - """ - Calculates the threshold based on the sequence length with various increase types - ('linear', 'logarithmic', 'quadratic', 'exponential', 'sigmoidal') from default_threshold - to max_threshold as sequence length decreases from max_length to min_length. - - :param sequence_length: The current sequence length - :param max_length: The sequence length at which the default_threshold is applied - :param min_length: The sequence length at which the max_threshold is applied - :param max_threshold: The maximum threshold value - :param increase_type: Type of increase - :param default_threshold: The default threshold value (minimum threshold) - :return: The calculated threshold for the given sequence length - """ - - # Normalize sequence length to a range [0, 1] - if max_length == min_length: - # Ensure no division by zero - normalized_length = 0 - else: - normalized_length = (sequence_length - max_length) / (min_length - max_length) - - threshold = normalized_length * (max_threshold - default_threshold) + default_threshold - - # Ensure the threshold does not exceed the bounds - threshold = min(max(threshold, default_threshold), max_threshold) - - return threshold - def process_logits_k_mers(pairs, model, lang_code, block_size, batch_size, verbose=True) -> List[np.ndarray]: logits_list = [] @@ -144,55 +108,9 @@ def process_logits_k_mers(pairs, model, lang_code, block_size, batch_size, verbo return logits_list, n_tokens_list -def generate_pairs( - sentences: List[str], - do_lowercase: bool, - do_remove_punct: bool, - sample_pct: float = 1, - max_n_samples: int = sys.maxsize, - min_k_mer_length: int = 0, -) -> List[Tuple[str, str]]: - """Generate sentence pairs from a list of sentences. - - Args: - sentences (List[str]): Input list of sentences. - sample_pct (float): Percentage of pairs to sample. - max_n_samples (int): Maximum number of pairs to sample. - min_k_mer_length (int): Minimum length of a sentence pair. - do_lowercase (bool): Whether to lowercase the sentences. - do_remove_punct (bool): Whether to remove punctuation from the sentences. - - Returns: - List[Tuple[str, str]]: List of sentence pairs. - """ - random.seed(42) - n_pairs = len(sentences) // 2 - sample_size = min(round(n_pairs * sample_pct), max_n_samples) - - # If we need to sample a subset of all possible pairs, do so efficiently - if sample_size < n_pairs: - sampled_indices = set(random.sample(range(n_pairs), sample_size)) - all_pairs = [ - (sentences[2 * i], sentences[2 * i + 1]) - for i in sampled_indices - if len(sentences[2 * i]) + len(sentences[2 * i + 1]) > min_k_mer_length - ] - else: - # Generate all pairs that meet the min_k_mer_length criterion - all_pairs = [ - (sentences[i], sentences[i + 1]) - for i in range(0, len(sentences) - 1, 2) - if len(sentences[i]) + len(sentences[i + 1]) > min_k_mer_length - ] - - return all_pairs - - def generate_k_mers( sentences: List[str], k: int, - do_lowercase: bool, - do_remove_punct: bool, sample_pct: float = 1, max_n_samples: int = sys.maxsize, min_k_mer_length: int = 0, @@ -205,8 +123,6 @@ def generate_k_mers( sample_pct (float): Percentage of k-mers to sample. max_n_samples (int): Maximum number of k-mers to sample. min_k_mer_length (int): Minimum length of a k-mer. - do_lowercase (bool): Whether to lowercase the sentences. - do_remove_punct (bool): Whether to remove punctuation from the sentences. Returns: List[Tuple[str, ...]]: List of k-mers. @@ -215,7 +131,7 @@ def generate_k_mers( n_k_mers = len(sentences) // k sample_size = min(round(n_k_mers * sample_pct), max_n_samples) - # Efficient sampling of a subset of all possible k-mers if needed + # sample if needed if sample_size < n_k_mers: sampled_indices = set(random.sample(range(n_k_mers), sample_size)) all_k_mers = [ @@ -224,7 +140,7 @@ def generate_k_mers( if sum(len(sentences[i * k + j]) for j in range(k)) > min_k_mer_length ] else: - # Generate all k-mers that meet the min_k_mer_length criterion + # all all_k_mers = [ tuple(sentences[i + j] for j in range(k)) for i in range(0, len(sentences) - k + 1, k) @@ -240,7 +156,7 @@ def load_or_compute_logits(args, model, eval_data, valid_data=None, save_str: st if not os.path.exists(Constants.CACHE_DIR / "intrinsic_pairwise"): os.makedirs(Constants.CACHE_DIR / "intrinsic_pairwise") - total_test_time = 0 # Initialize total test processing time + total_test_time = 0 start_time = time.time() with h5py.File(logits_path, "a") as f, torch.no_grad(): @@ -264,23 +180,13 @@ def load_or_compute_logits(args, model, eval_data, valid_data=None, save_str: st if args.adapter_path: if args.clf_from_scratch: model.model.classifier = torch.nn.Linear(model.model.classifier.in_features, 1) - # elif model.model.classifier.out_features == 2: elif args.model_path == "xlm-roberta-base" or args.model_path == "xlm-roberta-large": # we train XLM-R using our wrapper, needs to be adapted for adapters to be loaded model.model.classifier = torch.nn.Linear( model.model.classifier.in_features, - 1, # FIXME: hardcoded? + 1, ) model.model.__class__.__name__ = "SubwordXLMForTokenClassification" - # if ( - # any(code in lang_code for code in ["ceb", "jv", "mn", "yo"]) - # and "ted2020" not in dataset_name - # ): - # # no ersatz for these either. - # dataset_load_name = "nllb" - # if "corrupted" in dataset_load_name: - # dataset_load_name += "-corrupted" - # else: dataset_load_name = dataset_name model.model.load_adapter( args.adapter_path + "/" + dataset_load_name + "/" + lang_code, @@ -288,14 +194,6 @@ def load_or_compute_logits(args, model, eval_data, valid_data=None, save_str: st with_head=True, load_as="text", ) - if hasattr(model.model.config, "unfreeze_ln"): - if model.model.config.unfreeze_ln: - ln_dict = torch.load( - args.adapter_path + "/" + dataset_name + "/" + lang_code + "/ln_dict.pth" - ) - for n, p in model.backbone.named_parameters(): - if "LayerNorm" in n: - p.data = ln_dict[n].data except Exception as e: print(f"Error loading adapter for {dataset_name} in {lang_code}: {e}") continue @@ -314,14 +212,12 @@ def load_or_compute_logits(args, model, eval_data, valid_data=None, save_str: st all_pairs_test = generate_k_mers( test_sentences, k=args.k, - do_lowercase=args.do_lowercase, - do_remove_punct=args.do_remove_punct, sample_pct=args.sample_pct, max_n_samples=args.max_n_samples, min_k_mer_length=args.min_k_mer_length, ) - start_time = time.time() # Start timing for test logits processing + start_time = time.time() test_logits, test_n_logits = process_logits_k_mers( all_pairs_test, model, @@ -339,7 +235,7 @@ def load_or_compute_logits(args, model, eval_data, valid_data=None, save_str: st test_logit_lengths.append((all_logit_lengths[i], all_logit_lengths[i + 1] - 1)) test_logits = np.concatenate(test_logits) - total_test_time += end_time - start_time # Accumulate test processing time + total_test_time += end_time - start_time # get_labels returns 2nd label at end of seq, which we do not want. # label is at position -2 --> remove and add back 0 to end of sequence @@ -362,8 +258,6 @@ def load_or_compute_logits(args, model, eval_data, valid_data=None, save_str: st all_pairs_train = generate_k_mers( train_sentences, k=args.k, - do_lowercase=args.do_lowercase, - do_remove_punct=args.do_remove_punct, sample_pct=args.sample_pct, max_n_samples=args.max_n_samples, min_k_mer_length=args.min_k_mer_length, @@ -394,11 +288,6 @@ def main(args): save_model_path = args.adapter_path save_str = f"{save_model_path.replace('/','_')}_b{args.block_size}_k{args.k}{args.save_suffix}" - if args.do_lowercase: - save_str += "_lc" - if args.do_remove_punct: - save_str += "_rmp" - print(save_str) eval_data = torch.load(args.eval_data_path) if "canine" in args.model_path and not "no-adapters" in args.model_path: @@ -417,25 +306,17 @@ def main(args): adapters.init(model.model) # reset model type (used later) model.model.config.model_type = model_type - if "meta-clf" in args.adapter_path: - clf = model.model.classifier - model.model.classifier = torch.nn.Sequential(clf, torch.nn.Linear(clf.out_features, 1)) # first, logits for everything. f, total_test_time = load_or_compute_logits(args, model, eval_data, valid_data, save_str) save_str += f"_u{args.threshold}" - if args.adjust_threshold: - save_str += ( - f"_adj_{args.threshold_increase_type}_{args.threshold_min_length}_{args.threshold_max_length}" - f"_{args.threshold}_{args.threshold_max}" - ) # now, compute the intrinsic scores. results = {} clfs = {} if args.return_indices: indices = {} - # Initialize lists to store scores for each metric across all languages + # lists to store scores for each metric across *all* languages u_scores, t_scores, punct_scores = [], [], [] u_accs, t_accs, punct_accs = [], [], [] thresholds_t, thresholds_adj = [], [] @@ -459,8 +340,6 @@ def main(args): sent_k_mers = generate_k_mers( sentences, k=args.k, - do_lowercase=args.do_lowercase, - do_remove_punct=args.do_remove_punct, sample_pct=args.sample_pct, max_n_samples=args.max_n_samples, min_k_mer_length=args.min_k_mer_length, @@ -485,7 +364,7 @@ def main(args): score_t = [] score_punct = [] - # acc: average of correct 100% pairwise segmentation + # acc: average of correct 100% pairwise (or: k-mer) segmentation acc_t = [] acc_punct = [] @@ -523,19 +402,7 @@ def main(args): length = [] for i, k_mer in enumerate(sent_k_mers): start, end = f[lang_code][dataset_name]["test_logit_lengths"][i] - if args.adjust_threshold: - seq_len = f[lang_code][dataset_name]["test_n_logits"][i] - threshold_adjusted = calculate_threshold( - sequence_length=seq_len, - max_length=args.threshold_max_length, - min_length=args.threshold_min_length, - max_threshold=args.threshold_max, - default_threshold=args.threshold, - ) - clf[-1] = threshold_adjusted - thresholds.append(threshold_adjusted) - else: - thresholds.append(args.threshold) + thresholds.append(args.threshold) single_score_u, _, info, cur_u_indices, _ = evaluate_mixture( lang_code, f[lang_code][dataset_name]["test_logits"][:][start:end], @@ -560,7 +427,6 @@ def main(args): acc_punct = np.mean(acc_punct) if score_punct else None threshold = np.mean(thresholds) - results[lang_code][dataset_name] = { "u": score_u, "t": score_t, diff --git a/wtpsplit/evaluation/intrinsic_ted.py b/wtpsplit/evaluation/intrinsic_ted.py index f8fc872b..8ada2671 100644 --- a/wtpsplit/evaluation/intrinsic_ted.py +++ b/wtpsplit/evaluation/intrinsic_ted.py @@ -8,6 +8,7 @@ import h5py import numpy as np +import spacy_alignments as tokenizations import torch from datasets import load_dataset from tqdm.auto import tqdm @@ -16,7 +17,7 @@ import adapters import wtpsplit.models # noqa: F401 from wtpsplit.evaluation import get_labels, train_mixture -from wtpsplit.evaluation.evaluate_sepp_nlg_2021_subtask1 import evaluate_subtask1 +from wtpsplit.evaluation.evaluate_sepp_nlg_subtask1 import evaluate_subtask1 from wtpsplit.evaluation.intrinsic import process_logits from wtpsplit.extract import PyTorchWrapper, extract from wtpsplit.utils import Constants, sigmoid @@ -24,8 +25,6 @@ logger = logging.getLogger() logger.setLevel(logging.INFO) -import spacy_alignments as tokenizations - def get_token_labels(a, b, a_labels): a2b, b2a = tokenizations.get_alignments(a, b) @@ -75,7 +74,6 @@ class Args: batch_size: int = 32 include_langs: List[str] = None include_splits: List[str] = None - custom_language_list: str = None threshold: float = 0.025 max_n_train_sentences: int = 100 max_n_test_sentences: int = sys.maxsize @@ -86,11 +84,11 @@ class Args: def process_logits_and_tokens(text, model, lang_code, args): - # variation of process_logits for word-based evals, returning tokens as well. + # variation of process_logits used in intrinsic.py for word-based evals, returning tokens as well. if isinstance(text, list): logits = [] tokens = [] - for short_seq in tqdm(text, desc="Short sequences", disable=False): + for short_seq in tqdm(text, desc="Evaluating...", disable=False): current_logits, current_offsets_mapping, tokenizer = extract( [short_seq], model, @@ -120,12 +118,7 @@ def load_or_compute_logits(args, model, eval_data, valid_data=None, save_str: st if not os.path.exists(Constants.CACHE_DIR / "ted2020"): os.makedirs(Constants.CACHE_DIR / "ted2020") - if args.custom_language_list is not None: - with open(args.custom_language_list, "r") as f: - # file is a csv: l1,l2,... - use_langs = f.read().strip().split(",") - else: - use_langs = eval_data.keys() + use_langs = eval_data.keys() total_test_time = 0 # Initialize total test processing time @@ -138,7 +131,6 @@ def load_or_compute_logits(args, model, eval_data, valid_data=None, save_str: st if args.include_langs is not None and lang_code not in args.include_langs: continue - # print(f"Processing {lang_code}...") if lang_code not in f: lang_group = f.create_group(lang_code) else: @@ -152,6 +144,7 @@ def load_or_compute_logits(args, model, eval_data, valid_data=None, save_str: st if args.adapter_path: if args.clf_from_scratch: model.model.classifier = torch.nn.Linear(model.model.classifier.in_features, 1) + # we trained adapters on "train" split but uniformly save as "surprise_test" model.model.load_adapter( args.adapter_path + "/" + "surprise_test" + "/" + lang_code, set_active=True, @@ -164,14 +157,12 @@ def load_or_compute_logits(args, model, eval_data, valid_data=None, save_str: st model_path = os.path.join(args.model_path, dataset_name, "en") if not os.path.exists(model_path): model_path = args.model_path - # print(model_path) model = PyTorchWrapper( AutoModelForTokenClassification.from_pretrained(model_path).to(args.device) ) except Exception as e: print(f"Error loading adapter for {dataset_name} in {lang_code}: {e}") continue - # print(dataset_name) if dataset_name not in lang_group: dset_group = lang_group.create_group(dataset_name) else: @@ -187,10 +178,10 @@ def load_or_compute_logits(args, model, eval_data, valid_data=None, save_str: st else: raise NotImplementedError - start_time = time.time() # Start timing for test logits processing + start_time = time.time() test_logits, test_tokens = process_logits_and_tokens(test_text, model, lang_code, args) - end_time = time.time() # End timing for test logits processing - total_test_time += end_time - start_time # Accumulate test processing time + end_time = time.time() + total_test_time += end_time - start_time if isinstance(test_sentences[0], list): test_logit_lengths = [] # store start and end indices for each pair, used later to slice the logits @@ -237,7 +228,7 @@ def compute_statistics(values): if not values: # Check for empty values list return {"mean": None, "median": None, "std": None, "min": None, "min_lang": None, "max": None, "max_lang": None} - scores, langs = zip(*values) # Unpack scores and languages + scores, langs = zip(*values) min_index = np.argmin(scores) max_index = np.argmax(scores) return { @@ -283,9 +274,6 @@ def main(args): adapters.init(model.model) # reset model type (used later) model.model.config.model_type = model_type - if "meta-clf" in args.adapter_path: - clf = model.model.classifier - model.model.classifier = torch.nn.Sequential(clf, torch.nn.Linear(clf.out_features, 1)) save_str += f"{args.save_suffix}" if args.max_n_test_sentences < sys.maxsize: @@ -362,12 +350,12 @@ def main(args): else: t_preds = None if not args.skip_adaptation and not args.skip_punct: - # TODO: punct, T on tokens, too? punct_preds = clfs[lang_code][0].predict_proba(current_logits)[:, 1] > clf[2] else: punct_preds = None # write to tsv as per the challenge reqs + # can then be evaluated via evaluate_sepp_nlg_subtask1.py for supervision, preds in zip(["u", "t", "punct"], [u_preds, t_preds, punct_preds]): if preds is None: continue diff --git a/wtpsplit/evaluation/kmer_optuna.py b/wtpsplit/evaluation/kmer_optuna.py deleted file mode 100644 index 17159c74..00000000 --- a/wtpsplit/evaluation/kmer_optuna.py +++ /dev/null @@ -1,245 +0,0 @@ -import copy -import logging -import sys -from dataclasses import dataclass -from datetime import datetime -from multiprocessing import Process -from typing import List - -import numpy as np -import optuna -import torch -from tqdm import tqdm -from transformers import AutoModelForTokenClassification, HfArgumentParser - -import wtpsplit.models # noqa: F401 -from wtpsplit.evaluation import evaluate_mixture -from wtpsplit.evaluation.intrinsic import compute_statistics -from wtpsplit.evaluation.intrinsic_pairwise import calculate_threshold, generate_k_mers, load_or_compute_logits -from wtpsplit.extract import PyTorchWrapper - -logger = logging.getLogger() -logger.setLevel(logging.INFO) - - -@dataclass -class Args: - model_path: str - adapter_path: str = None - # eval data in the format: - # { - # "": { - # "sentence": { - # "": { - # "meta": { - # "train_data": ["train sentence 1", "train sentence 2"] - # }, - # "data": ["test sentence 1", "test sentence 2"] - # } - # } - # } - # } - eval_data_path: str = "data/eval.pth" - valid_text_path: str = None # "data/sentence/valid.parquet" - device: str = "cpu" - block_size: int = 512 - batch_size: int = 128 - include_langs: List[str] = None - threshold: float = 0.01 - max_n_train_sentences: int = 10_000 - save_suffix: str = "" - do_lowercase: bool = False - do_remove_punct: bool = False - skip_adaptation: bool = True - keep_logits: bool = True - - # k_mer-specific args - min_k: int = 2 - max_k: int = 4 - max_n_samples: int = sys.maxsize - sample_pct: float = 0.5 - min_k_mer_length: int = 0 - adjust_threshold: bool = True - # threshold - # threshold_increase_type: str = "linear" - threshold_min_length: int = 0 - threshold_max_length: int = 256 - threshold_max: float = 0.1 - # optuna args - n_trials: int = 16 - n_jobs: int = 32 - - -def objective(trial: optuna.Trial, args: Args, eval_data: dict, f_list) -> float: - # Suggest values for the hyperparameters we want to optimize - args.threshold_min_length = trial.suggest_int("threshold_min_length", 0, 256) - args.threshold_max_length = trial.suggest_int("threshold_max_length", 0, 256) - args.threshold_max = trial.suggest_float("threshold_max", 0.00, 0.5) - - # Execute the main function and retrieve results - all_results = [] - all_results_avg = [] - all_mean_u_acc = [] - for i, k in enumerate(range(args.min_k, args.max_k + 1)): - args.k = k - f = f_list[i] - results, results_avg = main(args, eval_data, f) - all_results.append(results) - all_results_avg.append(results_avg) - all_mean_u_acc.append(results_avg["u_acc"]["mean"]) - - # Store results in the trial's user attributes - trial.set_user_attr(f"{k}_detailed_results", results) - trial.set_user_attr(f"{k}_average_results", results_avg) - - # Objective is to maximize the average U accuracy - # return list as tuple - return tuple(all_mean_u_acc) - - -def load_data_and_model(args): - logger.info("Loading model...") - model = PyTorchWrapper(AutoModelForTokenClassification.from_pretrained(args.model_path).to(args.device)) - - logger.info("Loading evaluation data...") - eval_data = torch.load(args.eval_data_path) - - # Possibly other initialization code here - return model, eval_data - - -def main(args, eval_data, f): - # now, compute the intrinsic scores. - results = {} - clfs = {} - # Initialize lists to store scores for each metric across all languages - u_scores = [] - u_accs = [] - thresholds_adj = [] - - for lang_code, dsets in tqdm(eval_data.items(), desc="Languages", total=len(eval_data), disable=True): - if args.include_langs is not None and lang_code not in args.include_langs: - continue - - results[lang_code] = {} - clfs[lang_code] = {} - - for dataset_name, dataset in dsets["sentence"].items(): - sentences = dataset["data"] - sent_k_mers = generate_k_mers( - sentences, - k=args.k, - do_lowercase=args.do_lowercase, - do_remove_punct=args.do_remove_punct, - sample_pct=args.sample_pct, - max_n_samples=args.max_n_samples, - min_k_mer_length=args.min_k_mer_length, - ) - - clf = [None, None, None, args.threshold] - - score_u = [] - acc_u = [] - thresholds = [] - for i, k_mer in enumerate(sent_k_mers): - start, end = f[lang_code][dataset_name]["test_logit_lengths"][i] - if args.adjust_threshold: - seq_len = f[lang_code][dataset_name]["test_n_logits"][i] - threshold_adjusted = calculate_threshold( - sequence_length=seq_len, - max_length=args.threshold_max_length, - min_length=args.threshold_min_length, - max_threshold=args.threshold_max, - default_threshold=args.threshold, - ) - clf[-1] = threshold_adjusted - thresholds.append(threshold_adjusted) - else: - raise NotImplementedError("Optuna runs are to select the optimal threshold config!") - single_score_u, _, info = evaluate_mixture( - lang_code, - f[lang_code][dataset_name]["test_logits"][:][start:end], - list(k_mer), - *clf, - ) - score_u.append(single_score_u) - acc_u.append(info["info_newline"]["correct_pairwise"]) - - score_u = np.mean(score_u) - acc_u = np.mean(acc_u) - threshold = np.mean(thresholds) - - results[lang_code][dataset_name] = { - "u": score_u, - "u_acc": acc_u, - "threshold_adj": threshold, - } - - u_scores.append((score_u, lang_code)) - u_accs.append((acc_u, lang_code)) - thresholds_adj.append((threshold, lang_code)) - - # Compute statistics for each metric across all languages - results_avg = { - "u": compute_statistics(u_scores), - "u_acc": compute_statistics(u_accs), - "threshold_adj": compute_statistics(thresholds_adj), - "include_langs": args.include_langs, - } - - return results, results_avg - - -def run_optimization(storage_url, study_name, args, eval_data, f_list): - """ - Function to run Optuna optimization in a separate process. - """ - study = optuna.load_study(study_name=study_name, storage=storage_url) - study.optimize( - lambda trial: objective(trial, copy.deepcopy(args), eval_data, f_list), - n_trials=args.n_trials, - show_progress_bar=True, - ) - - print(f"Completed optimization for study: {study_name}") - - -if __name__ == "__main__": - (args,) = HfArgumentParser([Args]).parse_args_into_dataclasses() - - model, eval_data = load_data_and_model(args) - - # first, logits for everything. - f_list = [] - for k in range(args.min_k, args.max_k + 1): - args.k = k - save_str = f"{args.model_path.replace('/','_')}_b{args.block_size}_u{args.threshold}_k_{k}{args.save_suffix}" - print(save_str) - out, _ = load_or_compute_logits(args, model, eval_data, None, save_str) - f_list.append(out) - - # replace k_[max_k] with k_mink-max_k in save_str - save_str = save_str.replace(f"k_{args.max_k}", f"k_{args.min_k}-{args.max_k}") - - # storage using SQLite URL - storage_url = "mysql://root@localhost/example" - study_name = f"{save_str}_{datetime.now().strftime('%Y%m%d%H%M%S')}" - - study = optuna.create_study( - study_name=study_name, - storage=storage_url, - directions=["maximize"] * (args.max_k - args.min_k + 1), - load_if_exists=True, - ) - - # Create multiple studies and launch them in separate processes - processes = [] - for i in range(args.n_jobs): - proc = Process(target=run_optimization, args=(storage_url, study_name, args, eval_data, f_list)) - processes.append(proc) - proc.start() - - # Wait for all processes to complete - for proc in processes: - proc.join() - diff --git a/wtpsplit/evaluation/law_bert.py b/wtpsplit/evaluation/legal_baselines.py similarity index 95% rename from wtpsplit/evaluation/law_bert.py rename to wtpsplit/evaluation/legal_baselines.py index a3efc9c9..d1d9382d 100644 --- a/wtpsplit/evaluation/law_bert.py +++ b/wtpsplit/evaluation/legal_baselines.py @@ -58,15 +58,14 @@ def get_law_preds(texts, model, model_name, args) -> List[List[int]]: def load_or_compute_logits(args, eval_data, save_str: str = None): logits_path = Constants.CACHE_DIR / "law_bert" / f"{save_str}.h5" - base_name = "rcds/distilbert-SBD" + base_name = "rcds/distilbert-SBD" # take from HF hub if not os.path.exists(Constants.CACHE_DIR / "law_bert"): os.makedirs(Constants.CACHE_DIR / "law_bert") use_langs = ["fr", "es", "it", "en", "de", "pt"] - # law eval data is only one with _ - total_test_time = 0 # Initialize total test processing time + total_test_time = 0 with h5py.File(logits_path, "a") as f, torch.no_grad(): for lang_code in tqdm(use_langs, desc="Languages"): @@ -79,13 +78,14 @@ def load_or_compute_logits(args, eval_data, save_str: str = None): # eval data for dataset_name, dataset in tqdm(eval_data[lang_code]["sentence"].items(), desc=lang_code): - if not "legal" in dataset_name: + if "legal" not in dataset_name: continue if "legal" in dataset_name and not ("laws" in dataset_name or "judgements" in dataset_name): continue if "social-media" in dataset_name: continue current_name = base_name + # map to correct model if args.lang_support == "multi": current_name += "-fr-es-it-en-de" elif args.lang_support == "mono": @@ -129,10 +129,10 @@ def load_or_compute_logits(args, eval_data, save_str: str = None): else: raise NotImplementedError - start_time = time.time() # Start timing for test logits processing + start_time = time.time() test_logits = get_law_preds(test_text, model, current_name, args) - end_time = time.time() # End timing for test logits processing - total_test_time += end_time - start_time # Accumulate test processing time + end_time = time.time() + total_test_time += end_time - start_time if isinstance(test_sentences[0], list): test_logit_lengths = [] # store start and end indices for each pair, used later to slice the logits @@ -182,12 +182,12 @@ def main(args): # first, logits for everything. f, total_test_time = load_or_compute_logits(args, eval_data, save_str) - # now, compute the law_bert scores. + # now, compute scores. results = {} clfs = {} if args.return_indices: indices = {} - # Initialize lists to store scores for each metric across all languages + u_scores = [] for lang_code, dsets in tqdm(eval_data.items()): diff --git a/wtpsplit/evaluation/time_intrinsic.py b/wtpsplit/evaluation/time_intrinsic.py deleted file mode 100644 index 0c8a0b89..00000000 --- a/wtpsplit/evaluation/time_intrinsic.py +++ /dev/null @@ -1,78 +0,0 @@ -import argparse -import sys -import pandas as pd -import math -from multiprocessing import Process, Queue -import intrinsic - - -def run_intrinsic_with_stride(stride, args, results_queue): - modified_args = argparse.Namespace(**vars(args)) - modified_args.stride = stride - results, results_avg, total_test_time = intrinsic.main(modified_args) # Capture results - results_queue.put((stride, results, results_avg, total_test_time)) - - -def benchmark_strides(low_stride, high_stride, args): - stride_values = [2**i for i in range(int(math.log2(low_stride)), int(math.log2(high_stride)) + 1)] - results_data = [] - - for stride in stride_values: - results_queue = Queue() - p = Process(target=run_intrinsic_with_stride, args=(stride, args, results_queue)) - p.start() - p.join() - - # intrinsic.main() returns a tuple of (results, results_avg, total_test_timee) - stride, stride_results, stride_results_avg, total_test_time = results_queue.get() - - results_data.append( - { - "stride": stride, - "block_size": args.block_size, - "batch_size": args.batch_size, - "execution_time": total_test_time, - "results": stride_results, - "results_avg": stride_results_avg, - "threshold": args.threshold, - "include_langs": args.include_langs, - "max_n_train_sentences": args.max_n_train_sentences, - } - ) - print(results_data) - - return pd.DataFrame(results_data) - - -if __name__ == "__main__": - # Extract low_stride and high_stride values - stride_args = ["--low_stride", "--high_stride"] - strides = {} - - # Iterate over stride_args to extract and remove them from sys.argv - for stride_arg in stride_args: - if stride_arg in sys.argv: - index = sys.argv.index(stride_arg) - try: - strides[stride_arg] = int(sys.argv[index + 1]) - # Remove the stride argument and its value - del sys.argv[index : index + 2] - except (IndexError, ValueError): - raise ValueError(f"Invalid or missing value for {stride_arg}.") - - if "--low_stride" not in strides or "--high_stride" not in strides: - raise ValueError("Both --low_stride and --high_stride must be provided.") - - low_stride = strides["--low_stride"] - high_stride = strides["--high_stride"] - - # Remaining arguments are passed to intrinsic.Args - args = intrinsic.HfArgumentParser(intrinsic.Args).parse_args_into_dataclasses()[0] - - df_results = benchmark_strides(low_stride, high_stride, args) - print(df_results) - # Optionally save df_results to a file - # to csv - df_results.to_csv( - f"timing_results_{args.model_path.replace('/','__')}_batch{args.batch_size}_b{args.block_size}+s{args.stride}_n{args.max_n_train_sentences}_u{args.threshold}_AVG.csv" - ) diff --git a/wtpsplit/tokenization_utils.py b/wtpsplit/tokenization_utils.py deleted file mode 100644 index cab53dcf..00000000 --- a/wtpsplit/tokenization_utils.py +++ /dev/null @@ -1,104 +0,0 @@ -import numpy as np -from wtpsplit.utils import Constants - - -def tokenize_and_get_labels(sentences, tokenizer, separator, lang_code): - joined_sentence = "" - sentence_start_positions = [] - current_position = 0 - - for sentence in sentences: - if joined_sentence: - joined_sentence += separator - current_position += len(separator) - start_position = current_position - joined_sentence += sentence - current_position += len(sentence) - sentence_start_positions.append(start_position + len(sentence) - 1) - - tokenized_input = tokenizer( - joined_sentence, - return_offsets_mapping=True, - add_special_tokens=False, - truncation=False, - verbose=False, - padding=False, - ) - - tokens = tokenized_input.tokens() - offsets = tokenized_input["offset_mapping"] - sentence_ending_labels = [0] * len(tokens) - - sentence_index = 0 - for i, (token_start, token_end) in enumerate(offsets): - if token_start > sentence_start_positions[sentence_index]: - sentence_ending_labels[i - 1] = 1 - sentence_index += 1 - # if any(start < token_end for start in sentence_start_positions if start >= token_start): - # print(tokens[i - 2 : i + 3]) - - # assert sum(sentence_ending_labels) == len(sentence_start_positions) - - return tokenized_input["input_ids"], sentence_ending_labels - - -def pack_sentences(examples, block_size, tokenizer, underflow_size=0, min_sentence_length=10): - all_input_blocks = [] - all_label_blocks = [] - all_langs = [] - - # group by langs first - lang_grouped_examples = {lang: [] for lang in set(examples["lang"])} - for sentence, lang in zip(examples["text"], examples["lang"]): - lang_grouped_examples[lang].append(sentence.strip("\n")) - - for current_lang, sentences in lang_grouped_examples.items(): - separator = Constants.SEPARATORS.get(current_lang, " ") - token_count, one_block_sentences = 0, [] - - # tokenization mapping gets problematic in such instances - sentences = [sentence.replace("\ufffd", "").strip() for sentence in sentences] - sentences = [sentence for sentence in sentences if len(sentence) > min_sentence_length] - if not sentences: - continue - - # batch tokenize sentences - tokenized_sentences = tokenizer(sentences, add_special_tokens=False, verbose=False, padding=False) - input_ids_list = tokenized_sentences["input_ids"] - - for sentence, input_ids in zip(sentences, input_ids_list): - if not sentence or sentence.isnumeric(): - continue - num_sentence_tokens = len(input_ids) - - # check if block limit is exceeded - if token_count > block_size - underflow_size: - # limit exceeded, process the current block - if one_block_sentences: - input_ids, labels = tokenize_and_get_labels(one_block_sentences, tokenizer, separator, current_lang) - all_input_blocks.append(input_ids) - all_label_blocks.append(labels) - all_langs.append(current_lang) - # reset - token_count, one_block_sentences = 0, [] - - # add sentence to block - one_block_sentences.append(sentence) - token_count += num_sentence_tokens - - # ensure last batch of sentences is processed - if one_block_sentences: - input_ids, labels = tokenize_and_get_labels(one_block_sentences, tokenizer, separator, current_lang) - all_input_blocks.append(input_ids) - all_label_blocks.append(labels) - all_langs.append(current_lang) - - # only return label indices, ie == 1 --> save memory - all_label_blocks = [[i for i, label in enumerate(labels) if label == 1] for labels in all_label_blocks] - - # TODO: in addition, truncate blocks here already? (storage reasons) - return { - "input_ids": all_input_blocks, - "labels": all_label_blocks, - "lang": all_langs, - } diff --git a/wtpsplit/train/adapter_utils.py b/wtpsplit/train/adapter_utils.py deleted file mode 100644 index 2f2e5d0b..00000000 --- a/wtpsplit/train/adapter_utils.py +++ /dev/null @@ -1,117 +0,0 @@ -from transformers import TrainingArguments -from transformers.training_args import ParallelMode -from transformers.utils import ( - is_sagemaker_dp_enabled, - is_sagemaker_mp_enabled, - is_torch_available, - is_torch_tpu_available, - requires_backends, -) -import numbers -import os - - -from transformers.utils import logging -from transformers.integrations import ( - rewrite_logs, - WandbCallback, - AzureMLCallback, - CometCallback, - MLflowCallback, - NeptuneCallback, - TensorBoardCallback, - CodeCarbonCallback, - ClearMLCallback, - DagsHubCallback, -) - -logger = logging.get_logger(__name__) -if is_torch_available(): - import torch - import torch.distributed as dist - -if is_sagemaker_mp_enabled(): - import smp.distributed.modelparallel.torch as smp - - smp.init() - - -class ParallelTPUAdapterTrainingArguments(TrainingArguments): - """ - Subclass of `TrainingArguments`, specific to training on TPU VMs in parallel using different data. - (Different optimization on different TPU cores, different data on different TPU cores, etc.) - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - @property - def world_size(self): - """ - The number of processes used in parallel. - """ - requires_backends(self, ["torch"]) - - if is_torch_tpu_available(): - # MODIFIED: otherwise, Trainer only logs on main (0) process, and DataLoader is of distributed type - return 1 - elif is_sagemaker_mp_enabled(): - return smp.dp_size() if not smp.state.cfg.prescaled_batch else smp.rdp_size() - elif is_sagemaker_dp_enabled(): - return dist.get_world_size() - elif self.parallel_mode == ParallelMode.DISTRIBUTED: - return torch.distributed.get_world_size() - return 1 - - -class ParallelTPUWandbCallback(WandbCallback): - """ - A [`TrainerCallback`] that logs metrics, media, model checkpoints to [Weight and Biases](https://www.wandb.com/). - """ - - def __init__(self): - super().__init__() - - def on_log(self, args, state, control, model=None, logs=None, **kwargs): - if self._wandb is None: - return - if not self._initialized: - self.setup(args, state, model) - # MODIFIED: log on all processes - # if state.is_world_process_zero: - logs = rewrite_logs(logs) - self._wandb.log({**logs, "train/global_step": state.global_step}) - - def on_save(self, args, state, control, **kwargs): - # MODIFIED: save on all - if self._log_model == "checkpoint" and self._initialized: - checkpoint_metadata = { - k: v - for k, v in dict(self._wandb.summary).items() - if isinstance(v, numbers.Number) and not k.startswith("_") - } - - ckpt_dir = f"checkpoint-{state.global_step}" - artifact_path = os.path.join(args.output_dir, ckpt_dir) - logger.info(f"Logging checkpoint artifacts in {ckpt_dir}. ...") - checkpoint_name = ( - f"checkpoint-{self._wandb.run.id}" - if (args.run_name is None or args.run_name == args.output_dir) - else f"checkpoint-{self._wandb.run.name}" - ) - artifact = self._wandb.Artifact(name=checkpoint_name, type="model", metadata=checkpoint_metadata) - artifact.add_dir(artifact_path) - self._wandb.log_artifact(artifact, aliases=[f"checkpoint-{state.global_step}"]) - - -INTEGRATION_TO_CALLBACK = { - "azure_ml": AzureMLCallback, - "comet_ml": CometCallback, - "mlflow": MLflowCallback, - "neptune": NeptuneCallback, - "tensorboard": TensorBoardCallback, - "wandb": ParallelTPUWandbCallback, - "codecarbon": CodeCarbonCallback, - "clearml": ClearMLCallback, - "dagshub": DagsHubCallback, -} \ No newline at end of file diff --git a/wtpsplit/train/adaptertrainer.py b/wtpsplit/train/adaptertrainer.py index 9725e4df..36883761 100644 --- a/wtpsplit/train/adaptertrainer.py +++ b/wtpsplit/train/adaptertrainer.py @@ -1,23 +1,9 @@ -import math import os -import shutil -import sys -import time from typing import Dict import numpy as np import torch from torch import nn -from tqdm.auto import tqdm - -# Integrations must be imported before ML frameworks: - -# isort: off -from transformers.integrations import ( - hp_params, -) - -# isort: on from transformers import PreTrainedModel from transformers.modeling_utils import unwrap_model @@ -40,7 +26,6 @@ nested_numpify, nested_truncate, ) -from transformers.trainer_utils import TrainOutput, speed_metrics from wtpsplit.train.utils import Model @@ -52,31 +37,21 @@ import re from typing import Callable, Tuple, Union -import torch.distributed as dist from packaging import version -from torch.utils.data import Dataset, RandomSampler -from torch.utils.data.distributed import DistributedSampler +from torch.utils.data import Dataset from transformers import Trainer, __version__ from transformers.configuration_utils import PretrainedConfig from transformers.data.data_collator import DataCollator -from transformers.debug_utils import DebugOption, DebugUnderflowOverflow -from transformers.pytorch_utils import is_torch_less_than_1_11 from transformers.tokenization_utils_base import PreTrainedTokenizerBase -from transformers.trainer import TRAINER_STATE_NAME from transformers.trainer_callback import ( TrainerCallback, TrainerControl, TrainerState, ) -from transformers.trainer_pt_utils import ( - get_model_param_count, -) from transformers.trainer_utils import ( EvalPrediction, - HPSearchBackend, - ShardedDDPOption, ) -from transformers.training_args import ParallelMode, TrainingArguments +from transformers.training_args import TrainingArguments from transformers.utils import ( CONFIG_NAME, is_accelerate_available, @@ -88,7 +63,7 @@ from adapters.composition import AdapterCompositionBlock, Fuse if is_apex_available(): - from apex import amp + pass if is_sagemaker_mp_enabled(): import smdistributed.modelparallel.torch as smp @@ -98,7 +73,7 @@ from accelerate import __version__ as accelerate_version if version.parse(accelerate_version) >= version.parse("0.16"): - from accelerate import skip_first_batches + pass logger = logging.get_logger(__name__) @@ -306,491 +281,6 @@ def _load_best_model(self): model.load_adapter_fusion(fusion_dir) model.to(self.args.device) - # def _inner_training_loop( - # self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None - # ): - # self._train_batch_size = batch_size - # # Data loader and number of training steps - # train_dataloader = self.get_train_dataloader() - - # # Setting up training control variables: - # # number of training epochs: num_train_epochs - # # number of training steps per epoch: num_update_steps_per_epoch - # # total number of training steps to execute: max_steps - # total_train_batch_size = args.train_batch_size * args.gradient_accumulation_steps * args.world_size - - # len_dataloader = None - # if has_length(train_dataloader): - # len_dataloader = len(train_dataloader) - # num_update_steps_per_epoch = len_dataloader // args.gradient_accumulation_steps - # num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1) - # num_examples = self.num_examples(train_dataloader) - # if args.max_steps > 0: - # max_steps = args.max_steps - # num_train_epochs = args.max_steps // num_update_steps_per_epoch + int( - # args.max_steps % num_update_steps_per_epoch > 0 - # ) - # # May be slightly incorrect if the last batch in the training dataloader has a smaller size but it's - # # the best we can do. - # num_train_samples = args.max_steps * total_train_batch_size - # else: - # max_steps = math.ceil(args.num_train_epochs * num_update_steps_per_epoch) - # num_train_epochs = math.ceil(args.num_train_epochs) - # num_train_samples = self.num_examples(train_dataloader) * args.num_train_epochs - # elif args.max_steps > 0: # Rely on max_steps when dataloader does not have a working size - # max_steps = args.max_steps - # # Setting a very large number of epochs so we go as many times as necessary over the iterator. - # num_train_epochs = sys.maxsize - # num_update_steps_per_epoch = max_steps - # num_examples = total_train_batch_size * args.max_steps - # num_train_samples = args.max_steps * total_train_batch_size - # else: - # raise ValueError( - # "args.max_steps must be set to a positive value if dataloader does not have a length, was" - # f" {args.max_steps}" - # ) - - # # Compute absolute values for logging, eval, and save if given as ratio - # if args.logging_steps and args.logging_steps < 1: - # args.logging_steps = math.ceil(max_steps * args.logging_steps) - # if args.eval_steps and args.eval_steps < 1: - # args.eval_steps = math.ceil(max_steps * args.eval_steps) - # if args.save_steps and args.save_steps < 1: - # args.save_steps = math.ceil(max_steps * args.save_steps) - - # if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug: - # if self.args.n_gpu > 1: - # # nn.DataParallel(model) replicates the model, creating new variables and module - # # references registered here no longer work on other gpus, breaking the module - # raise ValueError( - # "Currently --debug underflow_overflow is not supported under DP. Please use DDP" - # " (torch.distributed.launch)." - # ) - # else: - # debug_overflow = DebugUnderflowOverflow(self.model) # noqa - - # delay_optimizer_creation = ( - # self.sharded_ddp is not None - # and self.sharded_ddp != ShardedDDPOption.SIMPLE - # or is_sagemaker_mp_enabled() - # or self.fsdp is not None - # ) - # if args.deepspeed: - # deepspeed_engine, optimizer, lr_scheduler = deepspeed_init( - # self, num_training_steps=max_steps, resume_from_checkpoint=resume_from_checkpoint - # ) - # self.model = deepspeed_engine.module - # self.model_wrapped = deepspeed_engine - # self.deepspeed = deepspeed_engine - # self.optimizer = optimizer - # self.lr_scheduler = lr_scheduler - # elif not delay_optimizer_creation: - # self.create_optimizer_and_scheduler(num_training_steps=max_steps) - - # self.state = TrainerState() - # self.state.is_hyper_param_search = trial is not None - - # # Activate gradient checkpointing if needed - # if args.gradient_checkpointing: - # self.model.gradient_checkpointing_enable() - - # model = self._wrap_model(self.model_wrapped) - - # if is_sagemaker_mp_enabled() and resume_from_checkpoint is not None: - # self._load_from_checkpoint(resume_from_checkpoint, model) - - # # for the rest of this function `model` is the outside model, whether it was wrapped or not - # if model is not self.model: - # self.model_wrapped = model - - # if delay_optimizer_creation: - # self.create_optimizer_and_scheduler(num_training_steps=max_steps) - - # # Check if saved optimizer or scheduler states exist - # self._load_optimizer_and_scheduler(resume_from_checkpoint) - - # # important: at this point: - # # self.model is the Transformers Model - # # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model), etc. - - # # Train! - # # MODIFIED: changed to warn, added device - # logger.warning(f"***** Running training on {xm.get_ordinal()}*****") - # logger.warning(f" Num examples = {num_examples:,}") - # logger.warning(f" Num Epochs = {num_train_epochs:,}") - # logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size:,}") - # logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size:,}") - # logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") - # logger.warning(f" Total optimization steps = {max_steps:,}") - # logger.warning(f" Eval steps = {args.eval_steps}") - # logger.info(f" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}") - - # self.state.epoch = 0 - # start_time = time.time() - # epochs_trained = 0 - # steps_trained_in_current_epoch = 0 - # steps_trained_progress_bar = None - - # # Check if continuing training from a checkpoint - # if resume_from_checkpoint is not None and os.path.isfile( - # os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME) - # ): - # self.state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)) - # epochs_trained = self.state.global_step // num_update_steps_per_epoch - # if not args.ignore_data_skip: - # steps_trained_in_current_epoch = self.state.global_step % (num_update_steps_per_epoch) - # steps_trained_in_current_epoch *= args.gradient_accumulation_steps - # else: - # steps_trained_in_current_epoch = 0 - - # logger.info(" Continuing training from checkpoint, will skip to saved global_step") - # logger.info(f" Continuing training from epoch {epochs_trained}") - # logger.info(f" Continuing training from global step {self.state.global_step}") - # if not args.ignore_data_skip: - # if skip_first_batches is None: - # logger.info( - # f" Will skip the first {epochs_trained} epochs then the first" - # f" {steps_trained_in_current_epoch} batches in the first epoch. If this takes a lot of time," - # " you can install the latest version of Accelerate with `pip install -U accelerate`.You can" - # " also add the `--ignore_data_skip` flag to your launch command, but you will resume the" - # " training on data already seen by your model." - # ) - # else: - # logger.info( - # f" Will skip the first {epochs_trained} epochs then the first" - # f" {steps_trained_in_current_epoch} batches in the first epoch." - # ) - # if self.is_local_process_zero() and not args.disable_tqdm and skip_first_batches is None: - # steps_trained_progress_bar = tqdm(total=steps_trained_in_current_epoch) - # steps_trained_progress_bar.set_description("Skipping the first batches") - - # # Update the references - # self.callback_handler.model = self.model - # self.callback_handler.optimizer = self.optimizer - # self.callback_handler.lr_scheduler = self.lr_scheduler - # self.callback_handler.train_dataloader = train_dataloader - # if self.hp_name is not None and self._trial is not None: - # # use self._trial because the SigOpt/Optuna hpo only call `_hp_search_setup(trial)` instead of passing trial - # # parameter to Train when using DDP. - # self.state.trial_name = self.hp_name(self._trial) - # if trial is not None: - # assignments = trial.assignments if self.hp_search_backend == HPSearchBackend.SIGOPT else trial - # self.state.trial_params = hp_params(assignments) - # else: - # self.state.trial_params = None - # # This should be the same if the state has been saved but in case the training arguments changed, it's safer - # # to set this after the load. - # self.state.max_steps = max_steps - # self.state.num_train_epochs = num_train_epochs - # self.state.is_local_process_zero = self.is_local_process_zero() - # self.state.is_world_process_zero = self.is_world_process_zero() - - # # tr_loss is a tensor to avoid synchronization of TPUs through .item() - # tr_loss = torch.tensor(0.0).to(args.device) - # # _total_loss_scalar is updated everytime .item() has to be called on tr_loss and stores the sum of all losses - # self._total_loss_scalar = 0.0 - # self._globalstep_last_logged = self.state.global_step - # model.zero_grad() - - # self.control = self.callback_handler.on_train_begin(args, self.state, self.control) - - # # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point. - # if not args.ignore_data_skip: - # for epoch in range(epochs_trained): - # is_random_sampler = hasattr(train_dataloader, "sampler") and isinstance( - # train_dataloader.sampler, RandomSampler - # ) - # if is_torch_less_than_1_11 or not is_random_sampler: - # # We just need to begin an iteration to create the randomization of the sampler. - # # That was before PyTorch 1.11 however... - # for _ in train_dataloader: - # break - # else: - # # Otherwise we need to call the whooooole sampler cause there is some random operation added - # # AT THE VERY END! - # _ = list(train_dataloader.sampler) - - # total_batched_samples = 0 - # for epoch in range(epochs_trained, num_train_epochs): - # if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler): - # train_dataloader.sampler.set_epoch(epoch) - # elif hasattr(train_dataloader, "dataset") and isinstance(train_dataloader.dataset, IterableDatasetShard): - # train_dataloader.dataset.set_epoch(epoch) - - # # if is_torch_tpu_available(): - # # parallel_loader = pl.MpDeviceLoader(train_dataloader, args.device) # .per_device_loader(args.device) - # # epoch_iterator = parallel_loader - # # else: - # # MODIFIED: no parallel loader - # epoch_iterator = train_dataloader - - # # Reset the past mems state at the beginning of each epoch if necessary. - # if args.past_index >= 0: - # self._past = None - - # steps_in_epoch = ( - # len(epoch_iterator) if len_dataloader is not None else args.max_steps * args.gradient_accumulation_steps - # ) - # self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control) - - # if epoch == epochs_trained and resume_from_checkpoint is not None and steps_trained_in_current_epoch == 0: - # self._load_rng_state(resume_from_checkpoint) - - # rng_to_sync = False - # steps_skipped = 0 - # if skip_first_batches is not None and steps_trained_in_current_epoch > 0: - # epoch_iterator = skip_first_batches(epoch_iterator, steps_trained_in_current_epoch) - # steps_skipped = steps_trained_in_current_epoch - # steps_trained_in_current_epoch = 0 - # rng_to_sync = True - - # step = -1 - # for step, inputs in enumerate(epoch_iterator): - # total_batched_samples += 1 - # if rng_to_sync: - # self._load_rng_state(resume_from_checkpoint) - # rng_to_sync = False - - # # Skip past any already trained steps if resuming training - # if steps_trained_in_current_epoch > 0: - # steps_trained_in_current_epoch -= 1 - # if steps_trained_progress_bar is not None: - # steps_trained_progress_bar.update(1) - # if steps_trained_in_current_epoch == 0: - # self._load_rng_state(resume_from_checkpoint) - # continue - # elif steps_trained_progress_bar is not None: - # steps_trained_progress_bar.close() - # steps_trained_progress_bar = None - - # if step % args.gradient_accumulation_steps == 0: - # self.control = self.callback_handler.on_step_begin(args, self.state, self.control) - - # if ( - # (total_batched_samples % args.gradient_accumulation_steps != 0) - # and args.parallel_mode == ParallelMode.DISTRIBUTED - # and args._no_sync_in_gradient_accumulation - # and hasattr(model, "no_sync") - # ): - # # Avoid unnecessary DDP synchronization since there will be no backward pass on this example. - # with model.no_sync(): - # tr_loss_step = self.training_step(model, inputs) - # else: - # tr_loss_step = self.training_step(model, inputs) - - # if ( - # args.logging_nan_inf_filter - # and not is_torch_tpu_available() - # and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) - # ): - # # if loss is nan or inf simply add the average of previous logged losses - # tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged) - # else: - # tr_loss += tr_loss_step - - # self.current_flos += float(self.floating_point_ops(inputs)) - - # # Optimizer step for deepspeed must be called on every step regardless of the value of gradient_accumulation_steps - # if self.deepspeed: - # self.deepspeed.step() - - # if total_batched_samples % args.gradient_accumulation_steps == 0 or ( - # # last step in epoch but step is always smaller than gradient_accumulation_steps - # steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch - # ): - # # Gradient clipping - # if args.max_grad_norm is not None and args.max_grad_norm > 0 and not self.deepspeed: - # # deepspeed does its own clipping - - # if self.do_grad_scaling: - # # Reduce gradients first for XLA - # if is_torch_tpu_available(): - # gradients = xm._fetch_gradients(self.optimizer) - # xm.all_reduce("sum", gradients, scale=1.0 / xm.xrt_world_size()) - # # AMP: gradients need unscaling - # self.scaler.unscale_(self.optimizer) - - # if is_sagemaker_mp_enabled() and args.fp16: - # self.optimizer.clip_master_grads(args.max_grad_norm) - # elif hasattr(self.optimizer, "clip_grad_norm"): - # # Some optimizers (like the sharded optimizer) have a specific way to do gradient clipping - # self.optimizer.clip_grad_norm(args.max_grad_norm) - # elif hasattr(model, "clip_grad_norm_"): - # # Some models (like FullyShardedDDP) have a specific way to do gradient clipping - # model.clip_grad_norm_(args.max_grad_norm) - # else: - # # Revert to normal clipping otherwise, handling Apex or full precision - # nn.utils.clip_grad_norm_( - # amp.master_params(self.optimizer) if self.use_apex else model.parameters(), - # args.max_grad_norm, - # ) - - # # Optimizer step - # optimizer_was_run = True - # if self.deepspeed: - # pass # called outside the loop - # elif is_torch_tpu_available(): - # if self.do_grad_scaling: - # self.scaler.step(self.optimizer) - # self.scaler.update() - # else: - # # xm.optimizer_step(self.optimizer) - # # MODIFIED: Crucial change! Do not aggregate gradients across TPU cores - # self.optimizer.step() - # xm.mark_step() - # elif self.do_grad_scaling: - # scale_before = self.scaler.get_scale() - # self.scaler.step(self.optimizer) - # self.scaler.update() - # scale_after = self.scaler.get_scale() - # optimizer_was_run = scale_before <= scale_after - # else: - # self.optimizer.step() - - # if optimizer_was_run and not self.deepspeed: - # # Delay optimizer scheduling until metrics are generated - # if not isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): - # self.lr_scheduler.step() - - # model.zero_grad() - # self.state.global_step += 1 - # self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch - # self.control = self.callback_handler.on_step_end(args, self.state, self.control) - - # self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) - # else: - # self.control = self.callback_handler.on_substep_end(args, self.state, self.control) - - # if self.control.should_epoch_stop or self.control.should_training_stop: - # break - # if step < 0: - # logger.warning( - # "There seems to be not a single sample in your epoch_iterator, stopping training at step" - # f" {self.state.global_step}! This is expected if you're using an IterableDataset and set" - # f" num_steps ({max_steps}) higher than the number of available samples." - # ) - # self.control.should_training_stop = True - - # self.control = self.callback_handler.on_epoch_end(args, self.state, self.control) - # self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) - - # if DebugOption.TPU_METRICS_DEBUG in self.args.debug: - # if is_torch_tpu_available(): - # # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) - # xm.master_print(met.metrics_report()) - # else: - # logger.warning( - # "You enabled PyTorch/XLA debug metrics but you don't have a TPU " - # "configured. Check your training configuration if this is unexpected." - # ) - # if self.control.should_training_stop: - # break - - # if args.past_index and hasattr(self, "_past"): - # # Clean the state at the end of training - # delattr(self, "_past") - - # logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") - # if args.load_best_model_at_end and self.state.best_model_checkpoint is not None: - # # Wait for everyone to get here so we are sur the model has been saved by process 0. - # if is_torch_tpu_available(): - # xm.rendezvous("load_best_model_at_end") - # if args.parallel_mode == ParallelMode.DISTRIBUTED: - # dist.barrier() - # elif is_sagemaker_mp_enabled(): - # smp.barrier() - - # self._load_best_model() - - # # add remaining tr_loss - # self._total_loss_scalar += tr_loss.item() - # train_loss = self._total_loss_scalar / self.state.global_step - - # metrics = speed_metrics("train", start_time, num_samples=num_train_samples, num_steps=self.state.max_steps) - # self.store_flos() - # metrics["total_flos"] = self.state.total_flos - # metrics["train_loss"] = train_loss - - # self.is_in_train = False - - # self._memory_tracker.stop_and_update_metrics(metrics) - - # self.log(metrics) - - # run_dir = self._get_output_dir(trial) - # checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir) - - # # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save. - # if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1: - # for checkpoint in checkpoints_sorted: - # if checkpoint != self.state.best_model_checkpoint: - # logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit") - # shutil.rmtree(checkpoint) - - # self.control = self.callback_handler.on_train_end(args, self.state, self.control) - - # return TrainOutput(self.state.global_step, train_loss, metrics) - - # def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval): - # if self.control.should_log: - # # MODIFIED: removed --> faster - # # if is_torch_tpu_available(): - # # xm.mark_step() - - # logs: Dict[str, float] = {} - - # # all_gather + mean() to get average loss over all processes - # # tr_loss_scalar = self._nested_gather(tr_loss).mean().item() - # # MODIFIED: no gather since we train independently - # tr_loss_scalar = tr_loss.item() - - # # reset tr_loss to zero - # tr_loss -= tr_loss - - # loss = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4) - # logs[f"{self.logging_prefix}loss"] = loss - # logs["loss"] = loss - - # logs[f"{self.logging_prefix}learning_rate"] = self._get_learning_rate() - # # prepend logging_prefix to epoch - # if self.state.epoch is not None: - # logs[f"{self.logging_prefix}epoch"] = round(self.state.epoch, 2) - # logs[f"{self.logging_prefix}global_step"] = self.state.global_step - - # self._total_loss_scalar += tr_loss_scalar - # self._globalstep_last_logged = self.state.global_step - # self.store_flos() - - # self.log(logs) - - # metrics = None - # if self.control.should_evaluate: - # metrics = self.evaluate(ignore_keys=ignore_keys_for_eval) - # self._report_to_hp_search(trial, self.state.global_step, metrics) - - # if self.control.should_save: - # self._save_checkpoint(model, trial, metrics=metrics) - # self.control = self.callback_handler.on_save(self.args, self.state, self.control) - - # def log(self, logs: Dict[str, float]) -> None: - # """ - # Log `logs` on the various objects watching training. - - # Subclass and override this method to inject custom behavior. - - # Args: - # logs (`Dict[str, float]`): - # The values to log. - # """ - # if self.state.epoch is not None: - # logs["epoch"] = round(self.state.epoch, 2) - - # output = {**logs, **{"step": self.state.global_step}} - # self.state.log_history.append(output) - # # MODIFIED: also log current device - # logger.warning(f"{xm.get_ordinal()}: {output}") - # self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs) - def evaluation_loop( self, dataloader: DataLoader, @@ -805,7 +295,7 @@ def evaluation_loop( Works both with or without labels. """ args = self.args - + prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only # if eval is called w/o train init deepspeed here @@ -820,7 +310,7 @@ def evaluation_loop( self.deepspeed = deepspeed_engine model = self._wrap_model(self.model, training=False, dataloader=dataloader) - + if not self.skip_eval_loss: # if full fp16 or bf16 eval is wanted and this ``evaluation`` or ``predict`` isn't called # while ``train`` is running, cast it to the right dtype first and then put on device @@ -878,7 +368,9 @@ def evaluation_loop( batch_size = observed_batch_size # Prediction step - loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) + loss, logits, labels = self.prediction_step( + model, inputs, prediction_loss_only, ignore_keys=ignore_keys + ) inputs_decode = self._prepare_input(inputs["input_ids"]) if args.include_inputs_for_metrics else None # MODIFIED: not necessary. @@ -894,7 +386,9 @@ def evaluation_loop( labels = self._pad_across_processes(labels) # MODIFIED: do not gather across devices. labels = self._nested_gather(labels) - labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) + labels_host = ( + labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) + ) if inputs_decode is not None: inputs_decode = self._pad_across_processes(inputs_decode) # MODIFIED: do not gather across devices. @@ -919,7 +413,9 @@ def evaluation_loop( all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0) if preds_host is not None: logits = nested_numpify(preds_host) - all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) + all_preds = ( + logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) + ) if inputs_host is not None: inputs_decode = nested_numpify(inputs_host) all_inputs = ( @@ -929,7 +425,9 @@ def evaluation_loop( ) if labels_host is not None: labels = nested_numpify(labels_host) - all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) + all_labels = ( + labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) + ) # Set back to None to begin a new accumulation losses_host, preds_host, inputs_host, labels_host = ( @@ -953,7 +451,9 @@ def evaluation_loop( if inputs_host is not None: inputs_decode = nested_numpify(inputs_host) all_inputs = ( - inputs_decode if all_inputs is None else nested_concat(all_inputs, inputs_decode, padding_index=-100) + inputs_decode + if all_inputs is None + else nested_concat(all_inputs, inputs_decode, padding_index=-100) ) if labels_host is not None: labels = nested_numpify(labels_host) @@ -987,7 +487,6 @@ def evaluation_loop( all_losses, all_preds, all_labels, all_inputs, num_samples = None, None, None, None, 0 # Metrics! - # MODIFIED: removed since done in compute_metrics xm.rendezvous("eval_metrics") # MODIFIED: always compute metrics if self.compute_metrics is not None: diff --git a/wtpsplit/train/train.py b/wtpsplit/train/train.py index ea9cad87..96cc7c11 100644 --- a/wtpsplit/train/train.py +++ b/wtpsplit/train/train.py @@ -32,7 +32,7 @@ SubwordXLMConfig, SubwordXLMForTokenClassification, ) -from wtpsplit.train.evaluate import evaluate_sentence, evaluate_sentence_kmers, evaluate_sentence_pairwise +from wtpsplit.train.evaluate import evaluate_sentence from wtpsplit.train.trainer import Trainer from wtpsplit.train.utils import Model, cleanup_cache_files from wtpsplit.utils import Constants, LabelArgs, corrupt_training, get_label_dict, get_subword_label_dict @@ -40,9 +40,6 @@ logger = logging.getLogger(__name__) -# os.environ["PJRT_DEVICE"] = "None" - - def setup_logging(training_args: transformers.TrainingArguments) -> None: logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -62,7 +59,6 @@ def setup_logging(training_args: transformers.TrainingArguments) -> None: + f"distributed training: {training_args.local_rank != -1}, 16-bits training: {training_args.fp16}" ) ) - # logger.info(f"Training/evaluation parameters {training_args}") @dataclass @@ -72,20 +68,16 @@ class Args: use_logits: bool = False is_decoder: bool = False use_bert: bool = False - # TODO: adapt to HF Hub train_text_path: str = "data/train.parquet" valid_text_path: str = "data/valid.parquet" include_languages: List[str] = None - eval_data_path: str = "data/all_data_24_04.pth" - num_hidden_layers: int = 1 + eval_data_path: str = "data/all_data.pth" + num_hidden_layers: int = 3 preprocessing_num_workers: int = 6 block_size: int = 512 overflow_size: int = 16 eval_stride: int = 256 - lookahead: int = None loss_margin: float = 0.5 - ngram_order: int = 1 - language_adapter: str = "on" from_scratch: bool = False pack_samples: bool = False one_sample_per_line: bool = False @@ -95,13 +87,16 @@ class Args: aux_training_weight: float = 1.0 ignore_non_hyphen: bool = False non_punctuation_sample_ratio: float = None + text_column: str = "text" + threshold: float = 0.01 # just for eval + # WtP-related args + ngram_order: int = 1 + language_adapter: str = "on" adapter_warmup_steps: int = 0 adapter_lr_multiplier: float = 1.0 - text_column: str = "text" - - # NEW PARAMS - use_subwords: bool = False - threshold: float = 0.01 + # SaT-related args + use_subwords: bool = False # uses XLM-R + lookahead: int = None lookahead_split_layers: Optional[int] = None sample_non_whitespace: int = 1 @@ -118,6 +113,7 @@ def collate_fn(batch, args, label_args, label_dict, tokenizer, add_lang_ids: boo for sample in batch: # subword-level if args.use_subwords: + # already tokenized! input_ids = sample["input_ids"] # char-level else: @@ -151,7 +147,6 @@ def collate_fn(batch, args, label_args, label_dict, tokenizer, add_lang_ids: boo labels = labels[start : start + actual_block_size] elif len(input_ids) < actual_block_size: padding = actual_block_size - len(input_ids) - # print(padding, lang) input_ids += [tokenizer.pad_token_id] * padding if tokenizer else [0] * padding labels += [0] * padding @@ -202,7 +197,7 @@ def main(): (args, training_args, label_args) = parser.parse_args_into_dataclasses() wandb_name = None if xm.xrt_world_size() == 4: - # ensure same batch size on TPUv3 and TPUv4 + # ensure same batch size on TPUv3 and TPUv4 using same config.json training_args.per_device_train_batch_size *= 2 logger.warning(f"Per device train batch size: {training_args.per_device_train_batch_size}") logger.warning( @@ -216,6 +211,7 @@ def main(): num_labels = Constants.AUX_OFFSET + ((1 + len(Constants.PUNCTUATION_CHARS)) if args.do_auxiliary_training else 0) if args.use_subwords: + # SaT models if args.from_scratch: config = SubwordXLMConfig( args.model_name_or_path, @@ -248,9 +244,11 @@ def main(): special_tokens_ids = set(tokenizer.all_special_ids) special_tokens_ids.discard(custom_token_id) if args.lookahead: + # we split lookahead evenly into N layers assert args.lookahead % args.num_hidden_layers == 0 else: + # WtP models (char-based) tokenizer = None config = LACanineConfig.from_pretrained( args.model_name_or_path, @@ -290,7 +288,6 @@ def main(): if training_args.local_rank == 0: logger.warning(summary(model, depth=4)) - # backbone.push_to_hub("markus583/xlm-token-untrained", private=True) def prepare_dataset( num_workers=1, @@ -299,8 +296,10 @@ def prepare_dataset( split="train", ): with training_args.main_process_first(): - dlconf = DownloadConfig(cache_dir="/home/Markus/.cache/huggingface/datasets") - dataset = load_dataset("markus583/mC4-TEST", split=split, download_config=dlconf) + # this can be used if space issues arise + # dlconf = DownloadConfig(cache_dir="/home/Markus/.cache/huggingface/datasets") + # dataset = load_dataset("markus583/mC4-TEST", split=split, download_config=dlconf) + dataset = load_dataset("markus583/mC4-TEST", split=split) logger.warning(f"Loaded {split} dataset.") # optional: delete downloaded dataset, it is stored in cache_dir now (but we delete it later) # ~40GB on disk @@ -319,7 +318,7 @@ def prepare_dataset( dataset = dataset.shuffle(seed=42) logger.warning("Shuffled dataset.") - # very likely not relevant / used only for the compound part + # not used for sentence segmentation, ignore. if args.ignore_non_hyphen: with training_args.main_process_first(): dataset = dataset.filter( @@ -328,7 +327,7 @@ def prepare_dataset( ) logger.info(f"Filtered to {len(dataset)} examples.") - # "punctuation-specific sampling" in the paper + # "punctuation-specific sampling" in the WtP paper if args.non_punctuation_sample_ratio is not None: languages_without_punctuation = { lang_code @@ -514,18 +513,19 @@ def maybe_pad(text): remove_columns=[args.text_column], ) else: - # this is no longer used and would cause an error otherwise + # this column is no longer used and would cause an error otherwise with training_args.main_process_first(): dataset = dataset.rename_column(args.text_column, "input_ids") logger.warning(f"Tokenized {split} dataset.") - if split == "train" and args.use_subwords: - with training_args.main_process_first(): - for root, dirs, files in os.walk(os.environ.get("HF_DATASETS_CACHE")): - for file in files: - if file.startswith("m_c4-test-train"): - logger.warning(f"Removing {os.path.join(root, file)}") - os.remove(os.path.join(root, file)) + # uncomment if space issues arise (e.g., on TPU VMs): + # if split == "train" and args.use_subwords: + # with training_args.main_process_first(): + # for root, dirs, files in os.walk(os.environ.get("HF_DATASETS_CACHE")): + # for file in files: + # if file.startswith("m_c4-test-train"): + # logger.warning(f"Removing {os.path.join(root, file)}") + # os.remove(os.path.join(root, file)) if not args.one_sample_per_line: with training_args.main_process_first(): @@ -582,21 +582,19 @@ def compute_metrics(trainer): continue if trainer.args.process_index == 0 and args.do_sentence_training: - # with training_args.main_process_first(): for dataset_name, dataset in lang_data["sentence"].items(): - # if "corrupt" in dataset_name: - # continue if not dataset["data"][0]: continue if isinstance(dataset["data"][0], list): + # too slow here continue score, info = evaluate_sentence( lang_code, dataset["data"], model, - stride=128, - block_size=512, + stride=args.eval_stride, + block_size=args.block_size, batch_size=training_args.per_device_eval_batch_size, threshold=args.threshold, ) @@ -608,68 +606,7 @@ def compute_metrics(trainer): avg_metrics[f"average_{dataset_name}_f1"].append(info["f1"]) avg_metrics[f"average_{dataset_name}_f1_best"].append(info["f1_best"]) avg_metrics[f"average_{dataset_name}_threshold_best"].append(info["threshold_best"]) - # if lang_code in ["zh", "ja", "my", "km"]: - # avg_metrics[f"average_nonwhitespace_{dataset_name}_pr_auc"].append(score) - # else: - # avg_metrics[f"average_whitespace_{dataset_name}_pr_auc"].append(score) - # score, _ = evaluate_sentence( - # lang_code, - # dataset["data"], - # model, - # stride=args.eval_stride, - # block_size=args.block_size, - # batch_size=training_args.per_device_eval_batch_size, - # do_lowercase=True, - # do_remove_punct=True, - # ) - # metrics[f"lower_rmp_{lang_code}_{dataset_name}_pr_auc"] = score - # avg_metrics[f"lower_rmp_average_{dataset_name}_pr_auc"].append(score) - # if lang_code in ["zh", "ja", "my", "km"]: - # avg_metrics[f"lower_rmp_average_nonwhitespace_{dataset_name}_pr_auc"].append(score) - # else: - # avg_metrics[f"lower_rmp_average_whitespace_{dataset_name}_pr_auc"].append(score) - # k-mer based evaluation - # for k in [2, 3, 4]: - # score, avg_acc, info = evaluate_sentence_kmers( - # lang_code, - # dataset["data"], - # model, - # stride=128, - # block_size=512, - # batch_size=training_args.per_device_eval_batch_size, - # k=k, - # # sample_pct=0.1, - # threshold=args.threshold, - # ) - # metrics[f"k_{k}_{lang_code}_{dataset_name}_pr_auc"] = score - # avg_metrics[f"k_{k}_average_{dataset_name}_pr_auc"].append(score) - # metrics[f"k_{k}_{lang_code}_{dataset_name}_acc"] = avg_acc - # avg_metrics[f"k_{k}_average_{dataset_name}_acc"].append(avg_acc) - # metrics[f"k_{k}_{lang_code}_{dataset_name}_f1"] = info["f1"] - # metrics[f"k_{k}_{lang_code}_{dataset_name}_f1_best"] = info["f1_best"] - # metrics[f"k_{k}_{lang_code}_{dataset_name}_threshold_best"] = info["threshold_best"] - # avg_metrics[f"k_{k}_average_{dataset_name}_f1"].append(info["f1"]) - # avg_metrics[f"k_{k}_average_{dataset_name}_f1_best"].append(info["f1_best"]) - # avg_metrics[f"k_{k}_average_{dataset_name}_threshold_best"].append(info["threshold_best"]) - - # # if lang_code in ["zh", "ja", "my", "km"]: - # # avg_metrics[f"k_{k}_average_nonwhitespace_{dataset_name}_pr_auc"].append(score) - # # avg_metrics[f"k_{k}_average_nonwhitespace_{dataset_name}_acc"].append(avg_acc) - # # else: - # # avg_metrics[f"k_{k}_average_whitespace_{dataset_name}_pr_auc"].append(score) - # # avg_metrics[f"k_{k}_average_whitespace_{dataset_name}_acc"].append(avg_acc) - # if k == 2: - # # keep keys for backwards compat in wandb - # metrics[f"pairwise_{lang_code}_{dataset_name}_pr_auc"] = score - # avg_metrics[f"pairwise_average_{dataset_name}_pr_auc"].append(score) - # metrics[f"pairwise_{lang_code}_{dataset_name}_acc"] = avg_acc - # avg_metrics[f"pairwise_average_{dataset_name}_acc"].append(avg_acc) - # metrics[f"pairwise_{lang_code}_{dataset_name}_f1"] = info["f1"] - # metrics[f"pairwise_{lang_code}_{dataset_name}_f1_best"] = info["f1_best"] - # metrics[f"pairwise_{lang_code}_{dataset_name}_threshold_best"] = info["threshold_best"] - # avg_metrics[f"pairwise_average_{dataset_name}_f1"].append(info["f1"]) - # avg_metrics[f"pairwise_average_{dataset_name}_f1_best"].append(info["f1_best"]) - # avg_metrics[f"pairwise_average_{dataset_name}_threshold_best"].append(info["threshold_best"]) + if lang_code in ["zh", "ja", "my", "km"]: avg_metrics[f"average_nonwhitespace_{dataset_name}_pr_auc"].append(score) avg_metrics[f"average_nonwhitespace_{dataset_name}_f1"].append(info["f1"]) @@ -690,7 +627,7 @@ def compute_metrics(trainer): return metrics if "wandb" in training_args.report_to and training_args.process_index == 0: - wandb.init(name=wandb_name, project="sentence", entity="markus_583") + wandb.init(name=wandb_name, project="sentence") wandb.config.update(args) wandb.config.update(training_args) wandb.config.update(label_args) @@ -709,14 +646,15 @@ def compute_metrics(trainer): training_args.adapter_warmup_steps = args.adapter_warmup_steps training_args.adapter_lr_multiplier = args.adapter_lr_multiplier + # again: uncomment this if space issues arise. # give .map in multiprocessing enough of time to finish, to be safe - time.sleep(10) - if training_args.local_rank == 0: - # since both share the *same* cache_dir, we cannot simply call dataset.cleanup_cache_files() - # because that would remove the cache files of the other dataset! - cleanup_cache_files([train_dataset, valid_dataset]) - logger.warning("Cleaned up cache files.") - time.sleep(10) + # time.sleep(10) + # if training_args.local_rank == 0: + # # since both share the *same* cache_dir, we cannot simply call dataset.cleanup_cache_files() + # # because that would remove the cache files of the other dataset! + # cleanup_cache_files([train_dataset, valid_dataset]) + # logger.warning("Cleaned up cache files.") + # time.sleep(10) trainer = Trainer( model, @@ -737,10 +675,10 @@ def compute_metrics(trainer): trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) trainer.save_model() trainer.save_state() - # Pattern for checkpoint directories + + # remove old checkpoints to save space checkpoint_pattern = os.path.join(training_args.output_dir, "checkpoint-*") - # Use glob.glob to find all directories matching the pattern for checkpoint_dir in glob(checkpoint_pattern): if os.path.isdir(checkpoint_dir): shutil.rmtree(checkpoint_dir) @@ -752,10 +690,4 @@ def _mp_fn(index): if __name__ == "__main__": - # try: main() - # except Exception: - # # extype, value, tb = sys.exc_info() - # # tb.print_exc() - # # pdb.post_mortem(tb) - # pass diff --git a/wtpsplit/train/train_adapter.py b/wtpsplit/train/train_adapter.py index bc38c30f..06b399f0 100644 --- a/wtpsplit/train/train_adapter.py +++ b/wtpsplit/train/train_adapter.py @@ -8,12 +8,13 @@ from dataclasses import dataclass from functools import partial from glob import glob -from typing import List +from typing import List, Optional, Union import datasets import numpy as np import torch from tokenizers import AddedToken +from tqdm import tqdm from transformers import AutoTokenizer, HfArgumentParser, TrainingArguments, set_seed import adapters @@ -21,13 +22,11 @@ from adapters import AdapterArguments from wtpsplit.models import SubwordXLMConfig, SubwordXLMForTokenClassification from wtpsplit.train.adaptertrainer import AdapterTrainer -from wtpsplit.train.trainer import Trainer from wtpsplit.train.evaluate import evaluate_sentence from wtpsplit.train.train import collate_fn, setup_logging +from wtpsplit.train.trainer import Trainer from wtpsplit.train.utils import Model -from wtpsplit.utils import Constants, LabelArgs, get_label_dict, get_subword_label_dict, corrupt -from tqdm import tqdm -from typing import Union, Optional +from wtpsplit.utils import Constants, LabelArgs, get_label_dict, get_subword_label_dict logger = logging.getLogger(__name__) @@ -39,7 +38,7 @@ class Args: model_name_or_path: str base_model: str = "xlm-roberta-base" shuffle: bool = True - text_path: str = "data/all_data_11_05-all.pth" + text_path: str = "data/all_data.pth" include_languages: List[str] = None preprocessing_num_workers: int = 1 block_size: int = 512 @@ -69,7 +68,6 @@ class Args: wandb_project: str = "sentence" eval_every: int = 5 # corruption - eval_pairwise: bool = False skip_eval_loss: bool = False subsample: Optional[float] = None @@ -112,7 +110,7 @@ def prepare_dataset( one_sample_per_line: bool = False, ): with training_args.main_process_first(): - # maybe we use more than 1 lang later at once. + # this only uses 1 dataset-language combination, but can be easily adapted if needed for lang in include_languages: if split == "train": dataset = data[lang]["sentence"][dataset_name]["meta"]["train_data"] @@ -160,7 +158,7 @@ def prepare_dataset( dataset = dataset.select(range(int(subsample * len(dataset)))) logger.warning(f"Subsampled {len(dataset)} examples from {old_length}.") - # very likely not relevant / used only for the compound part + # ignore if args.ignore_non_hyphen: with training_args.main_process_first(): dataset = dataset.filter( @@ -170,7 +168,7 @@ def prepare_dataset( with training_args.main_process_first(): logger.info(f"Filtered to {len(dataset)} examples.") - # "punctuation-specific sampling" in the paper + # "punctuation-specific sampling" in the WtP paper if args.non_punctuation_sample_ratio is not None: languages_without_punctuation = { lang_code @@ -369,7 +367,7 @@ def maybe_pad(text): data = torch.load( args.text_path, ) - # sort alphabetically by key to enable alphabetical filtering w/o losses + # sort alphabetically by key for alphabetical filtering data = dict(sorted(data.items())) if not args.include_languages: @@ -377,8 +375,7 @@ def maybe_pad(text): # 1 wandb run for all language-dataset combinations if "wandb" in training_args.report_to and training_args.process_index == 0: - # TODO: don't hardcode entity - wandb.init(name=wandb_name, project=args.wandb_project, group=wandb_name, entity="markus_583") + wandb.init(name=wandb_name, project=args.wandb_project, group=wandb_name) wandb.config.update(args) wandb.config.update(training_args) wandb.config.update(label_args) @@ -410,11 +407,10 @@ def maybe_pad(text): # not available. print("SKIP: ", lang, dataset_name) continue + if not any(x in dataset_name for x in ["ersatz", "opus", "ud"]): + print("SKIP: ", lang, dataset_name) + continue print("RUNNING:", dataset_name, lang) - # skip langs starting with a, b, ..., k - # if lang.startswith(tuple("abcd")): - # print(f"Skipping {lang} {dataset_name}") - # continue # do model stuff here; otherwise, head params would be overwritten every time backbone = SubwordXLMForTokenClassification.from_pretrained( args.model_name_or_path, config=copy.deepcopy(config), ignore_mismatched_sizes=True @@ -440,7 +436,7 @@ def maybe_pad(text): if "short" in dataset_name: one_sample_per_line = True else: - one_sample_per_line = args.one_sample_per_line + one_sample_per_line = args.one_sample_per_line # used for lyrics model = Model( backbone, @@ -496,8 +492,7 @@ def compute_metrics(trainer): model = trainer._wrap_model(trainer.model, training=False) with training_args.main_process_first(): - # XXX: feeding in single samples is too slow --> feed in as one long text - # also for lyrics, tweets, ... + # feeding in single samples is too slow --> feed in as one long text if args.one_sample_per_line: eval_data = [item for sublist in eval_data for item in sublist] elif isinstance(eval_data[0], list): @@ -506,26 +501,14 @@ def compute_metrics(trainer): lang, eval_data, model, - stride=128, - block_size=512, + stride=args.eval_stride, + block_size=args.block_size, batch_size=training_args.per_device_eval_batch_size, ) metrics[f"{dataset_name}/{lang}/pr_auc"] = score metrics[f"{dataset_name}/{lang}/f1"] = info["f1"] metrics[f"{dataset_name}/{lang}/f1_best"] = info["f1_best"] metrics[f"{dataset_name}/{lang}/threshold_best"] = info["threshold_best"] - if args.eval_pairwise: - score_pairwise, avg_acc = evaluate_sentence_pairwise( - lang, - eval_data, - model, - stride=args.eval_stride, - block_size=args.block_size, - batch_size=training_args.per_device_eval_batch_size, - threshold=0.1, - ) - metrics[f"{dataset_name}/{lang}/pairwise/pr_auc"] = score_pairwise - metrics[f"{dataset_name}/{lang}/pairwise/acc"] = avg_acc return metrics @@ -541,7 +524,7 @@ def compute_metrics(trainer): model.backbone.train_adapter("text") kwargs = {"logging_prefix": f"{dataset_name}/{lang}/", "skip_eval_loss": args.skip_eval_loss} else: - # needed in the trainer otherwise + # needed in the trainer otherwise (WtP-related only) training_args.adapter_warmup_steps = args.adapter_warmup_steps training_args.adapter_lr_multiplier = args.adapter_lr_multiplier kwargs = {} @@ -553,23 +536,10 @@ def compute_metrics(trainer): for n, p in model.backbone.named_parameters(): if "classifier" in n: p.requires_grad = False + # not done: keeping clf head is much better if args.clf_from_scratch: model.backbone.classifier = torch.nn.Linear(model.backbone.config.hidden_size, num_labels) - if args.unfreeze_ln: - for n, p in model.backbone.named_parameters(): - if "LayerNorm" in n: - p.requires_grad = True - - if args.meta_clf: - clf = model.backbone.classifier - model.backbone.classifier = torch.nn.Sequential( - clf, # original classifier - if frozen above, also frozen here - torch.nn.Linear(clf.out_features, 1), - ) - model.backbone.config.num_labels = 1 - - # if args.one_sample_per_line: # eval only 5x during the entire training training_args.evaluation_strategy = "steps" training_args.eval_steps = max( @@ -584,7 +554,6 @@ def compute_metrics(trainer): ) trainer_cls = AdapterTrainer if adapter_args.train_adapter else Trainer - # add logging_prefix and skip_eval_loss as args to trainer_cls if trainer_cls is AdapterTrainer only trainer = trainer_cls( model, @@ -604,6 +573,8 @@ def compute_metrics(trainer): ) trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) logger.warning(f"Finished training for {lang} {dataset_name}.") + + # only save trained module if training_args.local_rank == 0: if not os.path.exists(os.path.join(training_args.output_dir, dataset_name, lang)): os.makedirs(os.path.join(training_args.output_dir, dataset_name, lang)) @@ -617,34 +588,6 @@ def compute_metrics(trainer): ) else: save_model.to("cpu").save_pretrained(os.path.join(training_args.output_dir, dataset_name, lang)) - # TODO - # if training_args.local_rank == 0: - # # eval here within 1 go - # cmd = "" - - # if args.eval_pairwise: - # eval_function = "intrinsic_pairwise" - # elif args.one_sample_per_line: - # eval_function = "intrinsic_list" - # else: - # eval_function = "intrinsic" - # if args.do_lowercase and args.do_remove_punct: - # suffix = "--do_lowercase --do_remove_punct" - # elif "multilingual" in trainings_args.model_name_or_path: - # suffix = "--threshold 0.5" - # else: - # suffix = "" - # if "adapter" in training_args.output_dir: - # model_info = f"--model_path {args.model_name_or_path} --adapter_path {training_args.output_dir}" - # else: - # model_info = f"--model_path {training_args.output_dir}" - - # if "verses" in args.text_path or "lines" in args.text_path: - # cmd = f"python3 wtpsplit/evaluation/{eval_function}.py {model_info} --threshold 0.1 --custom_language_list data/mldb_langs.csv --eval_data_path {args.text_path} {suffix}" - # else: - # cmd = f"python3 wtpsplit/evaluation/{eval_function}.py {model_info} --threshold 0.1 {suffix}" - # print(cmd) - # os.system(cmd) def _mp_fn(index): @@ -653,5 +596,4 @@ def _mp_fn(index): if __name__ == "__main__": - # try: main() diff --git a/wtpsplit/train/train_adapter_parallel.py b/wtpsplit/train/train_adapter_parallel.py deleted file mode 100644 index b67760ba..00000000 --- a/wtpsplit/train/train_adapter_parallel.py +++ /dev/null @@ -1,774 +0,0 @@ -import copy -import dataclasses -import json -import logging -import math -import os -import random -import sys -from collections import Counter -from dataclasses import dataclass, field -from functools import partial -from glob import glob -from typing import List - -import datasets -import numpy as np -import torch -import torch_xla.core.xla_model as xm -import transformers -from tokenizers import AddedToken -from tqdm import tqdm -from transformers import AutoTokenizer, HfArgumentParser, set_seed - -import adapters -import wandb -from adapters import AdapterArguments -from wtpsplit.models import SubwordXLMConfig, SubwordXLMForTokenClassification -from wtpsplit.train.adapter_utils import ( - ParallelTPUAdapterTrainingArguments as TrainingArguments, -) -from wtpsplit.train.adapter_utils import ( - ParallelTPUWandbCallback as WandbCallback, -) -from wtpsplit.train.adaptertrainer import AdapterTrainer -from wtpsplit.train.evaluate import evaluate_sentence, evaluate_sentence_pairwise -from wtpsplit.train.train import collate_fn -from wtpsplit.train.utils import Model -from wtpsplit.utils import Constants, LabelArgs, get_label_dict, get_subword_label_dict, corrupt - -os.environ["TOKENIZERS_PARALLELISM"] = "false" - - -def setup_logging(training_args, job_id=None) -> None: - # Generate a unique logger name based on the job_id or process identifier - unique_logger_name = f"{__name__}.{job_id}" if job_id is not None else __name__ - logger = logging.getLogger(unique_logger_name) - - # Clear existing handlers to avoid duplicate logging - if logger.hasHandlers(): - logger.handlers.clear() - - # Disable propagation to prevent logs from being handled elsewhere - logger.propagate = False # Note the correct attribute is `propagate` - - # Set the logger's level based on training arguments - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - - # Create and add a console handler - console_handler = logging.StreamHandler(sys.stdout) - console_handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s")) - logger.addHandler(console_handler) - - # Add a file handler if a job_id is provided, open in write mode to start from scratch - if job_id is not None: - file_handler = logging.FileHandler(f"logs/log_{job_id}.log", mode="w") # Open in write mode - file_handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s")) - logger.addHandler(file_handler) - - # Adjust verbosity settings for datasets and transformers - datasets.utils.logging.set_verbosity_warning() - transformers.utils.logging.set_verbosity_warning() - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - - # Log a summary message using the newly configured logger - logger.warning( - ( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " - + f"distributed training: {training_args.local_rank != -1}, 16-bits training: {training_args.fp16}" - ) - ) - - # Return the configured logger for use in the rest of the process - return logger - - -@dataclass -class Args: - model_name_or_path: str - base_model: str = "xlm-roberta-base" - shuffle: bool = True - text_path: str = "data/eval.pth" - include_languages: List[str] = None - preprocessing_num_workers: int = 1 - block_size: int = 512 - overflow_size: int = 16 - eval_stride: int = 256 - loss_margin: float = 0.5 - pack_samples: bool = False - one_sample_per_line: bool = False - use_loss_weights: bool = False - do_sentence_training: bool = True - do_auxiliary_training: bool = True - aux_training_weight: float = 1.0 - ignore_non_hyphen: bool = False - non_punctuation_sample_ratio: float = None - adapter_warmup_steps: int = 0 - adapter_lr_multiplier: float = 1.0 - text_column: str = "text" - - # NEW PARAMS - use_subwords: bool = False - freeze_classifier: bool = False - clf_from_scratch: bool = False - unfreeze_ln: bool = False - do_process: bool = False - n_train_steps: List[int] = field(default_factory=lambda: [1000, 10000, 100000]) - meta_clf: bool = False - wandb_project: str = "sentence" - # corruption - do_lowercase: bool = False - do_remove_punct: bool = False - eval_pairwise: bool = False - skip_eval_loss: bool = False - - -def main( - tpu_core_idx, - args, - training_args, - label_args, - adapter_args, - data, - train_ds, - valid_ds, - lang_groups, - train_steps, -): - wandb_name = training_args.output_dir - - logger = setup_logging(training_args, job_id=tpu_core_idx) - set_seed(training_args.seed) - logger.warning(f"{tpu_core_idx}: LANG GROUP {lang_groups}") - - num_labels = Constants.AUX_OFFSET + ( - (1 + len(Constants.PUNCTUATION_CHARS)) if (label_args.use_auxiliary or args.do_auxiliary_training or args.meta_clf) else 0 - ) - config = SubwordXLMConfig.from_pretrained( - args.model_name_or_path, - num_labels=num_labels, - ) - - # 1 wandb run for all language-dataset combinations - if "wandb" in training_args.report_to: - wandb.init(name=f"{wandb_name}-{tpu_core_idx}", project=args.wandb_project, group=wandb_name) - wandb.config.update(args) - wandb.config.update(training_args) - wandb.config.update(label_args) - wandb.config.update(adapter_args) - - for file in glob(os.path.join(os.path.dirname(__file__), "*.py")): - wandb.save(os.path.abspath(file), policy="now") - wandb.log({"train/total_n_batches": len(lang_groups)}) - training_args.report_to = [] - callbacks = WandbCallback() - else: - callbacks = None - - xm.rendezvous("wandb init done") - - for i, ((lang, dataset_name), train_step) in tqdm(enumerate(zip(lang_groups, train_steps)), total=len(lang_groups)): - # do model stuff here; otherwise, head params would be overwritten every time - backbone = SubwordXLMForTokenClassification.from_pretrained( - args.model_name_or_path, config=copy.deepcopy(config), ignore_mismatched_sizes=True - ) - logger.warning(f"{tpu_core_idx}: Loaded backbone {args.model_name_or_path}.") - backbone.config.base_model = args.base_model - - # setup adapters - model_type = backbone.config.model_type - # adapters need xlm-roberta as model type. - backbone.config.model_type = "xlm-roberta" # needed for adapter setup - adapters.init(backbone) - # reset model type (used later) - backbone.config.model_type = model_type - - tokenizer = AutoTokenizer.from_pretrained(args.base_model) - # needed since we create labels in collate_fn based on tokens - tokenizer.add_special_tokens({"additional_special_tokens": [AddedToken("\n")]}) - - model = Model( - backbone, - loss_margin=args.loss_margin, - use_loss_weights=args.use_loss_weights, - do_sentence_training=args.do_sentence_training, - do_auxiliary_training=args.do_auxiliary_training, - aux_training_weight=args.aux_training_weight, - ) - - # train for as many steps as the current group's steps. - training_args.max_steps = train_step - training_args.evaluation_strategy = "steps" - training_args.eval_steps = (train_step // training_args.num_train_epochs) + 1 - - # print some samples from the dataset - count = 0 - while count < 0: - index = random.choice(range(len(train_ds[(lang, dataset_name)]))) - sample = train_ds[(lang, dataset_name)][index] - - logger.warning(f"{tpu_core_idx}: Sample {index} of the training set: {sample}.") - if tokenizer: - logger.warning(tokenizer.decode(sample["input_ids"])) - count += 1 - - def compute_metrics(trainer): - metrics = {} - eval_data = data[lang]["sentence"][dataset_name]["data"] - - model = trainer._wrap_model(trainer.model, training=False) - - score, info = evaluate_sentence( - lang, - eval_data, - model, - stride=64, - block_size=512, - batch_size=training_args.per_device_eval_batch_size, - ) - metrics[f"{dataset_name}/{lang}/pr_auc"] = score - metrics[f"{dataset_name}/{lang}/f1"] = info["f1"] - metrics[f"{dataset_name}/{lang}/f1_best"] = info["f1_best"] - metrics[f"{dataset_name}/{lang}/threshold_best"] = info["threshold_best"] - if args.do_lowercase and args.do_remove_punct: - score_corrupted, info_corrupted = evaluate_sentence( - lang, - eval_data, - model, - stride=64, - block_size=512, - batch_size=training_args.per_device_eval_batch_size, - do_lowercase=True, - do_remove_punct=True - ) - metrics[f"{dataset_name}/{lang}/corrupted/pr_auc"] = score_corrupted - metrics[f"{dataset_name}/{lang}/corrupted/f1"] = info_corrupted["f1"] - metrics[f"{dataset_name}/{lang}/corrupted/f1_best"] = info_corrupted["f1_best"] - metrics[f"{dataset_name}/{lang}/corrupted/threshold_best"] = info_corrupted["threshold_best"] - elif args.do_lowercase or args.do_remove_punct: - raise NotImplementedError("Currently we only corrupt both ways!") - if args.eval_pairwise: - score_pairwise, avg_acc = evaluate_sentence_pairwise( - lang, - eval_data, - model, - stride=args.eval_stride, - block_size=args.block_size, - batch_size=training_args.per_device_eval_batch_size, - threshold=0.1, - ) - metrics[f"{dataset_name}/{lang}/pairwise/pr_auc"] = score_pairwise - metrics[f"{dataset_name}/{lang}/pairwise/acc"] = avg_acc - xm.rendezvous("eval log done") - - return metrics - - label_dict = get_subword_label_dict(label_args, tokenizer) if args.use_subwords else get_label_dict(label_args) - - # init new adapter - model.backbone.add_adapter("text", config=adapter_args.adapter_config, set_active=True, overwrite_ok=True) - model.backbone.train_adapter("text") - if tpu_core_idx == 0: - logger.warning(f"{tpu_core_idx}: {model.backbone.adapter_summary()}") - - if args.freeze_classifier: - for n, p in model.backbone.named_parameters(): - if "classifier" in n: - p.requires_grad = False - if args.clf_from_scratch: - model.backbone.classifier = torch.nn.Linear(model.backbone.config.hidden_size, num_labels) - - if args.unfreeze_ln: - for n, p in model.backbone.named_parameters(): - if "LayerNorm" in n: - p.requires_grad = True - - if args.meta_clf: - clf = model.backbone.classifier - model.backbone.classifier = torch.nn.Sequential( - clf, # original classifier - if frozen above, also frozen here - torch.nn.Linear(clf.out_features, 1) - ) - model.backbone.config.num_labels = 1 - - trainer = AdapterTrainer( - model, - training_args, - train_dataset=train_ds[(lang, dataset_name)], - eval_dataset=valid_ds[(lang, dataset_name)], - compute_metrics=compute_metrics, - data_collator=partial( - collate_fn, - args=args, - label_args=label_args, - label_dict=label_dict, - tokenizer=tokenizer, - add_lang_ids=False - ), - logging_prefix=f"{dataset_name}/{lang}/", - skip_eval_loss=args.skip_eval_loss - ) - if callbacks: - trainer.add_callback(callbacks) - - logger.warning(f"{tpu_core_idx}: START TRAIN {lang} {dataset_name}.") - # wait until all TPUs are ready - xm.rendezvous("start_training") - - trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) - - if not os.path.exists(os.path.join(training_args.output_dir, dataset_name, lang)): - os.makedirs(os.path.join(training_args.output_dir, dataset_name, lang)) - save_model = copy.deepcopy(model.backbone) - save_model = save_model.to("cpu") - save_model.save_adapter( - adapter_name="text", - save_directory=os.path.join(training_args.output_dir, dataset_name, lang), - with_head=True, - ) - - if args.unfreeze_ln: - # no way within adapters to do this, need to do it manually - ln_dict = {n: p for n, p in save_model.named_parameters() if "LayerNorm" in n} - torch.save(ln_dict, os.path.join(training_args.output_dir, dataset_name, lang, "ln_dict.pth")) - logger.warning(f"{tpu_core_idx}: DONE TRAIN {lang} {dataset_name}.") - - if callbacks: - wandb.log({"train/batch_progress": (i + 1) / len(lang_groups)}) - - xm.rendezvous("end_training") - xm.mark_step() - xm.rendezvous("all_done") - wandb.finish() - - -# split languages into groups of equal size for TPUs -def split_langs_into_groups(langs, n_groups=8): - return [langs[i::n_groups] for i in range(n_groups)] - - -def _mp_fn(index): - # For xla_spawn (TPUs) - setup(index) - - -def setup(index): - config_path = sys.argv[1] - parser = HfArgumentParser([Args, TrainingArguments, LabelArgs, AdapterArguments]) - (args, training_args, label_args, adapter_args) = parser.parse_json_file(config_path) - - data = torch.load( - args.text_path, - ) - if index == 0: - print(f"Using {xm.xrt_world_size()} processes/TPUs.") - print("Loaded data.") - print(f"Using step sizes {args.n_train_steps}.") - # create a csv file that writes the length of each train dataset - # used to sort the datasets by length and assign them to workers - if not os.path.exists("logs"): - os.makedirs("logs", exist_ok=True) - with open("logs/train_dataset_lengths.csv", "w") as f: - f.write("lang,dataset_name,token_length,original_length\n") - - def prepare_dataset( - data, - num_workers=1, - include_languages=None, - dataset_name="ud", - shuffle=False, - split="train", - do_lowercase=False, - do_remove_punct=False, - ): - # maybe we use more than 1 lang later at once. - for lang in include_languages: - if split == "train": - dataset = data[lang]["sentence"][dataset_name]["meta"]["train_data"] - elif split == "valid": - dataset = data[lang]["sentence"][dataset_name]["data"] - if dataset is None: - return None - dataset = datasets.Dataset.from_list( - [ - { - args.text_column: corrupt(sample, do_lowercase, do_remove_punct) + "\n" if sample and sample[-1] != "\n" else corrupt(sample, do_lowercase, do_remove_punct), - "lang": lang, - "ends_with_punctuation": sample.endswith(tuple(Constants.PUNCTUATION_CHARS)), - } - for sample in dataset - ] - ) - - if shuffle: - dataset = dataset.shuffle(seed=42) - - # very likely not relevant / used only for the compound part - if args.ignore_non_hyphen: - dataset = dataset.filter( - lambda sample: any(c in sample[args.text_column] for c in label_args.hyphen_chars), - num_proc=args.preprocessing_num_workers, - ) - - # "punctuation-specific sampling" in the paper - if args.non_punctuation_sample_ratio is not None: - languages_without_punctuation = { - lang_code - for lang_code in Constants.LANGINFO.index - if Constants.LANGINFO.loc[lang_code, "no_punctuation"] - } - - def drop_some_non_punctuation_samples(examples): - include_indices = set( - np.where([lang_code not in languages_without_punctuation for lang_code in examples["lang"]])[0] - ) - punctuation_indices = { - i for i in np.where(examples["ends_with_punctuation"])[0] if i in include_indices - } - - target_n_non_punct = int( - (len(punctuation_indices) * args.non_punctuation_sample_ratio) - / (1 - args.non_punctuation_sample_ratio) - ) - n_drop = (len(include_indices) - len(punctuation_indices)) - target_n_non_punct - - out = [True for _ in range(len(examples["ends_with_punctuation"]))] - - if n_drop <= 0: - return out - drop_indices = np.random.choice( - list(include_indices - punctuation_indices), - n_drop, - replace=False, - ) - - for i in drop_indices: - out[i] = False - - return out - - dataset = dataset.filter( - drop_some_non_punctuation_samples, - batched=True, - batch_size=1_000_000, - num_proc=num_workers, - ) - - def tokenize_texts(examples): - # do not return CLS and SEP token here - # there should only be 1 of these per block later, not multiple - # we still can't use return_special_tokens=False since we need the \n token later for the labels - tokenized = tokenizer(examples[args.text_column], verbose=False) - return {"input_ids": [example[1:-1] for example in tokenized["input_ids"]]} - - # similar to group_texts in huggingface's run_clm.py / run_mlm.py: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py - def group_texts(examples): - all_input_blocks = [] - all_input_block_lengths = [] - all_langs = [] - - for current_lang in set(examples["lang"]): - # only retain current_lang examples (all columns) - lang_subwords = [ - subwords for subwords, lang in zip(examples["input_ids"], examples["lang"]) if lang == current_lang - ] - # filter out some special tokens - # from html tags, mostly in Latin, Thai & Korean - lang_subwords = [ - [subword for subword in subwords if subword not in special_tokens_ids] for subwords in lang_subwords - ] - # concatenate token lists - concatenated_texts = [item for sublist in lang_subwords for item in sublist] - concatenated_ids = [i for i, subwords in enumerate(lang_subwords) for _ in subwords] - - total_length = len(concatenated_texts) - - best_length = math.ceil(total_length / args.block_size) * args.block_size + args.overflow_size - while best_length > total_length: - best_length -= args.block_size - - if best_length < 0: - continue - - concatenated_texts = concatenated_texts[:best_length] - concatenated_ids = concatenated_ids[:best_length] - - blocks = [ - concatenated_texts[i : i + args.block_size + args.overflow_size] - for i in range(0, best_length - args.block_size, args.block_size) - ] - block_ids = [ - concatenated_ids[i : i + args.block_size + args.overflow_size] - for i in range(0, best_length - args.block_size, args.block_size) - ] - - block_langs = [current_lang] * len(blocks) - - all_input_blocks.extend(blocks) - all_input_block_lengths.extend([list(Counter(ids).values()) for ids in block_ids]) - all_langs.extend(block_langs) - - return { - "input_ids": all_input_blocks, - "block_lengths": all_input_block_lengths, - "lang": all_langs, - } - - if args.pack_samples: - raise NotImplementedError("Packing samples not implemented for subword-based models.") - - if args.use_subwords: - dataset = dataset.map( - tokenize_texts, - batched=True, - num_proc=num_workers, - remove_columns=[args.text_column], - ) - else: - # this is no longer used and would cause an error otherwise - dataset = dataset.rename_column(args.text_column, "input_ids") - - if not args.one_sample_per_line: - dataset = dataset.map( - group_texts, - batched=True, - num_proc=num_workers, - # a bit hacky but oh well, only drop if sentence - remove_columns=["ends_with_punctuation"] if args.text_column == "text" else [], - ) - - return dataset - - if not args.include_languages: - args.include_languages = list(data.keys()) # use all - # XXX: for testing - # args.include_languages = ["af", "az", "kk", "te", "tg", "be", "km",] # "ps", "ru"] - # get all lang-dataset combinations and their lengths - all_keys = [] - for lang in data.keys(): - for dataset_name in data[lang]["sentence"].keys(): - if lang in args.include_languages: - valid = data[lang]["sentence"][dataset_name]["data"] - train = data[lang]["sentence"][dataset_name]["meta"]["train_data"] - if train is not None and valid is not None: - all_keys.append((lang, dataset_name, len(train))) - # sort by length of train dataset - all_keys = sorted(all_keys, key=lambda x: x[2], reverse=True) - all_lang_groups = split_langs_into_groups(list(all_keys), n_groups=int(xm.xrt_world_size())) - current_lang_groups = [(lang, ds) for (lang, ds, _) in all_lang_groups[index]] - # TODO: check speed of parallelism here - # should be: longest (parallel), ..., shortest (parallel) - - tokenizer = AutoTokenizer.from_pretrained(args.base_model) - # needed since we create labels in collate_fn based on tokens - tokenizer.add_special_tokens({"additional_special_tokens": [AddedToken("\n")]}) - custom_token_id = tokenizer.convert_tokens_to_ids("\n") - # used later to filter out special tokens - special_tokens_ids = set(tokenizer.all_special_ids) - special_tokens_ids.discard(custom_token_id) - - xm.rendezvous("loading data") - - if not os.path.exists("data/ft_data"): - os.makedirs("data/ft_data", exist_ok=True) - - def process_datasets(data, args, current_lang_groups, do_process=True, do_write=True): - all_ds = {"train": {}, "valid": {}} - - for lang in data.keys(): - if lang in args.include_languages: - for dataset_name in data[lang]["sentence"].keys(): - if (lang, dataset_name) not in current_lang_groups: - continue - train_path = f"data/ft_data/{lang}_{dataset_name}_train.pth" - valid_path = f"data/ft_data/{lang}_{dataset_name}_valid.pth" - - if not do_process and os.path.exists(train_path) and os.path.exists(valid_path): - # if exists and we don't want to process, load - train_dataset = torch.load(train_path) - valid_dataset = torch.load(valid_path) - - all_ds["train"][(lang, dataset_name)] = train_dataset - all_ds["valid"][(lang, dataset_name)] = valid_dataset - else: - # Process datasets - valid_dataset = prepare_dataset( - data=data, - num_workers=1, - include_languages=[lang], - dataset_name=dataset_name, - shuffle=False, - split="valid", - do_lowercase=args.do_lowercase, - do_remove_punct=args.do_remove_punct, - ) - - train_dataset = prepare_dataset( - data=data, - num_workers=args.preprocessing_num_workers, - include_languages=[lang], - dataset_name=dataset_name, - shuffle=args.shuffle, - split="train", - do_lowercase=args.do_lowercase, - do_remove_punct=args.do_remove_punct, - ) - - all_ds["valid"][(lang, dataset_name)] = valid_dataset - all_ds["train"][(lang, dataset_name)] = train_dataset - - torch.save(train_dataset, train_path) - torch.save(valid_dataset, valid_path) - - # Write length of train dataset to CSV - if do_write: - print(f"Valid ds for {lang} {dataset_name} has {len(valid_dataset)} examples.") - print(f"Train ds for {lang} {dataset_name} has {len(train_dataset)} examples.") - with open("logs/train_dataset_lengths.csv", "a") as f: - train_data_len = len(data[lang]["sentence"][dataset_name]["meta"]["train_data"]) - f.write(f"{lang},{dataset_name},{len(train_dataset)},{train_data_len}\n") - - if do_process and do_write and index == 0: - with open("data/ft_data/args.json", "w") as f: - json.dump(dataclasses.asdict(args), f) - return all_ds - - # first, pre-process datasets in distributed manner - # assignment of lang-dataset combinations to workers is done based on length of train dataset (string length) - _ = process_datasets(data, args, current_lang_groups, do_process=args.do_process, do_write=True) - xm.rendezvous("data loaded") - - # synchronize number of steps before training, for each worker - with open("logs/train_dataset_lengths.csv", "r") as f: - lines = f.readlines() - lines = [line.strip().split(",") for line in lines[1:]] - lines = sorted(lines, key=lambda x: int(x[2]), reverse=True) - # as tuple - lines = [ - ( - x[0], - x[1], - int(x[2]), - int(x[3]), - # calculate number of steps based on train dataset token length - # XXX: steps are dependent on epoch, too! So we set target number of epochs and calculate steps based on that - math.ceil( - (training_args.num_train_epochs * int(x[2])) - / (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps) - ), - ) - for x in lines - ] - - all_keys = lines # now contains lang, dataset, token length (!), original length, original train steps - - # bin the keys into groups based on number of steps - grouped_keys = {n_steps: [x for x in all_keys if x[4] <= n_steps] for n_steps in args.n_train_steps[:-1]} - # last group is all keys with >10_000 steps - grouped_keys[args.n_train_steps[-1]] = [x for x in all_keys if x[4] > args.n_train_steps[-2]] - - # split each group into equal parts for each worker - grouped_lang_groups = { - n_steps: split_langs_into_groups(grouped_keys[n_steps], n_groups=xm.xrt_world_size()) - for n_steps in args.n_train_steps - } - # ensure each last group is of equal length - for n_steps in args.n_train_steps: - for i, group in enumerate(grouped_lang_groups[n_steps]): - if len(group) < len(grouped_lang_groups[n_steps][0]): - grouped_lang_groups[n_steps][i].append(grouped_lang_groups[n_steps][0][-1]) - assert all([len(group) == len(grouped_lang_groups[n_steps][0]) for group in grouped_lang_groups[n_steps]]) - - # unwrap dict of lists (just remove dict dimension) - all_lang_groups = [] - - def merge_dict_into_sublists(d): - # Initialize a list with 8 empty sublists - merged_lists = [[] for _ in range(xm.xrt_world_size())] - - # Sort keys in descending order and iterate through them - for key in sorted(d.keys(), reverse=True): - # Iterate through each of the 8 sublists for the current key - for index, sublist in enumerate(d[key]): - # add key (number of used steps) to each item in the sublist - merged_lists[index].extend([item + (key,) for item in sublist]) - return merged_lists - - all_lang_groups = merge_dict_into_sublists(grouped_lang_groups) - train_steps = [x[5] for x in all_lang_groups[index]] - current_lang_groups = [(x[0], x[1]) for x in all_lang_groups[index]] - - all_ds = process_datasets(data, args, current_lang_groups, do_process=False, do_write=False) - - # all lang groups should be of equal length - assert all([len(lang_group) == len(all_lang_groups[0]) for lang_group in all_lang_groups]) - # all lang groups should contain unique lang-dataset combinations - assert all([len(lang_group) == len(set(lang_group)) for lang_group in all_lang_groups]) - - if index == 0: - # just for sanity chacking - with open("logs/all_lang_groups.txt", "w") as f: - f.write(f"{training_args.num_train_epochs}\n") - f.write("\n".join([str(x) for x in all_lang_groups])) - print(all_lang_groups) - xm.rendezvous("data sorted") - - main( - index, - args, - training_args, - label_args, - adapter_args, - data, - all_ds["train"], - all_ds["valid"], - current_lang_groups, - train_steps, - ) - - xm.rendezvous("all training done") - if index == 0: - # eval here within 1 go - if args.do_lowercase and args.do_remove_punct: - os.system( - f"python3 wtpsplit/evaluation/intrinsic.py --model_path {args.model_name_or_path} --adapter_path {training_args.output_dir} --threshold 0.1 --do_lowercase --do_remove_punct" - ) - elif args.eval_pairwise: - os.system( - f"python3 wtpsplit/evaluation/intrinsic_pairwise.py --model_path {args.model_name_or_path} --adapter_path {training_args.output_dir} --threshold 0.1" - ) - elif "lines" in args.text_path: - if args.do_lowercase and args.do_remove_punct: - os.system( - f"python3 wtpsplit/evaluation/intrinsic.py --model_path {args.model_name_or_path} --adapter_path {training_args.output_dir} --threshold 0.1--custom_language_list data/lyrics_langs.csv --eval_data_path data/lyrics_lines.pt --save_suffix lines --do_lowercase --do_remove_punct" - ) - else: - os.system( - f"python3 wtpsplit/evaluation/intrinsic.py --model_path {args.model_name_or_path} --adapter_path {training_args.output_dir} --threshold 0.1--custom_language_list data/lyrics_langs.csv --eval_data_path data/lyrics_lines.pt --save_suffix lines" - ) - elif "verses" in args.text_path: - if args.do_lowercase and args.do_remove_punct: - os.system( - f"python3 wtpsplit/evaluation/intrinsic.py --model_path {args.model_name_or_path} --adapter_path {training_args.output_dir} --threshold 0.1 --custom_language_list data/lyrics_langs.csv --eval_data_path data/lyrics_verses_strip_n.pt --save_suffix verses --do_lowercase --do_remove_punct" - ) - else: - os.system( - f"python3 wtpsplit/evaluation/intrinsic.py --model_path {args.model_name_or_path} --adapter_path {training_args.output_dir} --threshold 0.1 --custom_language_list data/lyrics_langs.csv --eval_data_path data/lyrics_verses_strip_n.pt --save_suffix verses" - ) - else: - os.system( - f"python3 wtpsplit/evaluation/intrinsic.py --model_path {args.model_name_or_path} --adapter_path {training_args.output_dir} --threshold 0.1" - ) - - -if __name__ == "__main__": - import torch_xla.distributed.xla_multiprocessing as xmp - - xmp.spawn( - _mp_fn, - args=(), - nprocs=8, - ) - \ No newline at end of file diff --git a/wtpsplit/train/train_xlmr.py b/wtpsplit/train/train_xlmr.py deleted file mode 100644 index 2515e1ee..00000000 --- a/wtpsplit/train/train_xlmr.py +++ /dev/null @@ -1,574 +0,0 @@ -import logging -import math -import os -import random -import shutil -import sys -import time -from collections import Counter, defaultdict -from dataclasses import dataclass -from functools import partial -from glob import glob -from typing import List, Optional - -import datasets -import numpy as np -import torch -import torch_xla.core.xla_model as xm -import transformers -from datasets import load_dataset -from datasets.download import DownloadConfig -from torchinfo import summary -from tqdm.auto import tqdm -from transformers import AutoTokenizer, HfArgumentParser, TrainingArguments, set_seed - -import wandb -from wtpsplit.models import ( - SubwordXLMConfig, - SubwordXLMForTokenClassification, -) -from wtpsplit.train.evaluate import evaluate_sentence, evaluate_sentence_kmers, evaluate_sentence_pairwise -from wtpsplit.train.trainer import Trainer -from wtpsplit.train.utils import Model, cleanup_cache_files -from wtpsplit.utils import Constants, LabelArgs, corrupt_training, get_label_dict, get_subword_label_dict -from wtpsplit.tokenization_utils import pack_sentences - -logger = logging.getLogger(__name__) - - -# os.environ["PJRT_DEVICE"] = "None" - - -def setup_logging(training_args: transformers.TrainingArguments) -> None: - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - datasets.utils.logging.set_verbosity_warning() - transformers.utils.logging.set_verbosity_warning() - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - # Log on each process the small summary: - logger.warning( - ( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {training_args.local_rank != -1}, 16-bits training: {training_args.fp16}" - ) - ) - # logger.info(f"Training/evaluation parameters {training_args}") - - -@dataclass -class Args: - model_name_or_path: str - shuffle: bool = False - use_logits: bool = False - is_decoder: bool = False - use_bert: bool = False - # TODO: adapt to HF Hub - train_text_path: str = "data/train.parquet" - valid_text_path: str = "data/valid.parquet" - include_languages: List[str] = None - eval_data_path: str = "data/all_data_24_04.pth" - num_hidden_layers: int = 1 - preprocessing_num_workers: int = 6 - block_size: int = 512 - underflow_size: int = 16 - eval_stride: int = 256 - lookahead: int = None - loss_margin: float = 0.5 - ngram_order: int = 1 - language_adapter: str = "on" - from_scratch: bool = False - pack_samples: bool = False - one_sample_per_line: bool = False - use_loss_weights: bool = False - do_sentence_training: bool = True - do_auxiliary_training: bool = True - aux_training_weight: float = 1.0 - ignore_non_hyphen: bool = False - non_punctuation_sample_ratio: float = None - adapter_warmup_steps: int = 0 - adapter_lr_multiplier: float = 1.0 - text_column: str = "text" - - # NEW PARAMS - use_subwords: bool = False - threshold: float = 0.01 - lookahead_split_layers: Optional[int] = None - min_sentence_length: int = 10 - - -def collate_fn(batch, args, label_args, label_dict, tokenizer, add_lang_ids: bool = False): - all_input_ids = [] - all_labels = [] - all_language_ids = [] - - all_attention_masks = [] - all_position_ids = [] - all_label_weights = [] - - for sample in batch: - # subword-level - if args.use_subwords: - input_ids = sample["input_ids"] - # char-level - else: - input_ids = [ord(c) for c in sample["input_ids"]] - lang = sample["lang"] - - newline_label_indices = sample["labels"] - newline_labels = [1 if i in newline_label_indices else 0 for i in range(len(input_ids))] - - block_ids = [0] * len(input_ids) - - input_ids, _, labels = corrupt_training( - input_ids, - block_ids, - newline_labels, - lang, - label_args, - label_dict=label_dict, - pack_samples=args.pack_samples, - # min_length=args.block_size, - tokenizer=tokenizer if args.use_subwords else None, - ) - - actual_block_size = args.block_size - 2 if args.use_subwords else args.block_size - - if len(input_ids) > args.block_size: - start = np.random.randint(0, len(input_ids) - actual_block_size) - input_ids = input_ids[start : start + actual_block_size] - labels = labels[start : start + actual_block_size] - elif len(input_ids) < actual_block_size: - padding = actual_block_size - len(input_ids) - # print(padding, lang) - input_ids += [tokenizer.pad_token_id] * padding if tokenizer else [0] * padding - labels += [0] * padding - - if tokenizer: - input_ids = [tokenizer.cls_token_id] + input_ids[:actual_block_size] + [tokenizer.sep_token_id] - # labels for CLS and SEP tokens are 0 (negative) - labels = [0] + labels[:actual_block_size] + [0] - else: - input_ids = input_ids[:actual_block_size] - labels = labels[:actual_block_size] - - input_ids = torch.tensor(input_ids, dtype=torch.long) - labels = torch.tensor(labels, dtype=torch.long) - position_ids = torch.arange(len(input_ids), dtype=torch.long) - label_weights = torch.ones(args.block_size, dtype=torch.float32) - if tokenizer: - attention_mask = (input_ids != tokenizer.pad_token_id).to(torch.float32) - else: - attention_mask = (input_ids != 0).to(torch.float32) - - all_input_ids.append(input_ids) - all_label_weights.append(label_weights) - all_labels.append(labels) - all_language_ids.append(Constants.LANG_CODE_TO_INDEX[lang] if add_lang_ids else 0) - - all_attention_masks.append(attention_mask) - all_position_ids.append(position_ids) - - out = { - "input_ids": torch.stack(all_input_ids, 0), - "attention_mask": torch.stack(all_attention_masks, 0), - "position_ids": torch.stack(all_position_ids, 0), - "language_ids": torch.tensor(all_language_ids, dtype=torch.long), - "label_weights": torch.stack(all_label_weights, 0), - "labels": torch.stack(all_labels, 0), - } - - return out - - -def main(): - parser = HfArgumentParser([Args, TrainingArguments, LabelArgs]) - - if sys.argv[1].endswith(".json"): - (args, training_args, label_args) = parser.parse_json_file(sys.argv[1]) - wandb_name = os.path.splitext(os.path.basename(sys.argv[1]))[0] - else: - (args, training_args, label_args) = parser.parse_args_into_dataclasses() - wandb_name = None - if xm.xrt_world_size() == 4: - # ensure same batch size on TPUv3 and TPUv4 - training_args.per_device_train_batch_size *= 2 - logger.warning(f"Per device train batch size: {training_args.per_device_train_batch_size}") - logger.warning( - f"Total train batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps* xm.xrt_world_size()}" - ) - - setup_logging(training_args) - set_seed(training_args.seed) - training_args.hub_strategy = "end" - training_args.save_total_limit = 1 - - num_labels = Constants.AUX_OFFSET + ((1 + len(Constants.PUNCTUATION_CHARS)) if args.do_auxiliary_training else 0) - if args.use_subwords: - if args.from_scratch: - config = SubwordXLMConfig( - args.model_name_or_path, - num_hidden_layers=args.num_hidden_layers, - num_labels=num_labels, - lookahead=args.lookahead, - lookahead_split_layers=args.lookahead_split_layers, - ) - backbone = SubwordXLMForTokenClassification(config) - - else: - config = SubwordXLMConfig.from_pretrained( - args.model_name_or_path, - num_hidden_layers=args.num_hidden_layers, - num_labels=num_labels, - lookahead=args.lookahead, - lookahead_split_layers=args.lookahead_split_layers, - ) - backbone = SubwordXLMForTokenClassification.from_pretrained( - args.model_name_or_path, - config=config, - ) - - backbone.config.base_model = args.model_name_or_path - tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) - - if args.lookahead: - assert args.lookahead % args.num_hidden_layers == 0 - - model = Model( - backbone, - loss_margin=args.loss_margin, - use_loss_weights=args.use_loss_weights, - do_sentence_training=args.do_sentence_training, - do_auxiliary_training=args.do_auxiliary_training, - aux_training_weight=args.aux_training_weight, - ) - - if training_args.local_rank == 0: - logger.warning(summary(model, depth=4)) - - def prepare_dataset( - num_workers=1, - include_languages=None, - shuffle=False, - split="train", - ): - with training_args.main_process_first(): - dlconf = DownloadConfig(cache_dir="/home/Markus/.cache/huggingface/datasets") - dataset = load_dataset("markus583/mC4-TEST", split=split, download_config=dlconf) - logger.warning(f"Loaded {split} dataset.") - # optional: delete downloaded dataset, it is stored in cache_dir now (but we delete it later) - # ~40GB on disk - # os.system("rm -rf /home/Markus/.cache/huggingface/datasets") - - if include_languages is not None: - include_languages = set(include_languages) - - dataset = dataset.filter( - lambda example: example["lang"] in include_languages, - num_proc=args.preprocessing_num_workers, - ) - logger.warning(f"Filtered to {len(dataset)} examples.") - - if shuffle: - dataset = dataset.shuffle(seed=42) - logger.warning("Shuffled dataset.") - - # "punctuation-specific sampling" in the paper - if args.non_punctuation_sample_ratio is not None: - languages_without_punctuation = { - lang_code - for lang_code in Constants.LANGINFO.index - if Constants.LANGINFO.loc[lang_code, "no_punctuation"] - } - - def drop_some_non_punctuation_samples(examples): - include_indices = set( - np.where([lang_code not in languages_without_punctuation for lang_code in examples["lang"]])[0] - ) - punctuation_indices = { - i for i in np.where(examples["ends_with_punctuation"])[0] if i in include_indices - } - - target_n_non_punct = int( - (len(punctuation_indices) * args.non_punctuation_sample_ratio) - / (1 - args.non_punctuation_sample_ratio) - ) - n_drop = (len(include_indices) - len(punctuation_indices)) - target_n_non_punct - - out = [True for _ in range(len(examples["ends_with_punctuation"]))] - - if n_drop <= 0: - return out - drop_indices = np.random.choice( - list(include_indices - punctuation_indices), - n_drop, - replace=False, - ) - - for i in drop_indices: - out[i] = False - - return out - - with training_args.main_process_first(): - dataset = dataset.filter( - drop_some_non_punctuation_samples, - batched=True, - batch_size=1_000_000, - num_proc=num_workers, - ) - - if args.do_auxiliary_training: - assert label_args.use_auxiliary - - if args.pack_samples: - assert not args.one_sample_per_line - - if not args.one_sample_per_line: - with training_args.main_process_first(): - dataset = dataset.map( - pack_sentences, - batched=True, - num_proc=num_workers, - fn_kwargs={ - "block_size": args.block_size, - "tokenizer": tokenizer, - "underflow_size": args.underflow_size, - "min_sentence_length": args.min_sentence_length, - }, - # a bit hacky but oh well, only drop if sentence - remove_columns=["ends_with_punctuation", "text"], - # load_from_cache_file=False - ) - - if split == "train" and args.use_subwords: - with training_args.main_process_first(): - for root, dirs, files in os.walk(os.environ.get("HF_DATASETS_CACHE")): - for file in files: - if file.startswith("m_c4-test-train"): - logger.warning(f"Removing {os.path.join(root, file)}") - os.remove(os.path.join(root, file)) - - logger.warning(f"Grouped {split} dataset.") - - return dataset - - valid_dataset = prepare_dataset( - num_workers=args.preprocessing_num_workers, - include_languages=args.include_languages, - shuffle=False, - split="valid", - ) - logger.warning(f"Valid dataset has {len(valid_dataset)} examples.") - - train_dataset = prepare_dataset( - num_workers=args.preprocessing_num_workers, - include_languages=args.include_languages, - shuffle=args.shuffle, - split="train", - ) - logger.warning(f"Train dataset has {len(train_dataset)} examples.") - - # print some samples from the dataset - count = 0 - while count < 5: - index = random.choice(range(len(train_dataset))) - sample = train_dataset[index] - - logger.warning(f"Sample {index} of the training set: {sample}.") - if tokenizer: - logger.warning(tokenizer.decode(sample["input_ids"])) - count += 1 - - eval_data = torch.load( - args.eval_data_path, - ) - - def compute_metrics(trainer): - metrics = {} - avg_metrics = defaultdict(lambda: []) - - model = trainer._wrap_model(trainer.model, training=False) - - for lang_code, lang_data in tqdm(eval_data.items(), desc="Evaluate!"): - if args.include_languages is not None and lang_code not in args.include_languages: - continue - - if trainer.args.process_index == 0 and args.do_sentence_training: - # with training_args.main_process_first(): - for dataset_name, dataset in lang_data["sentence"].items(): - if "corrupt" in dataset_name: - continue - score, info = evaluate_sentence( - lang_code, - dataset["data"], - model, - stride=128, - block_size=512, - batch_size=training_args.per_device_eval_batch_size, - threshold=args.threshold, - ) - metrics[f"{lang_code}_{dataset_name}_pr_auc"] = score - metrics[f"{lang_code}_{dataset_name}_f1"] = info["f1"] - metrics[f"{lang_code}_{dataset_name}_f1_best"] = info["f1_best"] - metrics[f"{lang_code}_{dataset_name}_threshold_best"] = info["threshold_best"] - avg_metrics[f"average_{dataset_name}_pr_auc"].append(score) - avg_metrics[f"average_{dataset_name}_f1"].append(info["f1"]) - avg_metrics[f"average_{dataset_name}_f1_best"].append(info["f1_best"]) - avg_metrics[f"average_{dataset_name}_threshold_best"].append(info["threshold_best"]) - # if lang_code in ["zh", "ja", "my", "km"]: - # avg_metrics[f"average_nonwhitespace_{dataset_name}_pr_auc"].append(score) - # else: - # avg_metrics[f"average_whitespace_{dataset_name}_pr_auc"].append(score) - # score, _ = evaluate_sentence( - # lang_code, - # dataset["data"], - # model, - # stride=args.eval_stride, - # block_size=args.block_size, - # batch_size=training_args.per_device_eval_batch_size, - # do_lowercase=True, - # do_remove_punct=True, - # ) - # metrics[f"lower_rmp_{lang_code}_{dataset_name}_pr_auc"] = score - # avg_metrics[f"lower_rmp_average_{dataset_name}_pr_auc"].append(score) - # if lang_code in ["zh", "ja", "my", "km"]: - # avg_metrics[f"lower_rmp_average_nonwhitespace_{dataset_name}_pr_auc"].append(score) - # else: - # avg_metrics[f"lower_rmp_average_whitespace_{dataset_name}_pr_auc"].append(score) - # k-mer based evaluation - # for k in [2, 3, 4]: - # score, avg_acc, info = evaluate_sentence_kmers( - # lang_code, - # dataset["data"], - # model, - # stride=128, - # block_size=512, - # batch_size=training_args.per_device_eval_batch_size, - # k=k, - # # sample_pct=0.1, - # threshold=args.threshold, - # ) - # metrics[f"k_{k}_{lang_code}_{dataset_name}_pr_auc"] = score - # avg_metrics[f"k_{k}_average_{dataset_name}_pr_auc"].append(score) - # metrics[f"k_{k}_{lang_code}_{dataset_name}_acc"] = avg_acc - # avg_metrics[f"k_{k}_average_{dataset_name}_acc"].append(avg_acc) - # metrics[f"k_{k}_{lang_code}_{dataset_name}_f1"] = info["f1"] - # metrics[f"k_{k}_{lang_code}_{dataset_name}_f1_best"] = info["f1_best"] - # metrics[f"k_{k}_{lang_code}_{dataset_name}_threshold_best"] = info["threshold_best"] - # avg_metrics[f"k_{k}_average_{dataset_name}_f1"].append(info["f1"]) - # avg_metrics[f"k_{k}_average_{dataset_name}_f1_best"].append(info["f1_best"]) - # avg_metrics[f"k_{k}_average_{dataset_name}_threshold_best"].append(info["threshold_best"]) - - # # if lang_code in ["zh", "ja", "my", "km"]: - # # avg_metrics[f"k_{k}_average_nonwhitespace_{dataset_name}_pr_auc"].append(score) - # # avg_metrics[f"k_{k}_average_nonwhitespace_{dataset_name}_acc"].append(avg_acc) - # # else: - # # avg_metrics[f"k_{k}_average_whitespace_{dataset_name}_pr_auc"].append(score) - # # avg_metrics[f"k_{k}_average_whitespace_{dataset_name}_acc"].append(avg_acc) - # if k == 2: - # # keep keys for backwards compat in wandb - # metrics[f"pairwise_{lang_code}_{dataset_name}_pr_auc"] = score - # avg_metrics[f"pairwise_average_{dataset_name}_pr_auc"].append(score) - # metrics[f"pairwise_{lang_code}_{dataset_name}_acc"] = avg_acc - # avg_metrics[f"pairwise_average_{dataset_name}_acc"].append(avg_acc) - # metrics[f"pairwise_{lang_code}_{dataset_name}_f1"] = info["f1"] - # metrics[f"pairwise_{lang_code}_{dataset_name}_f1_best"] = info["f1_best"] - # metrics[f"pairwise_{lang_code}_{dataset_name}_threshold_best"] = info["threshold_best"] - # avg_metrics[f"pairwise_average_{dataset_name}_f1"].append(info["f1"]) - # avg_metrics[f"pairwise_average_{dataset_name}_f1_best"].append(info["f1_best"]) - # avg_metrics[f"pairwise_average_{dataset_name}_threshold_best"].append(info["threshold_best"]) - if lang_code in ["zh", "ja", "my", "km"]: - avg_metrics[f"average_nonwhitespace_{dataset_name}_pr_auc"].append(score) - avg_metrics[f"average_nonwhitespace_{dataset_name}_f1"].append(info["f1"]) - avg_metrics[f"average_nonwhitespace_{dataset_name}_f1_best"].append(info["f1_best"]) - avg_metrics[f"average_nonwhitespace_{dataset_name}_threshold_best"].append( - info["threshold_best"] - ) - else: - avg_metrics[f"average_whitespace_{dataset_name}_pr_auc"].append(score) - avg_metrics[f"average_whitespace_{dataset_name}_f1"].append(info["f1"]) - avg_metrics[f"average_whitespace_{dataset_name}_f1_best"].append(info["f1_best"]) - avg_metrics[f"average_whitespace_{dataset_name}_threshold_best"].append(info["threshold_best"]) - - for name, values in avg_metrics.items(): - if len(values) > 1: - metrics[name] = np.mean(values) - - return metrics - - if "wandb" in training_args.report_to and training_args.process_index == 0: - wandb.init(name=wandb_name, project="sentence", entity="markus_583") - wandb.config.update(args) - wandb.config.update(training_args) - wandb.config.update(label_args) - - model.config.wandb_run_id = wandb.run.id - - for file in glob(os.path.join(os.path.dirname(__file__), "*.py")): - wandb.save(os.path.abspath(file), policy="now") - # also 1 dir above - wandb.save(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", file)), policy="now") - - label_dict = get_subword_label_dict(label_args, tokenizer) if args.use_subwords else get_label_dict(label_args) - logger.info(f"Label dict has {len(label_dict)} entries.") - - # needed in the trainer - training_args.adapter_warmup_steps = args.adapter_warmup_steps - training_args.adapter_lr_multiplier = args.adapter_lr_multiplier - - # give .map in multiprocessing enough of time to finish, to be safe - time.sleep(10) - if training_args.local_rank == 0: - # since both share the *same* cache_dir, we cannot simply call dataset.cleanup_cache_files() - # because that would remove the cache files of the other dataset! - cleanup_cache_files([train_dataset, valid_dataset]) - logger.warning("Cleaned up cache files.") - time.sleep(10) - - trainer = Trainer( - model, - training_args, - train_dataset=train_dataset, - eval_dataset=valid_dataset, - compute_metrics=compute_metrics, - data_collator=partial( - collate_fn, - args=args, - label_args=label_args, - label_dict=label_dict, - tokenizer=tokenizer if args.use_subwords else None, - add_lang_ids=not args.use_subwords, - ), - ) - - trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) - trainer.save_model() - trainer.save_state() - # Pattern for checkpoint directories - checkpoint_pattern = os.path.join(training_args.output_dir, "checkpoint-*") - - # Use glob.glob to find all directories matching the pattern - for checkpoint_dir in glob(checkpoint_pattern): - if os.path.isdir(checkpoint_dir): - shutil.rmtree(checkpoint_dir) - - -def _mp_fn(index): - # For xla_spawn (TPUs) - main() - - -if __name__ == "__main__": - # try: - main() - # except Exception: - # # extype, value, tb = sys.exc_info() - # # tb.print_exc() - # # pdb.post_mortem(tb) - # pass diff --git a/wtpsplit/train/trainer.py b/wtpsplit/train/trainer.py index ddb7f6f6..ca25780b 100644 --- a/wtpsplit/train/trainer.py +++ b/wtpsplit/train/trainer.py @@ -131,7 +131,7 @@ def create_scheduler(self, num_training_steps: int, optimizer: torch.optim.Optim if self.lr_scheduler is None: warmup_steps = self.args.get_warmup_steps(self.args.max_steps) - # MODIFIED: add lang adapter lr scheduler + # MODIFIED: add lang adapter lr scheduler (wtp only) def lr_lambda(current_step: int): if current_step < self.args.adapter_warmup_steps: return 0.0 diff --git a/wtpsplit/utils.py b/wtpsplit/utils.py index 024764c3..cfae534a 100644 --- a/wtpsplit/utils.py +++ b/wtpsplit/utils.py @@ -83,13 +83,9 @@ class LabelArgs: custom_punctuation_file: str = None retain_first_consecutive_punctuation: bool = True non_whitespace_remove_spaces: bool = True - case_corruption_prob_after_newline: float = 0.0 - case_corruption_prob_after_punct: float = 0.0 corrupt_entire_chunk_prob: float = 0.0 corrupt_entire_chunk_strategy: str = "mix" corrupt_entire_chunk_prob_full: float = 0.5 - use_all_labels: bool = False - use_all_labels_max_length: int = 3 def __post_init__(self): if self.custom_punctuation_file: @@ -120,24 +116,9 @@ def get_subword_label_dict(label_args, tokenizer): for i, c in enumerate(Constants.PUNCTUATION_CHARS): token_id = tokenizer.convert_tokens_to_ids(c) label_dict[token_id] = 1 + Constants.AUX_OFFSET + i - # logger.info( - # f"auxiliary character {c} has token ID {token_id} and label {label_dict[token_id]}, decoded: {tokenizer.decode([token_id])}" - # ) + if token_id == tokenizer.unk_token_id: n_unks += 1 - if label_args.use_all_labels: - # check where c is in tokenizer.vocab keys - for i_t, (token, token_idx) in enumerate(tokenizer.vocab.items()): - if ( - c in token - and token_idx not in label_dict - and len(token) < label_args.use_all_labels_max_length - and not any(i.isdigit() for i in token) - ): - label_dict[token_idx] = 1 + Constants.AUX_OFFSET + i - # logger.warning( - # f"Special auxiliary character {c} has token ID {token_idx} and label {label_dict[token_idx]}, decoded: {tokenizer.decode([token_idx])}" - # ) logger.info(f"found {n_unks} UNK tokens in auxiliary characters") @@ -258,7 +239,6 @@ def corrupt_asr(text: str, lang): return corrupted_sentences -# does the steps in Figure 2 of the paper def corrupt_training( input_ids, block_ids, @@ -274,7 +254,7 @@ def corrupt_training( if random.random() < label_args.corrupt_entire_chunk_prob: # choose corruption strategy if label_args.corrupt_entire_chunk_strategy == "mix": - corrupt_strategy = "full" if random.random() < label_args.corrupt_entire_chunk_prob_full else "asr" + corrupt_strategy = "full" # if random.random() < label_args.corrupt_entire_chunk_prob_full else "asr" else: corrupt_strategy = label_args.corrupt_entire_chunk_strategy @@ -380,15 +360,12 @@ def corrupt_training( del input_ids[i + 1] del labels[i + 1] del block_ids[i + 1] - if random.random() < label_args.case_corruption_prob_after_newline and i + 1 < len(input_ids): - input_ids, labels, block_ids = _corrupt_case(tokenizer, input_ids, labels, block_ids, i) elif label_args.use_auxiliary and labels[i] > Constants.AUX_OFFSET: # auxiliary if pack_samples: raise NotImplementedError() if random.random() < auxiliary_remove_prob: - removed_aux_char = False if label_args.retain_first_consecutive_punctuation: # remove only if the next token is not a newline # this retains the current auxiliary character, even though we decided to remove it @@ -397,20 +374,12 @@ def corrupt_training( del input_ids[i + 1] del labels[i + 1] del block_ids[i + 1] - removed_aux_char = True else: # in case of something like ".\n", this removes the "." and the \n label (=1) # so the newline in the text is kept, but the label is removed! del input_ids[i + 1] del labels[i + 1] del block_ids[i + 1] - removed_aux_char = True - if ( - random.random() < label_args.case_corruption_prob_after_punct - and removed_aux_char - and i + 1 < len(input_ids) - ): - input_ids, labels, block_ids = _corrupt_case(tokenizer, input_ids, labels, block_ids, i) try: i = i + 1 + next(index for index, label in enumerate(labels[i + 1 :]) if label != 0) @@ -420,42 +389,6 @@ def corrupt_training( return input_ids, block_ids, labels -def _corrupt_case(tokenizer: AutoTokenizer, input_ids: List[int], labels: List[int], block_ids: List[int], i: int): - if not tokenizer: - raise NotImplementedError() - - token = tokenizer.convert_ids_to_tokens(input_ids[i + 1]) - insert_ = token.startswith("▁") - if insert_: - token = token[1:] - - do_exchange = token.istitle() - if do_exchange: - token = token.lower() - - # Re-tokenize - token_ids = tokenizer.encode(token if not insert_ else "▁" + token, add_special_tokens=False) - if len(token_ids) == 0 or input_ids[i + 1] == tokenizer.unk_token_id: - # UNK or whitespace token, remove it - del input_ids[i + 1] - del labels[i + 1] - del block_ids[i + 1] - else: - if token_ids[0] == tokenizer.convert_tokens_to_ids("▁"): - token_ids = token_ids[1:] - elif len(token_ids) > 1: - # Replace the token with the remaining token - input_ids[i + 1] = token_ids[0] - for token_id in token_ids[1:]: - input_ids.insert(i + 2, token_id) - labels.insert(i + 2, 0) - block_ids.insert(i + 2, block_ids[i + 1]) - elif len(token_ids) == 1: - input_ids[i + 1] = token_ids[0] - - return input_ids, labels, block_ids - - def indices_to_sentences(text, indices, strip_whitespace=False): sentences = [] @@ -537,9 +470,6 @@ def reconstruct_sentences(text, partial_sentences): use_auxiliary=True, auxiliary_remove_prob=1.0, newline_whitespace_prob=1.0, - case_corruption_prob_after_punct=1.0, - case_corruption_prob_after_newline=1.0, - use_all_labels=True, ) label_dict = get_subword_label_dict(label_args, tokenizer) print(len(label_dict)) @@ -551,26 +481,7 @@ def reconstruct_sentences(text, partial_sentences): input_ids, block_ids, labels = corrupt_training( input_ids, block_ids, "en", label_args, label_dict, tokenizer=tokenizer ) - # print(input_ids) - # print(labels) - # print(tokenizer.tokenize(text)) - # print([(tokenizer.decode([input_id]), label) for input_id, label in zip(input_ids, labels)]) - # print("newline labels in text:") - # print(np.where(np.array(labels) == 1)) - # print("newline ids in output text:") - # print(np.where(np.array(input_ids) == tokenizer.all_special_ids[-1])) - # print(tokenizer.decode(input_ids)) - # print(tokenizer.decode(input_ids)) - - # ords = [ord(c) for c in text] - # block_ords = [0] * len(ords) - # label_args = LabelArgs(use_auxiliary=True, auxiliary_remove_prob=1.0) - # label_dict = get_label_dict(label_args) - - # ords, block_ords, labels = corrupt(ords, block_ords, "en", label_args, label_dict) - # print("ords", ords) - # print("labels", labels) - # print("newline labels in text:") - # print(np.where(np.array(labels) == 1)) - # print("newline ids in output text:") - # print(np.where(np.array([ord("\n")]) == ords)) + print(input_ids) + print(labels) + print(tokenizer.tokenize(text)) + print([(tokenizer.decode([input_id]), label) for input_id, label in zip(input_ids, labels)])