From 546925a2f49545198ecfff57b333d2f45de2437f Mon Sep 17 00:00:00 2001 From: James Bishop Date: Wed, 20 Nov 2024 17:13:16 +0000 Subject: [PATCH 01/10] allow non-translation tasks --- scripts/variational_RTC_example.py | 28 ++---- src/arc_spice/data/multieurlex_utils.py | 118 ++++++++++++++---------- src/arc_spice/utils.py | 15 +++ 3 files changed, 90 insertions(+), 71 deletions(-) create mode 100644 src/arc_spice/utils.py diff --git a/scripts/variational_RTC_example.py b/scripts/variational_RTC_example.py index 5c615f1..8475d94 100644 --- a/scripts/variational_RTC_example.py +++ b/scripts/variational_RTC_example.py @@ -3,37 +3,26 @@ """ import logging -import os -import random from random import randint -import numpy as np import torch from torch.nn.functional import binary_cross_entropy -from arc_spice.data.multieurlex_utils import MultiHot, load_multieurlex +from arc_spice.data.multieurlex_utils import MultiHot, load_multieurlex_for_translation from arc_spice.eval.classification_error import hamming_accuracy from arc_spice.eval.translation_error import get_comet_model +from arc_spice.utils import seed_everything from arc_spice.variational_pipelines.RTC_variational_pipeline import ( RTCVariationalPipeline, ) -def seed_everything(seed): - torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.benchmark = False - np.random.seed(seed) - random.seed(seed) - os.environ["PYTHONHASHSEED"] = str(seed) - - def load_test_row(): lang_pair = {"source": "fr", "target": "en"} - (train, _, _), metadata_params = load_multieurlex( + dataset_dict, metadata_params = load_multieurlex_for_translation( data_dir="data", level=1, lang_pair=lang_pair ) + train = dataset_dict["train"] multi_onehot = MultiHot(metadata_params["n_classes"]) test_row = get_test_row(train) class_labels = multi_onehot(test_row["class_labels"]) @@ -41,10 +30,6 @@ def load_test_row(): def get_test_row(train_data): - row_iterator = iter(train_data) - for _ in range(randint(1, 25)): - test_row = next(row_iterator) - # debug row if needed return { "source_text": ( @@ -57,7 +42,10 @@ def get_test_row(train_data): ), "class_labels": [0, 1], } - # Normal row + ## Normal row + row_iterator = iter(train_data) + for _ in range(randint(1, 25)): + test_row = next(row_iterator) return test_row diff --git a/src/arc_spice/data/multieurlex_utils.py b/src/arc_spice/data/multieurlex_utils.py index bbdedaa..2d519ac 100644 --- a/src/arc_spice/data/multieurlex_utils.py +++ b/src/arc_spice/data/multieurlex_utils.py @@ -1,7 +1,8 @@ import json +from typing import Any import torch -from datasets import load_dataset +from datasets import DatasetDict, load_dataset from datasets.formatting.formatting import LazyRow from torch.nn.functional import one_hot @@ -34,23 +35,20 @@ def _extract_articles(text: str, article_1_marker: str): return text[start:] -def extract_articles(item: LazyRow, lang_pair: dict[str, str]): - lang_source = lang_pair["source"] - lang_target = lang_pair["target"] +def extract_articles(item: LazyRow, languages: list[str]) -> dict[str, dict[str, str]]: return { - "source_text": _extract_articles( - text=item["source_text"], - article_1_marker=ARTICLE_1_MARKERS[lang_source], - ), - "target_text": _extract_articles( - text=item["target_text"], - article_1_marker=ARTICLE_1_MARKERS[lang_target], - ), + "text": { + lang: _extract_articles( + text=item["text"][lang], + article_1_marker=ARTICLE_1_MARKERS[lang], + ) + for lang in languages + } } -class PreProcesser: - """Function to preprocess the data, for the purposes of removing unused languages""" +class TranslationPreProcesser: + """Prepares the data for the translation task""" def __init__(self, language_pair: dict[str, str]) -> None: self.source_language = language_pair["source"] @@ -70,28 +68,13 @@ def __call__( """ source_text = data_row["text"][self.source_language] target_text = data_row["text"][self.target_language] - labels = data_row["labels"] return { "source_text": source_text, "target_text": target_text, - "class_labels": labels, } -def load_multieurlex( - data_dir: str, level: int, lang_pair: dict[str, str] -) -> tuple[list, dict[str, int | list]]: - """ - load the multieurlex dataset - - Args: - data_dir: root directory for the dataset class descriptors and concepts - level: level of hierarchy/specicifity of the labels - lang_pair: dictionary specifying the language pair. - - Returns: - List of datasets and a dictionary with some metadata information - """ +def load_mutlieurlex_metadata(data_dir: str, level: int) -> dict[str, Any]: assert level in [1, 2, 3], "there are 3 levels of hierarchy: 1,2,3." with open(f"{data_dir}/MultiEURLEX/data/eurovoc_concepts.json") as concepts_file: class_concepts = json.loads(concepts_file.read()) @@ -103,35 +86,68 @@ def load_multieurlex( class_descriptors = json.loads(descriptors_file.read()) descriptors_file.close() # format level for the class descriptor dictionary, add these to a list - classes = class_concepts[level] + classes = class_concepts[f"level_{level}"] descriptors = [] for class_id in classes: descriptors.append(class_descriptors[class_id]) - # load the dataset with huggingface API - data = load_dataset( - "multi_eurlex", - "all_languages", - label_level=f"level_{level}", - trust_remote_code=True, - ) # define metadata - meta_data = { + return { "n_classes": len(classes), "class_labels": classes, "class_descriptors": descriptors, } - # instantiate the preprocessor - preprocesser = PreProcesser(lang_pair) - # preprocess each split - dataset = data.map(preprocesser, remove_columns=["text"]) - extracted_dataset = dataset.map( - extract_articles, - fn_kwargs={"lang_pair": lang_pair}, + + +def load_multieurlex( + data_dir: str, + level: int, + languages: list[str], +) -> tuple[DatasetDict, dict[str, Any]]: + """ + load the multieurlex dataset + + Args: + data_dir: root directory for the dataset class descriptors and concepts + level: level of hierarchy/specicifity of the labels + languages: a list of iso codes for languages to be used + + Returns: + List of datasets and a dictionary with some metadata information + """ + metadata = load_mutlieurlex_metadata(data_dir=data_dir, level=level) + + # load the dataset with huggingface API + if isinstance(languages, list): + if len(languages) == 0: + msg = "languages list cannot be empty" + raise Exception(msg) + + load_langs = languages[0] if len(languages) == 1 else "all_languages" + + dataset_dict = load_dataset( + "multi_eurlex", + load_langs, + label_level=f"level_{level}", + trust_remote_code=True, ) + + dataset_dict = dataset_dict.map( + extract_articles, fn_kwargs={"languages": languages} + ) + # return datasets and metadata - return [ - extracted_dataset["train"], - extracted_dataset["test"], - extracted_dataset["validation"], - ], meta_data + return dataset_dict, metadata + + +def load_multieurlex_for_translation( + data_dir: str, level: int, lang_pair: dict[str, str] +) -> tuple[DatasetDict, dict[str, Any]]: + langs = [lang_pair["source"], lang_pair["target"]] + dataset_dict, meta_data = load_multieurlex( + data_dir=data_dir, level=level, languages=langs + ) + # instantiate the preprocessor + preprocesser = TranslationPreProcesser(lang_pair) + # preprocess each split + return dataset_dict.map(preprocesser, remove_columns=["text"]), meta_data diff --git a/src/arc_spice/utils.py b/src/arc_spice/utils.py new file mode 100644 index 0000000..d3430c1 --- /dev/null +++ b/src/arc_spice/utils.py @@ -0,0 +1,15 @@ +import os +import random + +import numpy as np +import torch + + +def seed_everything(seed): + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + np.random.seed(seed) + random.seed(seed) + os.environ["PYTHONHASHSEED"] = str(seed) From 7cfbc094de61adfedb50525ac380231fdffc6ade Mon Sep 17 00:00:00 2001 From: James Bishop Date: Wed, 20 Nov 2024 19:13:47 +0000 Subject: [PATCH 02/10] handle single lang case --- src/arc_spice/data/multieurlex_utils.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/arc_spice/data/multieurlex_utils.py b/src/arc_spice/data/multieurlex_utils.py index 2d519ac..524e8f5 100644 --- a/src/arc_spice/data/multieurlex_utils.py +++ b/src/arc_spice/data/multieurlex_utils.py @@ -35,7 +35,19 @@ def _extract_articles(text: str, article_1_marker: str): return text[start:] -def extract_articles(item: LazyRow, languages: list[str]) -> dict[str, dict[str, str]]: +def extract_articles( + item: LazyRow, languages: list[str] +) -> dict[str, str] | dict[str, dict[str, str]]: + # single lang has different structure that isn't nested + if len(languages) == 1 and isinstance(item["text"], str): + return { + "text": _extract_articles( + text=item["text"], + article_1_marker=ARTICLE_1_MARKERS[languages[0]], + ) + } + + # else return { "text": { lang: _extract_articles( From 176b6b6f1ab81a3fbfe2eca626036a84aa33300a Mon Sep 17 00:00:00 2001 From: James Bishop Date: Wed, 20 Nov 2024 19:14:47 +0000 Subject: [PATCH 03/10] rm reqs file --- requirements.txt | 173 ----------------------------------------------- 1 file changed, 173 deletions(-) delete mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 4e0d0d1..0000000 --- a/requirements.txt +++ /dev/null @@ -1,173 +0,0 @@ -absl-py==2.1.0 -aiohappyeyeballs==2.4.0 -aiohttp==3.10.6 -aiosignal==1.3.1 -annotated-types==0.7.0 -appnope==0.1.4 --e git+https://github.com/alan-turing-institute/ARC-SPICE.git@1ae06a2e9bff17854af1aa01cb6d068642b69358#egg=ARC_SPICE -asttokens==2.4.1 -async-timeout==4.0.3 -attrs==24.2.0 -audioread==3.0.1 -blis==1.0.1 -boto3==1.35.26 -botocore==1.35.26 -catalogue==2.0.10 -certifi==2024.8.30 -cffi==1.17.1 -charset-normalizer==3.3.2 -click==8.1.7 -cloudpathlib==0.20.0 -colorama==0.4.6 -comm==0.2.2 -confection==0.1.5 -contourpy==1.3.0 -cycler==0.12.1 -cymem==2.0.8 -datasets==3.0.0 -debugpy==1.8.6 -decorator==5.1.1 -dill==0.3.8 -en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl#sha256=1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85 -entmax==1.3 -exceptiongroup==1.2.2 -executing==2.1.0 -filelock==3.16.1 -filetype==1.2.0 -fire==0.6.0 -fonttools==4.54.1 -frozenlist==1.4.1 -fsspec==2024.6.1 -gitdb==4.0.11 -GitPython==3.1.43 -grpcio==1.66.1 -huggingface==0.0.1 -huggingface-hub==0.25.1 -hypothesis==6.112.2 -idna==3.7 -ipykernel==6.29.5 -ipython==8.27.0 -jedi==0.19.1 -Jinja2==3.1.4 -jmespath==1.0.1 -joblib==1.4.2 -jsonargparse==3.13.1 -jupyter_client==8.6.3 -jupyter_core==5.7.2 -kenlm==0.2.0 -kiwisolver==1.4.7 -langcodes==3.4.1 -language_data==1.2.0 -lazy_loader==0.4 -librosa==0.10.2.post1 -lightning-utilities==0.11.8 -llvmlite==0.43.0 -lxml==5.3.0 -marisa-trie==1.2.1 -Markdown==3.7 -markdown-it-py==3.0.0 -MarkupSafe==2.1.5 -matplotlib==3.9.2 -matplotlib-inline==0.1.7 -mdurl==0.1.2 -mpmath==1.3.0 -msgpack==1.1.0 -multidict==6.1.0 -multiprocess==0.70.16 -murmurhash==1.0.10 -nest-asyncio==1.6.0 -networkx==3.3 -nltk==3.9.1 -numba==0.60.0 -numpy==1.26.4 -opencv-python==4.9.0.80 -opencv-python-headless==4.10.0.84 -packaging==24.1 -pandas==2.2.3 -parso==0.8.4 -pexpect==4.9.0 -pillow==10.4.0 -platformdirs==4.3.6 -pooch==1.8.2 -portalocker==2.10.1 -preshed==3.0.9 -prompt_toolkit==3.0.48 -protobuf==4.25.5 -psutil==6.0.0 -ptyprocess==0.7.0 -pure_eval==0.2.3 -py-cpuinfo==9.0.0 -pyarrow==17.0.0 -pybboxes==0.1.6 -pycparser==2.22 -pyctcdecode==0.5.0 -pydantic==2.9.2 -pydantic_core==2.23.4 -Pygments==2.18.0 -pygtrie==2.5.0 -pyparsing==3.1.4 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -pytorch-lightning==2.4.0 -pytz==2024.2 -PyYAML==6.0.2 -pyzmq==26.2.0 -regex==2024.9.11 -requests==2.32.3 -requests-toolbelt==1.0.0 -rich==13.9.3 -roboflow==1.1.45 -s3transfer==0.10.2 -sacrebleu==2.4.3 -safetensors==0.4.5 -sahi==0.11.18 -scikit-learn==1.5.2 -scipy==1.14.1 -seaborn==0.13.2 -sentencepiece==0.1.99 -shapely==2.0.6 -shellingham==1.5.4 -six==1.16.0 -smart-open==7.0.5 -smmap==5.0.1 -sortedcontainers==2.4.0 -soundfile==0.12.1 -soxr==0.5.0.post1 -spacy==3.8.2 -spacy-legacy==3.0.12 -spacy-loggers==1.0.5 -srsly==2.4.8 -stack-data==0.6.3 -sympy==1.13.3 -tabulate==0.9.0 -tensorboard==2.17.1 -tensorboard-data-server==0.7.2 -termcolor==2.4.0 -terminaltables==3.1.10 -thinc==8.3.2 -thop==0.1.1.post2209072238 -threadpoolctl==3.5.0 -tokenizers==0.19.1 -torch==2.4.1 -torcheval==0.0.7 -torchmetrics==0.10.3 -torchvision==0.19.1 -tornado==6.4.1 -tqdm==4.66.5 -traitlets==5.14.3 -transformers==4.44.2 -typer==0.12.5 -typing_extensions==4.12.2 -tzdata==2024.2 -ultralytics==8.2.101 -ultralytics-thop==2.0.8 -unbabel-comet==2.2.2 -urllib3==2.2.3 -wasabi==1.1.3 -wcwidth==0.2.13 -weasel==0.4.1 -Werkzeug==3.0.4 -wrapt==1.16.0 -xxhash==3.5.0 -yarl==1.12.1 -yolov5==7.0.13 From a5d9395cf9f73dec4e08d821c432fbfd5844533c Mon Sep 17 00:00:00 2001 From: James Bishop Date: Wed, 20 Nov 2024 20:36:55 +0000 Subject: [PATCH 04/10] drop empty rows --- src/arc_spice/data/multieurlex_utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/arc_spice/data/multieurlex_utils.py b/src/arc_spice/data/multieurlex_utils.py index 524e8f5..750d664 100644 --- a/src/arc_spice/data/multieurlex_utils.py +++ b/src/arc_spice/data/multieurlex_utils.py @@ -115,6 +115,7 @@ def load_multieurlex( data_dir: str, level: int, languages: list[str], + drop_empty: bool = True, ) -> tuple[DatasetDict, dict[str, Any]]: """ load the multieurlex dataset @@ -148,16 +149,19 @@ def load_multieurlex( extract_articles, fn_kwargs={"languages": languages} ) + if drop_empty: + dataset_dict = dataset_dict.filter(lambda x: x["text"] is not None) + # return datasets and metadata return dataset_dict, metadata def load_multieurlex_for_translation( - data_dir: str, level: int, lang_pair: dict[str, str] + data_dir: str, level: int, lang_pair: dict[str, str], drop_empty: bool = True ) -> tuple[DatasetDict, dict[str, Any]]: langs = [lang_pair["source"], lang_pair["target"]] dataset_dict, meta_data = load_multieurlex( - data_dir=data_dir, level=level, languages=langs + data_dir=data_dir, level=level, languages=langs, drop_empty=drop_empty ) # instantiate the preprocessor preprocesser = TranslationPreProcesser(lang_pair) From 5dd7e2466ee2d7481bc97c0b018e011621222eb6 Mon Sep 17 00:00:00 2001 From: James Bishop Date: Thu, 21 Nov 2024 19:52:36 +0000 Subject: [PATCH 05/10] multieurlex tests --- scripts/create_test_ds.py | 128 ++++++++++++++++++ src/arc_spice/data/multieurlex_utils.py | 16 ++- tests/test_multieurlex_utils.py | 102 ++++++++++++++ .../MultiEURLEX/data/eurovoc_concepts.json | 5 + .../MultiEURLEX/data/eurovoc_descriptors.json | 5 + .../testdata/base_testdata/dataset_info.json | 72 ++++++++++ .../base_testdata/dataset_info_en.json | 68 ++++++++++ .../MultiEURLEX/data/eurovoc_concepts.json | 5 + .../MultiEURLEX/data/eurovoc_descriptors.json | 5 + .../multieurlex_test/dataset_dict.json | 1 + .../test/cache-0e959431151cbf5d.arrow | Bin 0 -> 480 bytes .../test/cache-ad7a18f1fda40fdd.arrow | Bin 0 -> 2304 bytes .../test/data-00000-of-00001.arrow | Bin 0 -> 2776 bytes .../multieurlex_test/test/dataset_info.json | 72 ++++++++++ .../testdata/multieurlex_test/test/state.json | 13 ++ .../train/cache-111f8003398a2c8c.arrow | Bin 0 -> 480 bytes .../train/cache-9f1f8bb94edbe6b0.arrow | Bin 0 -> 2256 bytes .../train/data-00000-of-00001.arrow | Bin 0 -> 2728 bytes .../multieurlex_test/train/dataset_info.json | 72 ++++++++++ .../multieurlex_test/train/state.json | 13 ++ .../validation/cache-6cb654ae0f80f1b4.arrow | Bin 0 -> 2264 bytes .../validation/cache-eeed225c5ca2d2ec.arrow | Bin 0 -> 480 bytes .../validation/data-00000-of-00001.arrow | Bin 0 -> 2736 bytes .../validation/dataset_info.json | 72 ++++++++++ .../multieurlex_test/validation/state.json | 13 ++ .../MultiEURLEX/data/eurovoc_concepts.json | 5 + .../MultiEURLEX/data/eurovoc_descriptors.json | 5 + .../multieurlex_test_en/dataset_dict.json | 1 + .../test/cache-9689fb25e03c612c.arrow | Bin 0 -> 1592 bytes .../test/cache-fdb68d221b295d16.arrow | Bin 0 -> 480 bytes .../test/data-00000-of-00001.arrow | Bin 0 -> 1704 bytes .../test/dataset_info.json | 68 ++++++++++ .../multieurlex_test_en/test/state.json | 13 ++ .../train/cache-24995d769a69f928.arrow | Bin 0 -> 1544 bytes .../train/cache-fed4751c7a74f655.arrow | Bin 0 -> 480 bytes .../train/data-00000-of-00001.arrow | Bin 0 -> 1656 bytes .../train/dataset_info.json | 68 ++++++++++ .../multieurlex_test_en/train/state.json | 13 ++ .../validation/cache-a9e7f9d0c9607947.arrow | Bin 0 -> 1552 bytes .../validation/cache-cb2e97c59bb72892.arrow | Bin 0 -> 480 bytes .../validation/data-00000-of-00001.arrow | Bin 0 -> 1664 bytes .../validation/dataset_info.json | 68 ++++++++++ .../multieurlex_test_en/validation/state.json | 13 ++ 43 files changed, 913 insertions(+), 3 deletions(-) create mode 100644 scripts/create_test_ds.py create mode 100644 tests/test_multieurlex_utils.py create mode 100644 tests/testdata/base_testdata/MultiEURLEX/data/eurovoc_concepts.json create mode 100644 tests/testdata/base_testdata/MultiEURLEX/data/eurovoc_descriptors.json create mode 100644 tests/testdata/base_testdata/dataset_info.json create mode 100644 tests/testdata/base_testdata/dataset_info_en.json create mode 100644 tests/testdata/multieurlex_test/MultiEURLEX/data/eurovoc_concepts.json create mode 100644 tests/testdata/multieurlex_test/MultiEURLEX/data/eurovoc_descriptors.json create mode 100644 tests/testdata/multieurlex_test/dataset_dict.json create mode 100644 tests/testdata/multieurlex_test/test/cache-0e959431151cbf5d.arrow create mode 100644 tests/testdata/multieurlex_test/test/cache-ad7a18f1fda40fdd.arrow create mode 100644 tests/testdata/multieurlex_test/test/data-00000-of-00001.arrow create mode 100644 tests/testdata/multieurlex_test/test/dataset_info.json create mode 100644 tests/testdata/multieurlex_test/test/state.json create mode 100644 tests/testdata/multieurlex_test/train/cache-111f8003398a2c8c.arrow create mode 100644 tests/testdata/multieurlex_test/train/cache-9f1f8bb94edbe6b0.arrow create mode 100644 tests/testdata/multieurlex_test/train/data-00000-of-00001.arrow create mode 100644 tests/testdata/multieurlex_test/train/dataset_info.json create mode 100644 tests/testdata/multieurlex_test/train/state.json create mode 100644 tests/testdata/multieurlex_test/validation/cache-6cb654ae0f80f1b4.arrow create mode 100644 tests/testdata/multieurlex_test/validation/cache-eeed225c5ca2d2ec.arrow create mode 100644 tests/testdata/multieurlex_test/validation/data-00000-of-00001.arrow create mode 100644 tests/testdata/multieurlex_test/validation/dataset_info.json create mode 100644 tests/testdata/multieurlex_test/validation/state.json create mode 100644 tests/testdata/multieurlex_test_en/MultiEURLEX/data/eurovoc_concepts.json create mode 100644 tests/testdata/multieurlex_test_en/MultiEURLEX/data/eurovoc_descriptors.json create mode 100644 tests/testdata/multieurlex_test_en/dataset_dict.json create mode 100644 tests/testdata/multieurlex_test_en/test/cache-9689fb25e03c612c.arrow create mode 100644 tests/testdata/multieurlex_test_en/test/cache-fdb68d221b295d16.arrow create mode 100644 tests/testdata/multieurlex_test_en/test/data-00000-of-00001.arrow create mode 100644 tests/testdata/multieurlex_test_en/test/dataset_info.json create mode 100644 tests/testdata/multieurlex_test_en/test/state.json create mode 100644 tests/testdata/multieurlex_test_en/train/cache-24995d769a69f928.arrow create mode 100644 tests/testdata/multieurlex_test_en/train/cache-fed4751c7a74f655.arrow create mode 100644 tests/testdata/multieurlex_test_en/train/data-00000-of-00001.arrow create mode 100644 tests/testdata/multieurlex_test_en/train/dataset_info.json create mode 100644 tests/testdata/multieurlex_test_en/train/state.json create mode 100644 tests/testdata/multieurlex_test_en/validation/cache-a9e7f9d0c9607947.arrow create mode 100644 tests/testdata/multieurlex_test_en/validation/cache-cb2e97c59bb72892.arrow create mode 100644 tests/testdata/multieurlex_test_en/validation/data-00000-of-00001.arrow create mode 100644 tests/testdata/multieurlex_test_en/validation/dataset_info.json create mode 100644 tests/testdata/multieurlex_test_en/validation/state.json diff --git a/scripts/create_test_ds.py b/scripts/create_test_ds.py new file mode 100644 index 0000000..488a061 --- /dev/null +++ b/scripts/create_test_ds.py @@ -0,0 +1,128 @@ +import os +import shutil + +import datasets +from datasets import load_dataset + +from arc_spice.data import multieurlex_utils + +PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +TESTDATA_DIR = os.path.join(PROJECT_ROOT, "tests/testdata") +BASE_DATASET_INFO_MULTILANG = os.path.join( + TESTDATA_DIR, "base_testdata/dataset_info.json" +) +BASE_DATASET_INFO_EN = os.path.join(TESTDATA_DIR, "base_testdata/dataset_info_en.json") +BASE_DATASET_METADATA_DIR = os.path.join(TESTDATA_DIR, "base_testdata/MultiEURLEX") + +# TODO +CONTENT_MULTILANG: list[dict[str, str]] = [ + { + "en": f"Some text before the marker 1 {multieurlex_utils.ARTICLE_1_MARKERS['en']} Some text after the marker 1", # noqa: E501 + "fr": f"Some text before the marker 1 {multieurlex_utils.ARTICLE_1_MARKERS['fr']} Some text after the marker 1", # noqa: E501 + "de": f"Some text before the marker 1 {multieurlex_utils.ARTICLE_1_MARKERS['de']} Some text after the marker 1", # noqa: E501 + }, + { + "en": f"Some text before the marker 2 {multieurlex_utils.ARTICLE_1_MARKERS['en']} Some text after the marker 2", # noqa: E501 + "fr": f"Some text before the marker 2 {multieurlex_utils.ARTICLE_1_MARKERS['fr']} Some text after the marker 2", # noqa: E501 + "de": f"Some text before the marker 2 {multieurlex_utils.ARTICLE_1_MARKERS['de']} Some text after the marker 2", # noqa: E501 + }, + { + "en": "Some text before the marker 3", # no marker, no text after marker + "fr": "Some text before the marker 3", + "de": "Some text before the marker 3", + }, + { + "en": f"Some text before the marker 4 {multieurlex_utils.ARTICLE_1_MARKERS['en']} Some text after the marker 4", # noqa: E501 + "fr": f"Some text before the marker 4 {multieurlex_utils.ARTICLE_1_MARKERS['fr']} Some text after the marker 4", # noqa: E501 + "de": f"Some text before the marker 4 {multieurlex_utils.ARTICLE_1_MARKERS['de']} Some text after the marker 4", # noqa: E501 + }, + { + "en": f"Some text before the marker 5 {multieurlex_utils.ARTICLE_1_MARKERS['en']} Some text after the marker 5", # noqa: E501 + "fr": f"Some text before the marker 5 {multieurlex_utils.ARTICLE_1_MARKERS['fr']} Some text after the marker 5", # noqa: E501 + "de": f"Some text before the marker 5 {multieurlex_utils.ARTICLE_1_MARKERS['de']} Some text after the marker 5", # noqa: E501 + }, +] +CONTENT_EN: list[str] = [ + f"Some text before the marker 1 {multieurlex_utils.ARTICLE_1_MARKERS['en']} Some text after the marker 1", # noqa: E501 + f"Some text before the marker 2 {multieurlex_utils.ARTICLE_1_MARKERS['en']} Some text after the marker 2", # noqa: E501 + "Some text before the marker 3", # no marker, no text after marker + f"Some text before the marker 4 {multieurlex_utils.ARTICLE_1_MARKERS['en']} Some text after the marker 4", # noqa: E501 + f"Some text before the marker 5 {multieurlex_utils.ARTICLE_1_MARKERS['en']} Some text after the marker 5", # noqa: E501 +] + + +def overwrite_text( + _orig, + i: int, + content: list[dict[str, str]] | list[str], +) -> dict[str, str | dict[str, str]]: + return {"text": content[i]} + + +def create_test_ds( + testdata_dir: str, + ds_name: str, + content: list[dict[str, str]] | list[str], + dataset_info_fpath: str, +) -> None: + dataset = load_dataset( + "multi_eurlex", + "all_languages", + label_level="level_1", + trust_remote_code=True, + ) + + dataset["train"] = dataset["train"].take(5) + dataset["validation"] = dataset["validation"].take(5) + dataset["test"] = dataset["test"].take(5) + + dataset = dataset.map( + overwrite_text, + with_indices=True, + fn_kwargs={"content": content}, + ) + + dataset.save_to_disk(os.path.join(testdata_dir, ds_name)) + + shutil.copy( + dataset_info_fpath, + os.path.join(testdata_dir, ds_name, "train/dataset_info.json"), + ) + shutil.copy( + dataset_info_fpath, + os.path.join(testdata_dir, ds_name, "validation/dataset_info.json"), + ) + shutil.copy( + dataset_info_fpath, + os.path.join(testdata_dir, ds_name, "test/dataset_info.json"), + ) + # metadata copy + shutil.copytree( + BASE_DATASET_METADATA_DIR, + os.path.join(testdata_dir, ds_name, "MultiEURLEX"), + ) + + assert datasets.load_from_disk(os.path.join(testdata_dir, ds_name)) is not None + + +if __name__ == "__main__": + os.makedirs(TESTDATA_DIR, exist_ok=True) + + content = [ + "Some text before the marker en Some text after the marker", + "Some text before the marker fr Some text after the marker", + ] + + create_test_ds( + testdata_dir=TESTDATA_DIR, + ds_name="multieurlex_test", + content=CONTENT_MULTILANG, + dataset_info_fpath=BASE_DATASET_INFO_MULTILANG, + ) + + create_test_ds( + testdata_dir=TESTDATA_DIR, + ds_name="multieurlex_test_en", + content=CONTENT_EN, + dataset_info_fpath=BASE_DATASET_INFO_EN, + ) diff --git a/src/arc_spice/data/multieurlex_utils.py b/src/arc_spice/data/multieurlex_utils.py index 750d664..e38fdab 100644 --- a/src/arc_spice/data/multieurlex_utils.py +++ b/src/arc_spice/data/multieurlex_utils.py @@ -7,7 +7,11 @@ from torch.nn.functional import one_hot # For identifying where the adopted decisions begin -ARTICLE_1_MARKERS = {"en": "\nArticle 1\n", "fr": "\nArticle premier\n"} +ARTICLE_1_MARKERS = { + "en": "\nArticle 1\n", + "fr": "\nArticle premier\n", + "de": "\nArtikel 1\n", +} # creates a multi-hot vector for classification loss @@ -146,11 +150,17 @@ def load_multieurlex( ) dataset_dict = dataset_dict.map( - extract_articles, fn_kwargs={"languages": languages} + extract_articles, + fn_kwargs={"languages": languages}, ) if drop_empty: - dataset_dict = dataset_dict.filter(lambda x: x["text"] is not None) + if len(languages) == 1: + dataset_dict = dataset_dict.filter(lambda x: x["text"] is not None) + else: + dataset_dict = dataset_dict.filter( + lambda x: all(x is not None for x in x["text"].values()) + ) # return datasets and metadata return dataset_dict, metadata diff --git a/tests/test_multieurlex_utils.py b/tests/test_multieurlex_utils.py new file mode 100644 index 0000000..8a1bcf6 --- /dev/null +++ b/tests/test_multieurlex_utils.py @@ -0,0 +1,102 @@ +import os +from unittest.mock import patch + +import datasets +import pyarrow as pa +from datasets.formatting import PythonFormatter +from datasets.formatting.formatting import LazyRow + +from arc_spice.data import multieurlex_utils + +# def extract_articles( +# item: LazyRow, languages: list[str] +# ) -> dict[str, str] | dict[str, dict[str, str]]: + +TEST_ROOT = os.path.dirname(os.path.abspath(__file__)) + + +def _create_row(text) -> LazyRow: + pa_table = pa.Table.from_pydict({"text": [text]}) + formatter = PythonFormatter(lazy=True) + return formatter.format_row(pa_table) + + +def _create_multilang_row(texts_by_lang: dict[str, str]) -> LazyRow: + d = [{"text": texts_by_lang}] + pa_table = pa.Table.from_pylist(d) + formatter = PythonFormatter(lazy=True) + return formatter.format_row(pa_table) + + +def test_extract_articles_single_lang(): + langs = ["en"] + pre_text = "Some text before the marker" + post_text = "Some text after the marker" + row = _create_row( + text=f"{pre_text} {multieurlex_utils.ARTICLE_1_MARKERS['en']} {post_text}" + ) + out = multieurlex_utils.extract_articles(item=row, languages=langs) + assert out == {"text": f"{multieurlex_utils.ARTICLE_1_MARKERS['en']} {post_text}"} + + +def test_extract_articles_multi_lang(): + langs = ["en", "fr"] + pre_text = "Some text before the marker" + post_text = "Some text after the marker" + texts = { + lang: f"{pre_text} {multieurlex_utils.ARTICLE_1_MARKERS[lang]} {post_text}" + for lang in langs + } + row = _create_multilang_row(texts_by_lang=texts) + out = multieurlex_utils.extract_articles(item=row, languages=langs) + assert out == { + "text": { + "en": f"{multieurlex_utils.ARTICLE_1_MARKERS['en']} {post_text}", + "fr": f"{multieurlex_utils.ARTICLE_1_MARKERS['fr']} {post_text}", + } + } + + +def test_load_multieurlex_en(): + data_dir = f"{TEST_ROOT}/testdata/multieurlex_test_en" + level = 1 + languages = ["en"] + drop_empty = True + + ds = datasets.load_from_disk(data_dir) + with patch("arc_spice.data.multieurlex_utils.load_dataset", return_value=ds): + dataset_dict, metadata = multieurlex_utils.load_multieurlex( + data_dir=data_dir, level=level, languages=languages, drop_empty=drop_empty + ) + assert len(dataset_dict) == 3 + assert len(dataset_dict["train"]) == 4 # 5 items, 1 is empty so dropped + assert len(dataset_dict["validation"]) == 4 # 5 items, 1 is empty so dropped + assert len(dataset_dict["test"]) == 4 # 5 items, 1 is empty so dropped + assert dataset_dict["train"]["text"] == [ + f"{multieurlex_utils.ARTICLE_1_MARKERS["en"]} Some text after the marker {i}" # noqa: E501 + for i in [1, 2, 4, 5] # 3 dropped + ] + + +def test_load_multieurlex_for_translation(): + data_dir = f"{TEST_ROOT}/testdata/multieurlex_test" + level = 1 + languages = ["de", "en", "fr"] + drop_empty = True + + ds = datasets.load_from_disk(data_dir) + with patch("arc_spice.data.multieurlex_utils.load_dataset", return_value=ds): + dataset_dict, metadata = multieurlex_utils.load_multieurlex( + data_dir=data_dir, level=level, languages=languages, drop_empty=drop_empty + ) + assert len(dataset_dict) == 3 + assert len(dataset_dict["train"]) == 4 # 5 items, 1 is empty so dropped + assert len(dataset_dict["validation"]) == 4 # 5 items, 1 is empty so dropped + assert len(dataset_dict["test"]) == 4 # 5 items, 1 is empty so dropped + assert dataset_dict["train"]["text"] == [ # + { + lang: f"{multieurlex_utils.ARTICLE_1_MARKERS[lang]} Some text after the marker {i}" # noqa: E501 + for lang in languages + } + for i in [1, 2, 4, 5] # 3 dropped + ] diff --git a/tests/testdata/base_testdata/MultiEURLEX/data/eurovoc_concepts.json b/tests/testdata/base_testdata/MultiEURLEX/data/eurovoc_concepts.json new file mode 100644 index 0000000..8a6033f --- /dev/null +++ b/tests/testdata/base_testdata/MultiEURLEX/data/eurovoc_concepts.json @@ -0,0 +1,5 @@ +{ + "level_1": [ + "0" + ] +} diff --git a/tests/testdata/base_testdata/MultiEURLEX/data/eurovoc_descriptors.json b/tests/testdata/base_testdata/MultiEURLEX/data/eurovoc_descriptors.json new file mode 100644 index 0000000..3159b9b --- /dev/null +++ b/tests/testdata/base_testdata/MultiEURLEX/data/eurovoc_descriptors.json @@ -0,0 +1,5 @@ +{ + "0": { + "en": "something" + } +} diff --git a/tests/testdata/base_testdata/dataset_info.json b/tests/testdata/base_testdata/dataset_info.json new file mode 100644 index 0000000..ddf9a0d --- /dev/null +++ b/tests/testdata/base_testdata/dataset_info.json @@ -0,0 +1,72 @@ +{ + "builder_name": "multi_eurlex_test", + "config_name": "all_languages", + "dataset_name": "multi_eurlex", + "features": { + "celex_id": { + "dtype": "string", + "_type": "Value" + }, + "text": { + "languages": [ + "en", + "de", + "fr" + ], + "_type": "Translation" + }, + "labels": { + "feature": { + "names": [ + "100149", + "100160", + "100148", + "100147", + "100152", + "100143", + "100156", + "100158", + "100154", + "100153", + "100142", + "100145", + "100150", + "100162", + "100159", + "100144", + "100151", + "100157", + "100161", + "100146", + "100155" + ], + "_type": "ClassLabel" + }, + "_type": "Sequence" + } + }, + "splits": { + "train": { + "name": "train", + "num_examples": 5, + "dataset_name": "multi_eurlex_test" + }, + "test": { + "name": "test", + "num_examples": 5, + "dataset_name": "multi_eurlex_test" + }, + "validation": { + "name": "validation", + "num_examples": 5, + "dataset_name": "multi_eurlex_test" + } + }, + "version": { + "version_str": "1.0.0", + "description": "", + "major": 1, + "minor": 0, + "patch": 0 + } +} diff --git a/tests/testdata/base_testdata/dataset_info_en.json b/tests/testdata/base_testdata/dataset_info_en.json new file mode 100644 index 0000000..f55d6bd --- /dev/null +++ b/tests/testdata/base_testdata/dataset_info_en.json @@ -0,0 +1,68 @@ +{ + "builder_name": "multi_eurlex_test_en", + "config_name": "en", + "dataset_name": "multi_eurlex_test_en", + "features": { + "celex_id": { + "dtype": "string", + "_type": "Value" + }, + "text": { + "dtype": "string", + "_type": "Value" + }, + "labels": { + "feature": { + "names": [ + "100149", + "100160", + "100148", + "100147", + "100152", + "100143", + "100156", + "100158", + "100154", + "100153", + "100142", + "100145", + "100150", + "100162", + "100159", + "100144", + "100151", + "100157", + "100161", + "100146", + "100155" + ], + "_type": "ClassLabel" + }, + "_type": "Sequence" + } + }, + "splits": { + "train": { + "name": "train", + "num_examples": 5, + "dataset_name": "multi_eurlex_test_en" + }, + "test": { + "name": "test", + "num_examples": 5, + "dataset_name": "multi_eurlex_test_en" + }, + "validation": { + "name": "validation", + "num_examples": 5, + "dataset_name": "multi_eurlex_test_en" + } + }, + "version": { + "version_str": "1.0.0", + "description": "", + "major": 1, + "minor": 0, + "patch": 0 + } +} diff --git a/tests/testdata/multieurlex_test/MultiEURLEX/data/eurovoc_concepts.json b/tests/testdata/multieurlex_test/MultiEURLEX/data/eurovoc_concepts.json new file mode 100644 index 0000000..8a6033f --- /dev/null +++ b/tests/testdata/multieurlex_test/MultiEURLEX/data/eurovoc_concepts.json @@ -0,0 +1,5 @@ +{ + "level_1": [ + "0" + ] +} diff --git a/tests/testdata/multieurlex_test/MultiEURLEX/data/eurovoc_descriptors.json b/tests/testdata/multieurlex_test/MultiEURLEX/data/eurovoc_descriptors.json new file mode 100644 index 0000000..3159b9b --- /dev/null +++ b/tests/testdata/multieurlex_test/MultiEURLEX/data/eurovoc_descriptors.json @@ -0,0 +1,5 @@ +{ + "0": { + "en": "something" + } +} diff --git a/tests/testdata/multieurlex_test/dataset_dict.json b/tests/testdata/multieurlex_test/dataset_dict.json new file mode 100644 index 0000000..f15a9f8 --- /dev/null +++ b/tests/testdata/multieurlex_test/dataset_dict.json @@ -0,0 +1 @@ +{"splits": ["train", "test", "validation"]} diff --git a/tests/testdata/multieurlex_test/test/cache-0e959431151cbf5d.arrow b/tests/testdata/multieurlex_test/test/cache-0e959431151cbf5d.arrow new file mode 100644 index 0000000000000000000000000000000000000000..458b7280e5559ab51f4283151d5512686ceeb075 GIT binary patch literal 480 zcmZ`#Jx{|h5WTb}jY@%QSvcw4 z$Gfxd!#U?Mbwl#0+PM(y} zWj}QK-3PXwU4`-Q;!-4(jHZm` z!A?6=Mwq?rO496JmbR0Yy<*ur?0Oc&eQ6(&E0YJAg2zG9>IW@V>A5Wv;6sULl-e&x zlb@zR(u;$vy`QKOaqwEkc>}r|RWJ!Ux|r)YuD6a-(CQ9r-kQ-Gtof^~c{SGjy3tIA z?-`%LdBa@aM9pe-6ZK6m-mr>mGE6UZ^PwFnpmO)2Q=2#d}-CW&+Z^LH?y zn^C<^`Yr9lJ%K$(oS%XazX-QXcaN~sxO)~c@7F;%X!x{kmvJ)wi(a;jN6Ro{_uT8>OU+i Bi#!R|*mdta2W~^x;$Eb?iBi`sy}#bv zOq9*|&7OwjcH^ko*>dL`cU?@+YptGk-@z2?{Wygad*Eh^(w%<*H=?XyE~r)8(5-Yk zW^YM!r`Aq3`NH)*&kq*49ZFSCq+r2H^CE?1k%EdyVbw}DA`Gm};(|eJXuWo}s`Z9; zEx{m*ZzJqls@4_^RuH-`htK74tJduSqaJ_$~1tke(SX3?j>jq!Vp{lVv#?V%L^8LNLpPI$ia^xY)ocjR~;$gz&+ GW$Fh5NjkUy literal 0 HcmV?d00001 diff --git a/tests/testdata/multieurlex_test/train/cache-9f1f8bb94edbe6b0.arrow b/tests/testdata/multieurlex_test/train/cache-9f1f8bb94edbe6b0.arrow new file mode 100644 index 0000000000000000000000000000000000000000..6e1fae380950fc17950c44ecc0c0c91efe9a85e8 GIT binary patch literal 2256 zcma)8&2G~`5Z<bSviq@vOn z;M4;T&?Aq~BL^M;i3Gc|!CAnT(gidZ^tXJp>UxoCav(gDrlaiG^eGtO{KXFr8!Ndxt`W^ zgzM-vjdLckuJ-EPJneOLEzTs1ts`_To?de%6}T6h@g(v`qi5)V=!Nd^4u3o5ahIBW zf;I}aJFtURr_*+Lufx5LWuCArbhi8BexHDW-{p)(hZ%I%5lNi;pTAi~ zlTo`#`W?B3If1=GoSy~O}h#qh9xU0y2 zj~IB=(Ib9HjJ>dip&`L`3I8#VS;k(<4 zhYG!IS=^4gyh96OZ-e$jZ6Mqibr)cl#2+-qC{6aA8i!gzHFith%$mr`mP%}m*0=ED zf~sWa_Yik?gJSWn%ehlLuTAN*Dsq98Y~&`}l2Qp}<4i%jxzKS#X`e}1mq3MIlZx8M zlD5Ksox!6AAcg-e?elz@eZm1wk+K#3^Ne5Fr&BRy(x>|b*Z*v}z4p(u)i&3aG9__N zIN!U(>%{1Jvte1@Q>$S&p*6QjT2k9=c}=_Baih$nL4oWMuK zXT;UVNg5vT$gr!%&f$O?xEzKbq&zXweQpf=S9~G~@p{fpiFA@I_O%95+ literal 0 HcmV?d00001 diff --git a/tests/testdata/multieurlex_test/train/data-00000-of-00001.arrow b/tests/testdata/multieurlex_test/train/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..62bd2afff9ae9176b799eabcf36e703b75486891 GIT binary patch literal 2728 zcmb_e&2G~`5Z<(D8-;+%j}QlhEQ>=AJ=lqzCWr$F#HmQAl~5&wikocOs7_iOM=C13 z0S68oIQGf|AoT%w2o5{~5_HrL``h-iWpC=K=Rr7>_B%|)@*vL4X?tkr^OF64Fz!Tojk%#P*pgxP zI_lk0?|#rrU-E_RI*#i#`8t&9j+VTJkyfGi-$KbiLR>;W>gyP%Iw3E zSV;R2YT*GMuQI>OjDHtQd+wtRz7OVFoMVGw5F4=nnEA|cEsYaHYl}`K#9QFUQ*2~k zh$G(FY z;7S@%%ofEh0&{$j%Fwg($}_$UU%sb-AzuZbMrka7daNqLXA=&m>Dq5c+50Rw=tzy3 zHqy2IvBI_eFBW5rw65(-EHdb+^4rjf%%+9bwf+7CuJsSfp2GroJWrgvX#E<4P7v$F z`rZLAftmAK)p6>NovK@-R9nO7C~2ipuer^JzgG2|4x}bf*Qvot(G!gC!JolD!OZFU zUQb#)1lE@9>_w1wq}2Q64P zAIV-q1nX}d{W-*?(yU&DGI{Jzou5493Aw*UYD literal 0 HcmV?d00001 diff --git a/tests/testdata/multieurlex_test/train/dataset_info.json b/tests/testdata/multieurlex_test/train/dataset_info.json new file mode 100644 index 0000000..ddf9a0d --- /dev/null +++ b/tests/testdata/multieurlex_test/train/dataset_info.json @@ -0,0 +1,72 @@ +{ + "builder_name": "multi_eurlex_test", + "config_name": "all_languages", + "dataset_name": "multi_eurlex", + "features": { + "celex_id": { + "dtype": "string", + "_type": "Value" + }, + "text": { + "languages": [ + "en", + "de", + "fr" + ], + "_type": "Translation" + }, + "labels": { + "feature": { + "names": [ + "100149", + "100160", + "100148", + "100147", + "100152", + "100143", + "100156", + "100158", + "100154", + "100153", + "100142", + "100145", + "100150", + "100162", + "100159", + "100144", + "100151", + "100157", + "100161", + "100146", + "100155" + ], + "_type": "ClassLabel" + }, + "_type": "Sequence" + } + }, + "splits": { + "train": { + "name": "train", + "num_examples": 5, + "dataset_name": "multi_eurlex_test" + }, + "test": { + "name": "test", + "num_examples": 5, + "dataset_name": "multi_eurlex_test" + }, + "validation": { + "name": "validation", + "num_examples": 5, + "dataset_name": "multi_eurlex_test" + } + }, + "version": { + "version_str": "1.0.0", + "description": "", + "major": 1, + "minor": 0, + "patch": 0 + } +} diff --git a/tests/testdata/multieurlex_test/train/state.json b/tests/testdata/multieurlex_test/train/state.json new file mode 100644 index 0000000..43f32db --- /dev/null +++ b/tests/testdata/multieurlex_test/train/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "95b517d0a3072460", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "train" +} diff --git a/tests/testdata/multieurlex_test/validation/cache-6cb654ae0f80f1b4.arrow b/tests/testdata/multieurlex_test/validation/cache-6cb654ae0f80f1b4.arrow new file mode 100644 index 0000000000000000000000000000000000000000..e52bb1cc8833c956357c4dc270b163039b5c04d4 GIT binary patch literal 2264 zcma)7y>1gh5Z(|cv5W!pBNPdQPA7^K5js0}j)N#bv=oRS6bVqc*n5tZ^Cg_mQc%bX zP*G4&rsNS)Qt$vsG!zs(00j?#FyFVgyLKWG-&nIdKi|&I&CaezqtWR7tPnLps)&nX zUd)LyB+yLB+N)4Y+Fm_J8E!D`WsLNdA^(JtzHgWh++Gsxx_6xeHc7@HQX>I;EgWzN_9>(Ed@Q4*qwXhXlm#>F1X`{%8 zEOUetkm(){g4K4L?X70uhti8$UR2-m-9vF6XS+S@b^(mSwiIX`;oz)e%slr$e@7#9 zM)exxo7%=cp*|1JPeF)Zz%8NofIY|FbHH3r;@rz*pE7%^EEZI6L7lme;Z?rw0^{EX z6VE(4;Co>1MH`z2lh^?ML%wE?dl^4{Xk`(lLcE3l_za9pggAo##~GN86ygi~HSAEv z*Dk2rQPX#5M(yojKjuc_eKB`|dP)5#34t)9?*4(X_2u=W^UxFc&ukL8J%ZJ+Q=unZcpn+dah}p!7pmT?Bi+O z;J=>2vj!xC|E=}o+$=uv04K=k2LJh#-{>DF)+|7ttP@@T)Ajb&KTS8=Ts6YP%yZ)T z-U2UyS@V^KS8qJ=R$C1ttv0RHveKHBRV4S^F_Iolda0*4y<8LVvd;CR3&;e3c?-k!}_Uk1ud)YiY=6ad8g*cLVdSrs9PncvV3xNI|4-lSv!8N$X|4lfQ^#O!FfDhq?k09s+NaOeI&P+n3$!dGb$(*0>oS8XuW~am9aCkf`vM8jYT#mzn<_n0T(E1%3eLT(q&lFvte*AJdpQ&Sm`cp|zzQiM)gVcmhWCM2_J9 zJ^|BGB46QO#0&-Ob|JY*i~7FhlDQqs$F-4ox6Hkvo=^TnkYe}^MdLb2CAp<bK?2l z1uubF^IElBsXi~SG^$2gsavUGrBy4{5Txk|#t-1n;GbaDbbYU@99{xvOSShR$UDmE z2GOpHoQhL?6vdrg6*`roGeQRK7~XN<$oF@aYSt{(Nm*)>oL<%`yh&O7tXX{2j`W}P zY=-Ir>N!2jx$8N7iY)b9IFk8H8hsj`{{xWB`^0yQWa6)<>ePAp H&Jp?t^H($4 literal 0 HcmV?d00001 diff --git a/tests/testdata/multieurlex_test/validation/dataset_info.json b/tests/testdata/multieurlex_test/validation/dataset_info.json new file mode 100644 index 0000000..ddf9a0d --- /dev/null +++ b/tests/testdata/multieurlex_test/validation/dataset_info.json @@ -0,0 +1,72 @@ +{ + "builder_name": "multi_eurlex_test", + "config_name": "all_languages", + "dataset_name": "multi_eurlex", + "features": { + "celex_id": { + "dtype": "string", + "_type": "Value" + }, + "text": { + "languages": [ + "en", + "de", + "fr" + ], + "_type": "Translation" + }, + "labels": { + "feature": { + "names": [ + "100149", + "100160", + "100148", + "100147", + "100152", + "100143", + "100156", + "100158", + "100154", + "100153", + "100142", + "100145", + "100150", + "100162", + "100159", + "100144", + "100151", + "100157", + "100161", + "100146", + "100155" + ], + "_type": "ClassLabel" + }, + "_type": "Sequence" + } + }, + "splits": { + "train": { + "name": "train", + "num_examples": 5, + "dataset_name": "multi_eurlex_test" + }, + "test": { + "name": "test", + "num_examples": 5, + "dataset_name": "multi_eurlex_test" + }, + "validation": { + "name": "validation", + "num_examples": 5, + "dataset_name": "multi_eurlex_test" + } + }, + "version": { + "version_str": "1.0.0", + "description": "", + "major": 1, + "minor": 0, + "patch": 0 + } +} diff --git a/tests/testdata/multieurlex_test/validation/state.json b/tests/testdata/multieurlex_test/validation/state.json new file mode 100644 index 0000000..ee9c01e --- /dev/null +++ b/tests/testdata/multieurlex_test/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "71189a1028d8d0fe", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} diff --git a/tests/testdata/multieurlex_test_en/MultiEURLEX/data/eurovoc_concepts.json b/tests/testdata/multieurlex_test_en/MultiEURLEX/data/eurovoc_concepts.json new file mode 100644 index 0000000..8a6033f --- /dev/null +++ b/tests/testdata/multieurlex_test_en/MultiEURLEX/data/eurovoc_concepts.json @@ -0,0 +1,5 @@ +{ + "level_1": [ + "0" + ] +} diff --git a/tests/testdata/multieurlex_test_en/MultiEURLEX/data/eurovoc_descriptors.json b/tests/testdata/multieurlex_test_en/MultiEURLEX/data/eurovoc_descriptors.json new file mode 100644 index 0000000..3159b9b --- /dev/null +++ b/tests/testdata/multieurlex_test_en/MultiEURLEX/data/eurovoc_descriptors.json @@ -0,0 +1,5 @@ +{ + "0": { + "en": "something" + } +} diff --git a/tests/testdata/multieurlex_test_en/dataset_dict.json b/tests/testdata/multieurlex_test_en/dataset_dict.json new file mode 100644 index 0000000..f15a9f8 --- /dev/null +++ b/tests/testdata/multieurlex_test_en/dataset_dict.json @@ -0,0 +1 @@ +{"splits": ["train", "test", "validation"]} diff --git a/tests/testdata/multieurlex_test_en/test/cache-9689fb25e03c612c.arrow b/tests/testdata/multieurlex_test_en/test/cache-9689fb25e03c612c.arrow new file mode 100644 index 0000000000000000000000000000000000000000..f5f800aa87bdf23c0ec5910fa68cee6df11f0255 GIT binary patch literal 1592 zcmb7D&2G~`5Z<&&6GDLSBZLG(mQOu!upQe;IDzPq18OCNgiz%=n>4DE2FHBT*9#LPV9SwYNyE z8vEZ0Fu93gZ_ppU2H{`E_nqX8JDv`&-IvadJCsSW$mLuqsm$c^d^}Lfp!hPAgxq3xzdq`M+hA(zE}glEZx%=BhvXO%onCaQ#wuJ3t%*vDo-vBxOvS=40| z1&qQDqbRn>W<;U2*|>0<8(FX2Ewp+DW^9-V=tso*7K9j3+*-#TV9p$3>k@z}hC zmReC9R@MH0a6m;(()HZqt$ZU!jWy-dKlO%6dKuy@AeMnv#XBQ;n-50kxS7${Mz%R{ zN9XwC9Y5HMcum}VofqRYlaAkX&aNlY!7w< z!BsLR1-WxEl ce_$M(_t5BkkGG7@@m$ZY&hcCy-U(9w0rElXP5=M^ literal 0 HcmV?d00001 diff --git a/tests/testdata/multieurlex_test_en/test/cache-fdb68d221b295d16.arrow b/tests/testdata/multieurlex_test_en/test/cache-fdb68d221b295d16.arrow new file mode 100644 index 0000000000000000000000000000000000000000..97fa73e41f32ba4420fcda250023c57e99c969d8 GIT binary patch literal 480 zcmZ`#Jx{|h5WTb}jY@@~Et- z`=K-MA#kmB`{?Wb<6yw5!L@yLr7opIZ!xJ0Ac5 literal 0 HcmV?d00001 diff --git a/tests/testdata/multieurlex_test_en/test/data-00000-of-00001.arrow b/tests/testdata/multieurlex_test_en/test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..af6087a73056ba5d90da93fcf438d75b21d15dce GIT binary patch literal 1704 zcmb7Dy>1gh5MC#CY>Xl1M~DKU)m0P}clMoss6a$i6ey8s5Q;AL`bV5?IG-h>h&SL7 zC{lvN15i*<@DM3^10KPAv%9l*3X%z9jpv)+o!gn+Y&M%;J3=%CQC(aYd!iz0K%i3D z*&9&H)_z_h3^$nUHT;vW2K?3fK0%&()A?xOJ(8#1NX6MIRZFc5RHD|$)1g*|+55K& z$XjOVbUyYTO7ECWFXLpTybmyCYMuQ@coLte#2jW%*2(jDrdxRB1->7IeL9R#$7dAw z9O^QPT8zRrqp0JM%ZNf}b8%sw8#%8#t>e6r>m|&q1TMq%(s8yh?;!GC7jNxp5-*p} z=?3)Gx!XKa?^bF)z=eV^DsdG&S&hfI_E9`gVi)>nZ~~_JX4nl(xp5o#p0&};N#6sT zTNYx7xHA*GLwh8_lutx*nyDFK+LM~7>OFy4dW^qwlUJdU&umnEGi3wgAIykCH*M?WD@V$(zO)H_i_+_9V9&=tjQS)aegTQF3IZuXkq!gnzB zSTRohuir>PFGBhjB9<+qiWgS0V4dS8))CP~gU+lpTCQ{aw+nvOe{cdMeSfiUa{Wu^ z`V`AV-%Iy*AN&AJ-_&mTLHjUh_kBXqp&xWR7Dau74tzfhblid8?L|nk^bGPN_zUv1AxJj(6Jw$=lR53d!^|)#nY;X1m>P#jy}=L28OqaUyD>0R*a4t-S@c zs_lO&#BhUYZ(yab0sKq*en8&1zB|{ zJBs6~I{zClsHh3On0vmJ?_kzgQ@&zRZ>Xe~A@4#mGALB>T1#`xTdkY8nbtR2woTxk zZsO1O{6_zx<0*Lma^18U-J~0xKGK~+{Il}D?B8SX8JKtK1fCba^a8&_)H#RrH0t)^ zj-T|R^B_t*pafpu>mUjAun=nHUhE5b1>S(0&*%BLpGn7WIv3Xy>2MgFbePMzlV3?^ zlFmN@`Ul1Z2gZd5#ziPm^{~!b!KYyM%=FXD8T}-_tuh^H<+hf_e^={#UnKtjl1$vb Q-+U*NZ{ClogLBXPzm#q2N&o-= literal 0 HcmV?d00001 diff --git a/tests/testdata/multieurlex_test_en/train/cache-fed4751c7a74f655.arrow b/tests/testdata/multieurlex_test_en/train/cache-fed4751c7a74f655.arrow new file mode 100644 index 0000000000000000000000000000000000000000..98f3bda3393d594830c0aa325790f8a18e4a93f3 GIT binary patch literal 480 zcmZ`#Jx{|h5WTb}jY@%QS$fjD zk9TL^hjY%!h$thLE!|K|SCqhmMB<-M5sJY7cpqt4<`ZbnCGfvO?g)EUwlnYQL7tV- z)iCz@!w0r&Y#sb&_&jN_YP8*O$kkq|mzjF5-bSsaX##0%*XsV%ql&AIZt}9Yugj{; z&7mmNgnrk+>)5s^HC3(sd^>O%?J!$R8$WFpc8XTKvjhadvH`+u@M63X&_quF&g8yf z`9GGFZo{Qg|0}Hp-GP^!^s+_#cNf0pBf$OUUR<(bbwT8U=Q}UoEmD3*j@N;l>v&$K FegTSWJ1YPH literal 0 HcmV?d00001 diff --git a/tests/testdata/multieurlex_test_en/train/data-00000-of-00001.arrow b/tests/testdata/multieurlex_test_en/train/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..18341036bf1f8b055ef9f46abdba4c937408bf4d GIT binary patch literal 1656 zcmb7EyKWOf6dfmaY#A#G4^n9U2!tpoP$JPF6m9JBL#*wvUQ0m{e?Y+p zP^1Kj51^prBU17Qe1bV=cV>2FBxA01cg}szojZ5DTCG;Uc7!+(q_)@>d!iv)K%iRP z=sOVWN`F-&h8awK3t!q=z~7YZGu8`xG8@kAht`EXlySC9`iJCdFoI=R9~8PxBI9+pgodL64geMUF;6 z*Pug*C>n(G7({5bjB|zV?$##yG6ztnp%P~i)$iGEs@a$Wd))ArrNb9NxWFR zU@3QdS;T&0D#*?^CGC3vYoU}wUpA(3+NBCS7aSg`Z0axRA=}?KE@HxP)E$rIN zC+^VR^SY5|QkfLW;KZ3u4 ze}j4EAp<+iQTGahEJRvk;*-&wqEvv`}b z_?wKuwp#?7vV_}a3DG+e@3aZ#J?(>ehk88p+|y3tTd$HH`|C=n?KhRo_fF#fJxTkm R+x6V&c>SJr9ejt3{{ul@7jpmr literal 0 HcmV?d00001 diff --git a/tests/testdata/multieurlex_test_en/train/dataset_info.json b/tests/testdata/multieurlex_test_en/train/dataset_info.json new file mode 100644 index 0000000..f55d6bd --- /dev/null +++ b/tests/testdata/multieurlex_test_en/train/dataset_info.json @@ -0,0 +1,68 @@ +{ + "builder_name": "multi_eurlex_test_en", + "config_name": "en", + "dataset_name": "multi_eurlex_test_en", + "features": { + "celex_id": { + "dtype": "string", + "_type": "Value" + }, + "text": { + "dtype": "string", + "_type": "Value" + }, + "labels": { + "feature": { + "names": [ + "100149", + "100160", + "100148", + "100147", + "100152", + "100143", + "100156", + "100158", + "100154", + "100153", + "100142", + "100145", + "100150", + "100162", + "100159", + "100144", + "100151", + "100157", + "100161", + "100146", + "100155" + ], + "_type": "ClassLabel" + }, + "_type": "Sequence" + } + }, + "splits": { + "train": { + "name": "train", + "num_examples": 5, + "dataset_name": "multi_eurlex_test_en" + }, + "test": { + "name": "test", + "num_examples": 5, + "dataset_name": "multi_eurlex_test_en" + }, + "validation": { + "name": "validation", + "num_examples": 5, + "dataset_name": "multi_eurlex_test_en" + } + }, + "version": { + "version_str": "1.0.0", + "description": "", + "major": 1, + "minor": 0, + "patch": 0 + } +} diff --git a/tests/testdata/multieurlex_test_en/train/state.json b/tests/testdata/multieurlex_test_en/train/state.json new file mode 100644 index 0000000..3fea560 --- /dev/null +++ b/tests/testdata/multieurlex_test_en/train/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "8ae6be9ed4cd9e2d", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "train" +} diff --git a/tests/testdata/multieurlex_test_en/validation/cache-a9e7f9d0c9607947.arrow b/tests/testdata/multieurlex_test_en/validation/cache-a9e7f9d0c9607947.arrow new file mode 100644 index 0000000000000000000000000000000000000000..818bf9534067813a8fc6414b51413e7dfad90135 GIT binary patch literal 1552 zcmb7Dy^hmB5Z>G+cUFiX{0Jd|khRqn6t?4^mFo^L#lTlVg$P0%O9y5|_mI z|21ZZwR-mmKhieL1oR_f^%oc$k>5u79$?N~V#I?WC$*RYV~!klEa!;S)-zgutGEv0 z9uPOAx`N3;183qO=u|-3RQi}!@ zMD$Pzx#BGJ1-v5O5O<$f^&~Ha>v!GLi>Yui3~pA`qH^nV;ZC#aBSC*pzhFcap4 literal 0 HcmV?d00001 diff --git a/tests/testdata/multieurlex_test_en/validation/cache-cb2e97c59bb72892.arrow b/tests/testdata/multieurlex_test_en/validation/cache-cb2e97c59bb72892.arrow new file mode 100644 index 0000000000000000000000000000000000000000..faf914f33244af882a1abe0b1c64fdb55873fbb7 GIT binary patch literal 480 zcmZ`#Jx{|h5WTb}joJuhs1T#&xl=@2f*AM}tSCvH)Dna$O$Jo5^Hcbj=ACg+7EXHi z@$T&Vu+~}`5GBO2q-zT4iXwQBK>YDBLgD#u=OYcvd<4z82>z$fy}@3TX{vYiATO%Y zdDpl4>;lsqOzHhY_i@r-Rd1SZcc(T|y^Pdz{@QCb3EK;5A%WYN^ z`}96b)qs90;MmtSN~^roZoW0Rgw~iXq=lO{@jC@G-dO?yU|9fuHh3Z42xy`w0B5pa zu>2oOOgH{gvHO+gf^NY}PC8j4{=Ey|@)6*EV=pe5vHC~kgy%a?-%V0(M~>&79P4;q GrhWj6&pRvt literal 0 HcmV?d00001 diff --git a/tests/testdata/multieurlex_test_en/validation/data-00000-of-00001.arrow b/tests/testdata/multieurlex_test_en/validation/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..ec81ae52b1f24dcf18adfdd8436db0bad2ca667a GIT binary patch literal 1664 zcmb7EyKWOf6dfmaVi^mi zs?#?iR&)Jjg&1Zq^>ut{s{?<}ZC|jSdz0C4?md?0-cZHaGF1yL^i`tHP9_5_46+aJ z6p**b(#dS(J(AvuRlSOnrSd*Pm8rArKire}R3&D!d}kdzi>LYsuRXuj^20-JM$~Cp z6!skIS`-Bqg>8$XjzcaY3Z2gRg+*-StZuiCvqtVLVUfjm5$-G04Ln|sMkxC*?klki`71a9^L;b)241;&2l#%j_$osOQ<76q0J!tz25c&8M0x4EK9G+(aU)Wl3a$o3(2m55yery+QrJ&Z=4$<78-PsuSP0l>;L15-O8VB09M|=d~TXH z$+zirg?L}q?*aHBn0IOiEkAhK>h=PQx*dml4jnqwMo>=Av3>%71OEZ@&d2kql6-i1 zs)q9vc&y|!PT#6j`m*sf%_jXsNxvZr&v=-@T6QdK|7Hoc%@VB3(q3l_x7{LKmnGUZ zON82y_|BY5zS9jb-;gs+vTc3Sh}w_P&Ku)MoBa8oL9%nic_Z2QTSnZNhx16+zcmyW AmH+?% literal 0 HcmV?d00001 diff --git a/tests/testdata/multieurlex_test_en/validation/dataset_info.json b/tests/testdata/multieurlex_test_en/validation/dataset_info.json new file mode 100644 index 0000000..f55d6bd --- /dev/null +++ b/tests/testdata/multieurlex_test_en/validation/dataset_info.json @@ -0,0 +1,68 @@ +{ + "builder_name": "multi_eurlex_test_en", + "config_name": "en", + "dataset_name": "multi_eurlex_test_en", + "features": { + "celex_id": { + "dtype": "string", + "_type": "Value" + }, + "text": { + "dtype": "string", + "_type": "Value" + }, + "labels": { + "feature": { + "names": [ + "100149", + "100160", + "100148", + "100147", + "100152", + "100143", + "100156", + "100158", + "100154", + "100153", + "100142", + "100145", + "100150", + "100162", + "100159", + "100144", + "100151", + "100157", + "100161", + "100146", + "100155" + ], + "_type": "ClassLabel" + }, + "_type": "Sequence" + } + }, + "splits": { + "train": { + "name": "train", + "num_examples": 5, + "dataset_name": "multi_eurlex_test_en" + }, + "test": { + "name": "test", + "num_examples": 5, + "dataset_name": "multi_eurlex_test_en" + }, + "validation": { + "name": "validation", + "num_examples": 5, + "dataset_name": "multi_eurlex_test_en" + } + }, + "version": { + "version_str": "1.0.0", + "description": "", + "major": 1, + "minor": 0, + "patch": 0 + } +} diff --git a/tests/testdata/multieurlex_test_en/validation/state.json b/tests/testdata/multieurlex_test_en/validation/state.json new file mode 100644 index 0000000..654a15e --- /dev/null +++ b/tests/testdata/multieurlex_test_en/validation/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "06c8600442860ac7", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": "validation" +} From 03dd56a2c9818dd5c1eebbf6e2a7fbdb8f1947a6 Mon Sep 17 00:00:00 2001 From: James Bishop Date: Thu, 21 Nov 2024 19:55:43 +0000 Subject: [PATCH 06/10] rm cache files --- .../test/cache-0e959431151cbf5d.arrow | Bin 480 -> 0 bytes .../test/cache-ad7a18f1fda40fdd.arrow | Bin 2304 -> 0 bytes .../train/cache-111f8003398a2c8c.arrow | Bin 480 -> 0 bytes .../train/cache-9f1f8bb94edbe6b0.arrow | Bin 2256 -> 0 bytes .../validation/cache-6cb654ae0f80f1b4.arrow | Bin 2264 -> 0 bytes .../validation/cache-eeed225c5ca2d2ec.arrow | Bin 480 -> 0 bytes .../test/cache-9689fb25e03c612c.arrow | Bin 1592 -> 0 bytes .../test/cache-fdb68d221b295d16.arrow | Bin 480 -> 0 bytes .../train/cache-24995d769a69f928.arrow | Bin 1544 -> 0 bytes .../train/cache-fed4751c7a74f655.arrow | Bin 480 -> 0 bytes .../validation/cache-a9e7f9d0c9607947.arrow | Bin 1552 -> 0 bytes .../validation/cache-cb2e97c59bb72892.arrow | Bin 480 -> 0 bytes 12 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/testdata/multieurlex_test/test/cache-0e959431151cbf5d.arrow delete mode 100644 tests/testdata/multieurlex_test/test/cache-ad7a18f1fda40fdd.arrow delete mode 100644 tests/testdata/multieurlex_test/train/cache-111f8003398a2c8c.arrow delete mode 100644 tests/testdata/multieurlex_test/train/cache-9f1f8bb94edbe6b0.arrow delete mode 100644 tests/testdata/multieurlex_test/validation/cache-6cb654ae0f80f1b4.arrow delete mode 100644 tests/testdata/multieurlex_test/validation/cache-eeed225c5ca2d2ec.arrow delete mode 100644 tests/testdata/multieurlex_test_en/test/cache-9689fb25e03c612c.arrow delete mode 100644 tests/testdata/multieurlex_test_en/test/cache-fdb68d221b295d16.arrow delete mode 100644 tests/testdata/multieurlex_test_en/train/cache-24995d769a69f928.arrow delete mode 100644 tests/testdata/multieurlex_test_en/train/cache-fed4751c7a74f655.arrow delete mode 100644 tests/testdata/multieurlex_test_en/validation/cache-a9e7f9d0c9607947.arrow delete mode 100644 tests/testdata/multieurlex_test_en/validation/cache-cb2e97c59bb72892.arrow diff --git a/tests/testdata/multieurlex_test/test/cache-0e959431151cbf5d.arrow b/tests/testdata/multieurlex_test/test/cache-0e959431151cbf5d.arrow deleted file mode 100644 index 458b7280e5559ab51f4283151d5512686ceeb075..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 480 zcmZ`#Jx{|h5WTb}jY@%QSvcw4 z$Gfxd!#U?Mbwl#0+PM(y} zWj}QK-3PXwU4`-Q;!-4(jHZm` z!A?6=Mwq?rO496JmbR0Yy<*ur?0Oc&eQ6(&E0YJAg2zG9>IW@V>A5Wv;6sULl-e&x zlb@zR(u;$vy`QKOaqwEkc>}r|RWJ!Ux|r)YuD6a-(CQ9r-kQ-Gtof^~c{SGjy3tIA z?-`%LdBa@aM9pe-6ZK6m-mr>mGE6UZ^PwFnpmO)2Q=2#d}-CW&+Z^LH?y zn^C<^`Yr9lJ%K$(oS%XazX-QXcaN~sxO)~c@7F;%X!x{kmvJ)wi(a;jN6Ro{_uT8>OU+i Biq!Vp{lVv#?V%L^8LNLpPI$ia^xY)ocjR~;$gz&+ GW$Fh5NjkUy diff --git a/tests/testdata/multieurlex_test/train/cache-9f1f8bb94edbe6b0.arrow b/tests/testdata/multieurlex_test/train/cache-9f1f8bb94edbe6b0.arrow deleted file mode 100644 index 6e1fae380950fc17950c44ecc0c0c91efe9a85e8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2256 zcma)8&2G~`5Z<bSviq@vOn z;M4;T&?Aq~BL^M;i3Gc|!CAnT(gidZ^tXJp>UxoCav(gDrlaiG^eGtO{KXFr8!Ndxt`W^ zgzM-vjdLckuJ-EPJneOLEzTs1ts`_To?de%6}T6h@g(v`qi5)V=!Nd^4u3o5ahIBW zf;I}aJFtURr_*+Lufx5LWuCArbhi8BexHDW-{p)(hZ%I%5lNi;pTAi~ zlTo`#`W?B3If1=GoSy~O}h#qh9xU0y2 zj~IB=(Ib9HjJ>dip&`L`3I8#VS;k(<4 zhYG!IS=^4gyh96OZ-e$jZ6Mqibr)cl#2+-qC{6aA8i!gzHFith%$mr`mP%}m*0=ED zf~sWa_Yik?gJSWn%ehlLuTAN*Dsq98Y~&`}l2Qp}<4i%jxzKS#X`e}1mq3MIlZx8M zlD5Ksox!6AAcg-e?elz@eZm1wk+K#3^Ne5Fr&BRy(x>|b*Z*v}z4p(u)i&3aG9__N zIN!U(>%{1Jvte1@Q>$S&p*6QjT2k9=c}=_Baih$nL4oWMuK zXT;UVNg5vT$gr!%&f$O?xEzKbq&zXweQpf=S9~G~@p{fpiFA@I_O%95+ diff --git a/tests/testdata/multieurlex_test/validation/cache-6cb654ae0f80f1b4.arrow b/tests/testdata/multieurlex_test/validation/cache-6cb654ae0f80f1b4.arrow deleted file mode 100644 index e52bb1cc8833c956357c4dc270b163039b5c04d4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2264 zcma)7y>1gh5Z(|cv5W!pBNPdQPA7^K5js0}j)N#bv=oRS6bVqc*n5tZ^Cg_mQc%bX zP*G4&rsNS)Qt$vsG!zs(00j?#FyFVgyLKWG-&nIdKi|&I&CaezqtWR7tPnLps)&nX zUd)LyB+yLB+N)4Y+Fm_J8E!D`WsLNdA^(JtzHgWh++Gsxx_6xeHc7@HQX>I;EgWzN_9>(Ed@Q4*qwXhXlm#>F1X`{%8 zEOUetkm(){g4K4L?X70uhti8$UR2-m-9vF6XS+S@b^(mSwiIX`;oz)e%slr$e@7#9 zM)exxo7%=cp*|1JPeF)Zz%8NofIY|FbHH3r;@rz*pE7%^EEZI6L7lme;Z?rw0^{EX z6VE(4;Co>1MH`z2lh^?ML%wE?dl^4{Xk`(lLcE3l_za9pggAo##~GN86ygi~HSAEv z*Dk2rQPX#5M(yojKjuc_eKB`|dP)5#34t)9?*4(X_2u=W^UxFc&ukL8J%ZJ+Q=unZcpn+dah}p!7pmT?Bi+O z;J=>2vj!xC|E=}o+$=uv04K=k2LJh#-{>DF)+|7ttP@@T)Ajb&KTS8=Ts6YP%yZ)T z-U2UyS@V^KS8qJ=R$C1ttv0RHveKHBRV4S^F_Iolda0*4y<8LVvd;CR3&;e3c?-k!}_Uk1ud)YiY=6ad8g*cLV4DE2FHBT*9#LPV9SwYNyE z8vEZ0Fu93gZ_ppU2H{`E_nqX8JDv`&-IvadJCsSW$mLuqsm$c^d^}Lfp!hPAgxq3xzdq`M+hA(zE}glEZx%=BhvXO%onCaQ#wuJ3t%*vDo-vBxOvS=40| z1&qQDqbRn>W<;U2*|>0<8(FX2Ewp+DW^9-V=tso*7K9j3+*-#TV9p$3>k@z}hC zmReC9R@MH0a6m;(()HZqt$ZU!jWy-dKlO%6dKuy@AeMnv#XBQ;n-50kxS7${Mz%R{ zN9XwC9Y5HMcum}VofqRYlaAkX&aNlY!7w< z!BsLR1-WxEl ce_$M(_t5BkkGG7@@m$ZY&hcCy-U(9w0rElXP5=M^ diff --git a/tests/testdata/multieurlex_test_en/test/cache-fdb68d221b295d16.arrow b/tests/testdata/multieurlex_test_en/test/cache-fdb68d221b295d16.arrow deleted file mode 100644 index 97fa73e41f32ba4420fcda250023c57e99c969d8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 480 zcmZ`#Jx{|h5WTb}jY@@~Et- z`=K-MA#kmB`{?Wb<6yw5!L@yLr7opIZ!xJ0Ac5 diff --git a/tests/testdata/multieurlex_test_en/train/cache-24995d769a69f928.arrow b/tests/testdata/multieurlex_test_en/train/cache-24995d769a69f928.arrow deleted file mode 100644 index 8e9a204d96974d9e01da64f70d8a6221efc10a7e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1544 zcmb7E&5ja55N>v1AxJj(6Jw$=lR53d!^|)#nY;X1m>P#jy}=L28OqaUyD>0R*a4t-S@c zs_lO&#BhUYZ(yab0sKq*en8&1zB|{ zJBs6~I{zClsHh3On0vmJ?_kzgQ@&zRZ>Xe~A@4#mGALB>T1#`xTdkY8nbtR2woTxk zZsO1O{6_zx<0*Lma^18U-J~0xKGK~+{Il}D?B8SX8JKtK1fCba^a8&_)H#RrH0t)^ zj-T|R^B_t*pafpu>mUjAun=nHUhE5b1>S(0&*%BLpGn7WIv3Xy>2MgFbePMzlV3?^ zlFmN@`Ul1Z2gZd5#ziPm^{~!b!KYyM%=FXD8T}-_tuh^H<+hf_e^={#UnKtjl1$vb Q-+U*NZ{ClogLBXPzm#q2N&o-= diff --git a/tests/testdata/multieurlex_test_en/train/cache-fed4751c7a74f655.arrow b/tests/testdata/multieurlex_test_en/train/cache-fed4751c7a74f655.arrow deleted file mode 100644 index 98f3bda3393d594830c0aa325790f8a18e4a93f3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 480 zcmZ`#Jx{|h5WTb}jY@%QS$fjD zk9TL^hjY%!h$thLE!|K|SCqhmMB<-M5sJY7cpqt4<`ZbnCGfvO?g)EUwlnYQL7tV- z)iCz@!w0r&Y#sb&_&jN_YP8*O$kkq|mzjF5-bSsaX##0%*XsV%ql&AIZt}9Yugj{; z&7mmNgnrk+>)5s^HC3(sd^>O%?J!$R8$WFpc8XTKvjhadvH`+u@M63X&_quF&g8yf z`9GGFZo{Qg|0}Hp-GP^!^s+_#cNf0pBf$OUUR<(bbwT8U=Q}UoEmD3*j@N;l>v&$K FegTSWJ1YPH diff --git a/tests/testdata/multieurlex_test_en/validation/cache-a9e7f9d0c9607947.arrow b/tests/testdata/multieurlex_test_en/validation/cache-a9e7f9d0c9607947.arrow deleted file mode 100644 index 818bf9534067813a8fc6414b51413e7dfad90135..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1552 zcmb7Dy^hmB5Z>G+cUFiX{0Jd|khRqn6t?4^mFo^L#lTlVg$P0%O9y5|_mI z|21ZZwR-mmKhieL1oR_f^%oc$k>5u79$?N~V#I?WC$*RYV~!klEa!;S)-zgutGEv0 z9uPOAx`N3;183qO=u|-3RQi}!@ zMD$Pzx#BGJ1-v5O5O<$f^&~Ha>v!GLi>Yui3~pA`qH^nV;ZC#aBSC*pzhFcap4 diff --git a/tests/testdata/multieurlex_test_en/validation/cache-cb2e97c59bb72892.arrow b/tests/testdata/multieurlex_test_en/validation/cache-cb2e97c59bb72892.arrow deleted file mode 100644 index faf914f33244af882a1abe0b1c64fdb55873fbb7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 480 zcmZ`#Jx{|h5WTb}joJuhs1T#&xl=@2f*AM}tSCvH)Dna$O$Jo5^Hcbj=ACg+7EXHi z@$T&Vu+~}`5GBO2q-zT4iXwQBK>YDBLgD#u=OYcvd<4z82>z$fy}@3TX{vYiATO%Y zdDpl4>;lsqOzHhY_i@r-Rd1SZcc(T|y^Pdz{@QCb3EK;5A%WYN^ z`}96b)qs90;MmtSN~^roZoW0Rgw~iXq=lO{@jC@G-dO?yU|9fuHh3Z42xy`w0B5pa zu>2oOOgH{gvHO+gf^NY}PC8j4{=Ey|@)6*EV=pe5vHC~kgy%a?-%V0(M~>&79P4;q GrhWj6&pRvt From 42b7a1d36586ca5ca0f1b0c59e14a5cf1864eb88 Mon Sep 17 00:00:00 2001 From: James Bishop Date: Thu, 21 Nov 2024 19:56:38 +0000 Subject: [PATCH 07/10] ignore test caches --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 1285fd4..4cefc4c 100644 --- a/.gitignore +++ b/.gitignore @@ -164,3 +164,6 @@ slurm_scripts/slurm_logs* temp .vscode local_notebooks + +# test caches +tests/testdata/*/*/cache* From ae88310d37531ee197dff1a15c39e460ff6f630c Mon Sep 17 00:00:00 2001 From: James Bishop Date: Thu, 21 Nov 2024 20:00:20 +0000 Subject: [PATCH 08/10] fix quote mismatch --- tests/test_multieurlex_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_multieurlex_utils.py b/tests/test_multieurlex_utils.py index 8a1bcf6..ca82e61 100644 --- a/tests/test_multieurlex_utils.py +++ b/tests/test_multieurlex_utils.py @@ -73,7 +73,7 @@ def test_load_multieurlex_en(): assert len(dataset_dict["validation"]) == 4 # 5 items, 1 is empty so dropped assert len(dataset_dict["test"]) == 4 # 5 items, 1 is empty so dropped assert dataset_dict["train"]["text"] == [ - f"{multieurlex_utils.ARTICLE_1_MARKERS["en"]} Some text after the marker {i}" # noqa: E501 + f"{multieurlex_utils.ARTICLE_1_MARKERS['en']} Some text after the marker {i}" # noqa: E501 for i in [1, 2, 4, 5] # 3 dropped ] From 5d35c90942349075af9528b630f9e5e8eb4c2e86 Mon Sep 17 00:00:00 2001 From: James Bishop Date: Fri, 22 Nov 2024 15:20:03 +0000 Subject: [PATCH 09/10] rm hardcoded sample --- scripts/variational_RTC_example.py | 31 +++++++++--------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/scripts/variational_RTC_example.py b/scripts/variational_RTC_example.py index 8475d94..fdeea22 100644 --- a/scripts/variational_RTC_example.py +++ b/scripts/variational_RTC_example.py @@ -17,6 +17,13 @@ ) +def get_random_test_row(train_data): + row_iterator = iter(train_data) + for _ in range(randint(1, 25)): + test_row = next(row_iterator) + return test_row + + def load_test_row(): lang_pair = {"source": "fr", "target": "en"} dataset_dict, metadata_params = load_multieurlex_for_translation( @@ -24,31 +31,11 @@ def load_test_row(): ) train = dataset_dict["train"] multi_onehot = MultiHot(metadata_params["n_classes"]) - test_row = get_test_row(train) - class_labels = multi_onehot(test_row["class_labels"]) + test_row = get_random_test_row(train) + class_labels = multi_onehot(test_row["labels"]) return test_row, class_labels, metadata_params -def get_test_row(train_data): - # debug row if needed - return { - "source_text": ( - "Le renard brun rapide a sauté par-dessus le chien paresseux." - "Le renard a sauté par-dessus le chien paresseux." - ), - "target_text": ( - "The quick brown fox jumped over the lazy dog. The fox jumped" - " over the lazy dog" - ), - "class_labels": [0, 1], - } - ## Normal row - row_iterator = iter(train_data) - for _ in range(randint(1, 25)): - test_row = next(row_iterator) - return test_row - - def print_results(clean_output, var_output, class_labels, test_row, comet_model): # ### TRANSLATION ### print("\nTranslation:") From ac6a29216f0c75b0eb6e93effa7058d14178d183 Mon Sep 17 00:00:00 2001 From: James Bishop Date: Fri, 22 Nov 2024 15:30:58 +0000 Subject: [PATCH 10/10] rm commented code --- tests/test_multieurlex_utils.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/test_multieurlex_utils.py b/tests/test_multieurlex_utils.py index ca82e61..4701f51 100644 --- a/tests/test_multieurlex_utils.py +++ b/tests/test_multieurlex_utils.py @@ -8,10 +8,6 @@ from arc_spice.data import multieurlex_utils -# def extract_articles( -# item: LazyRow, languages: list[str] -# ) -> dict[str, str] | dict[str, dict[str, str]]: - TEST_ROOT = os.path.dirname(os.path.abspath(__file__))