Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

allow non-translation tasks #25

Merged
merged 10 commits into from
Nov 22, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -164,3 +164,6 @@ slurm_scripts/slurm_logs*
temp
.vscode
local_notebooks

# test caches
tests/testdata/*/*/cache*
173 changes: 0 additions & 173 deletions requirements.txt

This file was deleted.

128 changes: 128 additions & 0 deletions scripts/create_test_ds.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
import os
import shutil

import datasets
from datasets import load_dataset

from arc_spice.data import multieurlex_utils

# Repository root, derived from this script's location (scripts/ -> root).
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# All generated fixtures live under tests/testdata.
TESTDATA_DIR = os.path.join(PROJECT_ROOT, "tests/testdata")
# dataset_info.json templates copied into each saved split (multilingual / en-only).
BASE_DATASET_INFO_MULTILANG = os.path.join(
    TESTDATA_DIR, "base_testdata/dataset_info.json"
)
BASE_DATASET_INFO_EN = os.path.join(TESTDATA_DIR, "base_testdata/dataset_info_en.json")
# MultiEURLEX metadata directory copied alongside each fixture dataset.
BASE_DATASET_METADATA_DIR = os.path.join(TESTDATA_DIR, "base_testdata/MultiEURLEX")

# Five-row replacement texts mirroring the MultiEURLEX structure. Each row
# embeds the language's ARTICLE_1_MARKERS string so splitting-on-marker code
# paths are exercised; row 3 deliberately has no marker (and hence no text
# after it) to exercise the no-marker path.
CONTENT_MULTILANG: list[dict[str, str]] = [
    {
        "en": f"Some text before the marker 1 {multieurlex_utils.ARTICLE_1_MARKERS['en']} Some text after the marker 1",  # noqa: E501
        "fr": f"Some text before the marker 1 {multieurlex_utils.ARTICLE_1_MARKERS['fr']} Some text after the marker 1",  # noqa: E501
        "de": f"Some text before the marker 1 {multieurlex_utils.ARTICLE_1_MARKERS['de']} Some text after the marker 1",  # noqa: E501
    },
    {
        "en": f"Some text before the marker 2 {multieurlex_utils.ARTICLE_1_MARKERS['en']} Some text after the marker 2",  # noqa: E501
        "fr": f"Some text before the marker 2 {multieurlex_utils.ARTICLE_1_MARKERS['fr']} Some text after the marker 2",  # noqa: E501
        "de": f"Some text before the marker 2 {multieurlex_utils.ARTICLE_1_MARKERS['de']} Some text after the marker 2",  # noqa: E501
    },
    {
        "en": "Some text before the marker 3",  # no marker, no text after marker
        "fr": "Some text before the marker 3",
        "de": "Some text before the marker 3",
    },
    {
        "en": f"Some text before the marker 4 {multieurlex_utils.ARTICLE_1_MARKERS['en']} Some text after the marker 4",  # noqa: E501
        "fr": f"Some text before the marker 4 {multieurlex_utils.ARTICLE_1_MARKERS['fr']} Some text after the marker 4",  # noqa: E501
        "de": f"Some text before the marker 4 {multieurlex_utils.ARTICLE_1_MARKERS['de']} Some text after the marker 4",  # noqa: E501
    },
    {
        "en": f"Some text before the marker 5 {multieurlex_utils.ARTICLE_1_MARKERS['en']} Some text after the marker 5",  # noqa: E501
        "fr": f"Some text before the marker 5 {multieurlex_utils.ARTICLE_1_MARKERS['fr']} Some text after the marker 5",  # noqa: E501
        "de": f"Some text before the marker 5 {multieurlex_utils.ARTICLE_1_MARKERS['de']} Some text after the marker 5",  # noqa: E501
    },
]
# English-only variant of the same five rows (plain strings, not per-language
# dicts); row 3 again lacks the marker.
CONTENT_EN: list[str] = [
    f"Some text before the marker 1 {multieurlex_utils.ARTICLE_1_MARKERS['en']} Some text after the marker 1",  # noqa: E501
    f"Some text before the marker 2 {multieurlex_utils.ARTICLE_1_MARKERS['en']} Some text after the marker 2",  # noqa: E501
    "Some text before the marker 3",  # no marker, no text after marker
    f"Some text before the marker 4 {multieurlex_utils.ARTICLE_1_MARKERS['en']} Some text after the marker 4",  # noqa: E501
    f"Some text before the marker 5 {multieurlex_utils.ARTICLE_1_MARKERS['en']} Some text after the marker 5",  # noqa: E501
]


def overwrite_text(
_orig,
i: int,
content: list[dict[str, str]] | list[str],
) -> dict[str, str | dict[str, str]]:
return {"text": content[i]}


def create_test_ds(
    testdata_dir: str,
    ds_name: str,
    content: list[dict[str, str]] | list[str],
    dataset_info_fpath: str,
) -> None:
    """Build a small MultiEURLEX fixture dataset on disk for tests.

    Loads the real ``multi_eurlex`` dataset, truncates every split to its
    first 5 rows, overwrites each row's ``text`` with the deterministic
    ``content`` entries, saves the result under ``testdata_dir/ds_name`` and
    copies in the metadata files needed for ``datasets.load_from_disk``.

    Args:
        testdata_dir: Directory in which the fixture dataset is created.
        ds_name: Name of the fixture (subdirectory of ``testdata_dir``).
        content: Per-row replacement text — multilingual dicts or plain
            strings, matching the ``dataset_info_fpath`` schema supplied.
        dataset_info_fpath: ``dataset_info.json`` to copy into each split.
    """
    dataset = load_dataset(
        "multi_eurlex",
        "all_languages",
        label_level="level_1",
        trust_remote_code=True,
    )

    # Keep only the first 5 rows of each split so the fixture stays tiny.
    splits = ("train", "validation", "test")
    for split in splits:
        dataset[split] = dataset[split].take(5)

    dataset = dataset.map(
        overwrite_text,
        with_indices=True,
        fn_kwargs={"content": content},
    )

    out_dir = os.path.join(testdata_dir, ds_name)
    dataset.save_to_disk(out_dir)

    # Overwrite each saved split's dataset_info.json with the supplied
    # template so the fixture advertises the intended feature schema.
    for split in splits:
        shutil.copy(
            dataset_info_fpath,
            os.path.join(out_dir, split, "dataset_info.json"),
        )

    # Metadata copy. dirs_exist_ok lets the script be re-run in place
    # instead of failing with FileExistsError on an existing fixture.
    shutil.copytree(
        BASE_DATASET_METADATA_DIR,
        os.path.join(out_dir, "MultiEURLEX"),
        dirs_exist_ok=True,
    )

    # Smoke-check: the freshly written fixture must be loadable.
    assert datasets.load_from_disk(out_dir) is not None


if __name__ == "__main__":
    os.makedirs(TESTDATA_DIR, exist_ok=True)

    # NOTE: the previous unused local `content` list (dead code — both calls
    # below pass the module-level constants) has been removed.

    # Multilingual fixture: each row carries en/fr/de variants.
    create_test_ds(
        testdata_dir=TESTDATA_DIR,
        ds_name="multieurlex_test",
        content=CONTENT_MULTILANG,
        dataset_info_fpath=BASE_DATASET_INFO_MULTILANG,
    )

    # English-only fixture: rows are plain strings.
    create_test_ds(
        testdata_dir=TESTDATA_DIR,
        ds_name="multieurlex_test_en",
        content=CONTENT_EN,
        dataset_info_fpath=BASE_DATASET_INFO_EN,
    )
28 changes: 8 additions & 20 deletions scripts/variational_RTC_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,48 +3,33 @@
"""

import logging
import os
import random
from random import randint

import numpy as np
import torch
from torch.nn.functional import binary_cross_entropy

from arc_spice.data.multieurlex_utils import MultiHot, load_multieurlex
from arc_spice.data.multieurlex_utils import MultiHot, load_multieurlex_for_translation
from arc_spice.eval.classification_error import hamming_accuracy
from arc_spice.eval.translation_error import get_comet_model
from arc_spice.utils import seed_everything
from arc_spice.variational_pipelines.RTC_variational_pipeline import (
RTCVariationalPipeline,
)


def seed_everything(seed):
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(seed)
random.seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)


def load_test_row():
lang_pair = {"source": "fr", "target": "en"}
(train, _, _), metadata_params = load_multieurlex(
dataset_dict, metadata_params = load_multieurlex_for_translation(
data_dir="data", level=1, lang_pair=lang_pair
)
train = dataset_dict["train"]
multi_onehot = MultiHot(metadata_params["n_classes"])
test_row = get_test_row(train)
class_labels = multi_onehot(test_row["class_labels"])
return test_row, class_labels, metadata_params


def get_test_row(train_data):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would it be appropriate to split these functionalities into two functions, or pass a debug_flag argument?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've simply removed the manually entered data here. I assume this script will be superseded in time by something that goes over more than 1 sample

row_iterator = iter(train_data)
for _ in range(randint(1, 25)):
test_row = next(row_iterator)

# debug row if needed
return {
"source_text": (
Expand All @@ -57,7 +42,10 @@ def get_test_row(train_data):
),
"class_labels": [0, 1],
}
# Normal row
## Normal row
row_iterator = iter(train_data)
for _ in range(randint(1, 25)):
test_row = next(row_iterator)
return test_row


Expand Down
Loading
Loading