From 00c309eb08a507e7f02fae72e151c62d90a59590 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Tue, 10 Dec 2024 20:13:21 +0000
Subject: [PATCH 01/36] add notebook to read COCO annotations as xarray

---
 .../annotations/notebook_annots_as_xarray.py  | 231 ++++++++++++++++++
 ethology/annotations/utils.py                 |  72 ++++++
 pyproject.toml                                |   3 +
 3 files changed, 306 insertions(+)
 create mode 100644 ethology/annotations/notebook_annots_as_xarray.py
 create mode 100644 ethology/annotations/utils.py

diff --git a/ethology/annotations/notebook_annots_as_xarray.py b/ethology/annotations/notebook_annots_as_xarray.py
new file mode 100644
index 0000000..bdcb59f
--- /dev/null
+++ b/ethology/annotations/notebook_annots_as_xarray.py
@@ -0,0 +1,231 @@
+# %%
+
+import numpy as np
+import xarray as xr
+from utils import read_json_file_as_dict
+
+# %%%%%%%%%%%%%%%%%%%
+# input data
+via_file_path = (
+    "/home/sminano/swc/project_ethology/sample_VIA_annotations/VIA_JSON_1.json"
+)
+coco_file_path = (
+    "/home/sminano/swc/project_ethology/sample_COCO_annotations/sample_annotations_1.json"
+)
+
+# via_data = read_via_json_file_as_dict(via_file_path)
+# print(via_data.keys())  # _via_img_metadata, _via_image_id_list
+
+# %%%%%%%%%%%%%%%%%%%%
+# read as dict
+coco_data = read_json_file_as_dict(coco_file_path)
+
+print(
+    coco_data.keys()
+)  # dict_keys(['annotations', 'categories', 'images', 'info', 'licenses'])
+# %%%%%%%%%%%%%%%%%%%%
+
+
+def compute_homog_data_array_per_image_id(data_str, axis_image_id_in_output):
+    # pair up data with image id
+    pair_data = []
+    for annot in coco_data["annotations"]:
+        if isinstance(annot[data_str], list):
+            pair_data.append(annot[data_str] + [annot["image_id"]])
+        else:
+            pair_data.append([annot[data_str], annot["image_id"]])
+
+    data_and_image_id_array = np.array(pair_data)
+
+    # split
+    data_array_per_image_id = np.split(
+        data_and_image_id_array[:, : data_and_image_id_array.shape[1] - 1],
+        np.where(np.diff(data_and_image_id_array[:, -1]))[0] + 1,
+        axis=0,
+    )
+
+    # pad
+    max_bboxes_per_image = max([d.shape[0] for d in data_array_per_image_id])
+    data_array_per_image_id_with_nans = np.stack(
+        [
+            np.concat(
+                (
+                    d,
+                    np.full(
+                        (max_bboxes_per_image - d.shape[0], d.shape[1]), np.nan
+                    ),
+                )
+            ).squeeze()
+            for d in data_array_per_image_id
+        ],
+        axis=axis_image_id_in_output,  # 1, -1
+    )  # annotation_image_id, image_id, space
+
+    return data_array_per_image_id_with_nans
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Format bboxes data as xarray DataArray
+
+# # get bboxes coordinates per image
+# bbox_and_image_id_array = np.array(
+#     [annot["bbox"] + [annot["image_id"]]
+# for annot in coco_data["annotations"]]
+# )
+# bbox_array_per_image_id = np.split(
+#     bbox_and_image_id_array[:, :4],
+#     np.where(np.diff(bbox_and_image_id_array[:, -1]))[0] + 1,
+#     axis=0,
+# )
+
+# # pad missing annnotation-image-ids with np.nan
+# max_bboxes_per_image = max([d.shape[0] for d in bbox_array_per_image_id])
+# bbox_array_per_image_id_with_nans = np.stack(
+#     [
+#         np.concat(
+#             (
+#                 d,
+#                 np.full(
+#                     (max_bboxes_per_image - d.shape[0], d.shape[1]), np.nan
+#                 ),
+#             )
+#         ).squeeze()
+#         for d in bbox_array_per_image_id
+#     ],
+#     axis=1
+# )  #annotation_image_id, image_id, space
+
+# define bboxes data array
+bboxes_data = compute_homog_data_array_per_image_id(
+    "bbox", axis_image_id_in_output=1
+)
+bboxes_da = xr.DataArray.from_dict(
+    {
+        "dims": [
+            "annotation_image_id",
+            "image_id",
+            "space",
+        ],
+        "data": bboxes_data,
+        "coords": {
+            "annotation_image_id": {
+                "dims": "annotation_image_id",
+                "data": list(range(bboxes_data.shape[0])),  # ---------
+            },
+            "image_id": {
+                "dims": "image_id",
+                "data": np.unique(
+                    [annot["image_id"] for annot in coco_data["annotations"]]
+                ),
+            },
+            "space": {
+                "dims": "space",
+                "data": ["x", "y", "width", "height"],
+            },
+        },
+        # "attrs": {"title": "air temperature"},
+        "name": "bbox",
+    }
+)
+
+# %%%%%%%%%%%%%%%%%%%%
+# Format annotation ID as xarray DataArray
+
+# # get data
+# annot_and_image_id_array = np.array(
+#     [
+#         [annot["id"]] + [annot["image_id"]]
+#         for annot in coco_data["annotations"]
+#     ],
+#     dtype=int,
+# )
+
+# # split based on image id
+# annot_array_per_image_id = np.split(
+#     annot_and_image_id_array[:, 0].reshape(-1, 1),
+#     np.where(np.diff(annot_and_image_id_array[:, -1]))[0] + 1,
+#     axis=0,
+# )
+
+# # pad missing annnotation-image-ids with np.nan
+# # max_bboxes_per_image = max([d.shape[0] for d in annot_array_per_image_id])
+# annot_array_per_image_id_with_nans = np.stack(
+#     [
+#         np.concat(
+#             (
+#                 d,
+#                 np.full(
+#                     (max_bboxes_per_image - d.shape[0], d.shape[1]), np.nan
+#                 ),
+#             )
+#         ).squeeze()
+#         for d in annot_array_per_image_id
+#     ],
+#     axis=-1,
+#     # dtype=int
+# )  # annotation_image_id, image_id
+
+
+# define annot ID data array
+annot_ID_data = compute_homog_data_array_per_image_id(
+    "id", axis_image_id_in_output=-1
+)
+annotation_id_da = xr.DataArray.from_dict(
+    {
+        "dims": [
+            "annotation_image_id",
+            "image_id",
+        ],
+        "data": annot_ID_data,
+        "coords": {
+            "annotation_image_id": {
+                "dims": "annotation_image_id",
+                "data": list(range(annot_ID_data.shape[0])),  # ---------
+            },
+            "image_id": {
+                "dims": "image_id",
+                "data": np.unique(
+                    [annot["image_id"] for annot in coco_data["annotations"]]
+                ),
+            },
+        },
+        "attrs": {"title": "annotations ID per dataset"},
+        "name": "bbox",
+    }
+)
+
+
+# %%
+ds = xr.Dataset(
+    data_vars=dict(
+        bbox=(["annotation_image_id", "image_id", "space"], bboxes_da.data),
+        global_id=(
+            ["annotation_image_id", "image_id"],
+            annotation_id_da.data,
+        ),
+        # category=(["annotation_id", "category_id"], category_da),
+    ),
+    coords=dict(
+        annotation_image_id=bboxes_da.coords["annotation_image_id"],
+        image_id=bboxes_da.coords["image_id"],
+        space=bboxes_da.coords["space"],
+        # category_id=category_da.coords["category_id"],
+    ),
+    # attrs=dict(description="Weather related data."),
+)
+
+# %%%%%%%%%%%%%%%%%%%%
+# Inspect the dataset
+
+print(ds)
+
+# get all annotations in image 4
+ds.bbox.sel(image_id=4)
+
+
+# get the bbox coordinates of the annotation with global ID = 2
+# a.where(a.x + a.y < 4)
+ds.bbox.where(ds.global_id == 2, drop=True)
+
+# get the global ID of the third annotation per image
+ds.global_id.sel(annotation_image_id=3)
diff --git a/ethology/annotations/utils.py b/ethology/annotations/utils.py
new file mode 100644
index 0000000..925489a
--- /dev/null
+++ b/ethology/annotations/utils.py
@@ -0,0 +1,72 @@
+"""Utility functions to work with annotations in JSON format."""
+
+import json
+from pathlib import Path
+
+
+def read_json_file_as_dict(
+    file_path: Path,
+) -> dict:
+    """Read JSON file as dict.
+
+    Parameters
+    ----------
+    file_path : pathlib.Path
+        Path to the JSON file
+
+    Returns
+    -------
+    dict
+        Dictionary with the JSON data
+
+    """
+    try:
+        with open(file_path) as file:
+            return json.load(file)
+    except FileNotFoundError as not_found_error:
+        msg = f"File not found: {file_path}"
+        raise ValueError(msg) from not_found_error
+    except json.JSONDecodeError as decode_error:
+        msg = f"Error decoding JSON data from file: {file_path}"
+        raise ValueError(msg) from decode_error
+
+
+def read_via_json_file_as_dict(file_path: Path) -> dict:
+    """Read VIA JSON file as dict.
+
+    Parameters
+    ----------
+    file_path : pathlib.Path
+        Path to the VIA JSON file
+
+    Returns
+    -------
+    dict
+        Dictionary with the JSON data
+
+    """
+    # Read data
+    data = read_json_file_as_dict(file_path)
+
+    # Check the expected keys are defined in the JSON file
+    expected_keys = [
+        "_via_settings",
+        "_via_img_metadata",
+        "_via_attributes",
+        "_via_data_format_version",
+        "_via_image_id_list",
+    ]
+
+    for ky in expected_keys:
+        if ky not in data:
+            raise ValueError(
+                f"Expected key '{ky}' not found in file: {file_path}"
+            )
+
+    return data
+
+
+# def read_via_json_file_as_xarray(file_path: Path):
+
+
+#     via_dict = read_via_json_file_as_dict(file_path)
diff --git a/pyproject.toml b/pyproject.toml
index 3fb07d7..88443ce 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,9 @@ classifiers = [
 dependencies = [
   "movement",
 ]
+dependencies = [
+    "xarray", # "xarray[accel,viz]",
+]
 
 [project.urls]
 "Homepage" = "https://github.com/neuroinformatics-unit/ethology"

From 261e03c91cb9eae22833ce4840fc87450684dcfb Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Tue, 10 Dec 2024 20:25:38 +0000
Subject: [PATCH 02/36] Simplify

---
 .../annotations/notebook_annots_as_xarray.py  | 168 ++++--------------
 1 file changed, 35 insertions(+), 133 deletions(-)

diff --git a/ethology/annotations/notebook_annots_as_xarray.py b/ethology/annotations/notebook_annots_as_xarray.py
index bdcb59f..08e0a5c 100644
--- a/ethology/annotations/notebook_annots_as_xarray.py
+++ b/ethology/annotations/notebook_annots_as_xarray.py
@@ -1,4 +1,22 @@
-# %%
+"""Explore formatting COCO annotations as an xarray Dataset.
+
+The dataset is made up from the following data variables:
+- bbox:  a 3D array with bounding box coordinates and shape
+        (max_n_bboxes_per_image, n_images, 4).
+        The four coordinates represent (x, y, h, w) per annotation.
+- global_id: a 2D array of shape (max_n_bboxes_per_image, n_images) with
+        the global ID of each annotation.
+
+To add:
+- category: a 2D array of shape (max_n_bboxes_per_image, n_images) with
+        the category ID / str of each annotation.
+- split bbox into position and shape.
+- keep track of image filename?
+
+"""
+
+# %%%%%%%%%%%%%%%%%%%%
+# imports
 
 import numpy as np
 import xarray as xr
@@ -17,15 +35,16 @@
 # print(via_data.keys())  # _via_img_metadata, _via_image_id_list
 
 # %%%%%%%%%%%%%%%%%%%%
-# read as dict
+# read input json as dict
 coco_data = read_json_file_as_dict(coco_file_path)
 
 print(
     coco_data.keys()
 )  # dict_keys(['annotations', 'categories', 'images', 'info', 'licenses'])
-# %%%%%%%%%%%%%%%%%%%%
 
 
+# %%%%%%%%%%%%%%%%%%%%
+# helper fn to format data as homogeneous arrays
 def compute_homog_data_array_per_image_id(data_str, axis_image_id_in_output):
     # pair up data with image id
     pair_data = []
@@ -44,7 +63,7 @@ def compute_homog_data_array_per_image_id(data_str, axis_image_id_in_output):
         axis=0,
     )
 
-    # pad
+    # pad missing annotation-image IDs
     max_bboxes_per_image = max([d.shape[0] for d in data_array_per_image_id])
     data_array_per_image_id_with_nans = np.stack(
         [
@@ -65,153 +84,35 @@ def compute_homog_data_array_per_image_id(data_str, axis_image_id_in_output):
 
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Format bboxes data as xarray DataArray
-
-# # get bboxes coordinates per image
-# bbox_and_image_id_array = np.array(
-#     [annot["bbox"] + [annot["image_id"]]
-# for annot in coco_data["annotations"]]
-# )
-# bbox_array_per_image_id = np.split(
-#     bbox_and_image_id_array[:, :4],
-#     np.where(np.diff(bbox_and_image_id_array[:, -1]))[0] + 1,
-#     axis=0,
-# )
-
-# # pad missing annnotation-image-ids with np.nan
-# max_bboxes_per_image = max([d.shape[0] for d in bbox_array_per_image_id])
-# bbox_array_per_image_id_with_nans = np.stack(
-#     [
-#         np.concat(
-#             (
-#                 d,
-#                 np.full(
-#                     (max_bboxes_per_image - d.shape[0], d.shape[1]), np.nan
-#                 ),
-#             )
-#         ).squeeze()
-#         for d in bbox_array_per_image_id
-#     ],
-#     axis=1
-# )  #annotation_image_id, image_id, space
+# Format data
 
 # define bboxes data array
 bboxes_data = compute_homog_data_array_per_image_id(
     "bbox", axis_image_id_in_output=1
 )
-bboxes_da = xr.DataArray.from_dict(
-    {
-        "dims": [
-            "annotation_image_id",
-            "image_id",
-            "space",
-        ],
-        "data": bboxes_data,
-        "coords": {
-            "annotation_image_id": {
-                "dims": "annotation_image_id",
-                "data": list(range(bboxes_data.shape[0])),  # ---------
-            },
-            "image_id": {
-                "dims": "image_id",
-                "data": np.unique(
-                    [annot["image_id"] for annot in coco_data["annotations"]]
-                ),
-            },
-            "space": {
-                "dims": "space",
-                "data": ["x", "y", "width", "height"],
-            },
-        },
-        # "attrs": {"title": "air temperature"},
-        "name": "bbox",
-    }
-)
-
-# %%%%%%%%%%%%%%%%%%%%
-# Format annotation ID as xarray DataArray
-
-# # get data
-# annot_and_image_id_array = np.array(
-#     [
-#         [annot["id"]] + [annot["image_id"]]
-#         for annot in coco_data["annotations"]
-#     ],
-#     dtype=int,
-# )
-
-# # split based on image id
-# annot_array_per_image_id = np.split(
-#     annot_and_image_id_array[:, 0].reshape(-1, 1),
-#     np.where(np.diff(annot_and_image_id_array[:, -1]))[0] + 1,
-#     axis=0,
-# )
-
-# # pad missing annnotation-image-ids with np.nan
-# # max_bboxes_per_image = max([d.shape[0] for d in annot_array_per_image_id])
-# annot_array_per_image_id_with_nans = np.stack(
-#     [
-#         np.concat(
-#             (
-#                 d,
-#                 np.full(
-#                     (max_bboxes_per_image - d.shape[0], d.shape[1]), np.nan
-#                 ),
-#             )
-#         ).squeeze()
-#         for d in annot_array_per_image_id
-#     ],
-#     axis=-1,
-#     # dtype=int
-# )  # annotation_image_id, image_id
-
 
 # define annot ID data array
 annot_ID_data = compute_homog_data_array_per_image_id(
     "id", axis_image_id_in_output=-1
 )
-annotation_id_da = xr.DataArray.from_dict(
-    {
-        "dims": [
-            "annotation_image_id",
-            "image_id",
-        ],
-        "data": annot_ID_data,
-        "coords": {
-            "annotation_image_id": {
-                "dims": "annotation_image_id",
-                "data": list(range(annot_ID_data.shape[0])),  # ---------
-            },
-            "image_id": {
-                "dims": "image_id",
-                "data": np.unique(
-                    [annot["image_id"] for annot in coco_data["annotations"]]
-                ),
-            },
-        },
-        "attrs": {"title": "annotations ID per dataset"},
-        "name": "bbox",
-    }
-)
-
 
-# %%
+# %%%%%%%%%%%%%%%%%%%%
+# Create xarray Dataset
 ds = xr.Dataset(
     data_vars=dict(
-        bbox=(["annotation_image_id", "image_id", "space"], bboxes_da.data),
+        bbox=(["annotation_image_id", "image_id", "space"], bboxes_data),
         global_id=(
             ["annotation_image_id", "image_id"],
-            annotation_id_da.data,
+            annot_ID_data,
         ),
-        # category=(["annotation_id", "category_id"], category_da),
     ),
     coords=dict(
-        annotation_image_id=bboxes_da.coords["annotation_image_id"],
-        image_id=bboxes_da.coords["image_id"],
-        space=bboxes_da.coords["space"],
-        # category_id=category_da.coords["category_id"],
+        annotation_image_id=list(range(bboxes_data.shape[0])),
+        image_id=np.unique(
+            [annot["image_id"] for annot in coco_data["annotations"]]
+        ),
+        space=["x", "y", "width", "height"],
     ),
-    # attrs=dict(description="Weather related data."),
 )
 
 # %%%%%%%%%%%%%%%%%%%%
@@ -224,8 +125,9 @@ def compute_homog_data_array_per_image_id(data_str, axis_image_id_in_output):
 
 
 # get the bbox coordinates of the annotation with global ID = 2
-# a.where(a.x + a.y < 4)
 ds.bbox.where(ds.global_id == 2, drop=True)
 
 # get the global ID of the third annotation per image
 ds.global_id.sel(annotation_image_id=3)
+
+# %%

From 57dc2b9e1d3e3d8b3dac487e4fc00eec5e2e4eb1 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Wed, 11 Dec 2024 13:20:50 +0000
Subject: [PATCH 03/36] Add movement as dependency, fix indentation

---
 pyproject.toml | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 88443ce..0e4914f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,12 +17,17 @@ classifiers = [
   "Programming Language :: Python :: 3.12",
   "Operating System :: OS Independent",
   "License :: OSI Approved :: BSD License",
+  "Development Status :: 2 - Pre-Alpha",
+  "Programming Language :: Python",
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Operating System :: OS Independent",
+  "License :: OSI Approved :: BSD License",
 ]
 dependencies = [
-  "movement",
-]
-dependencies = [
-    "xarray", # "xarray[accel,viz]",
+  "movement"
 ]
 
 [project.urls]

From 5f16511a451badd55acb942ecf9e9622241e2713 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Wed, 11 Dec 2024 13:21:16 +0000
Subject: [PATCH 04/36] Add io module for reading manual annotations

---
 ethology/annotations/io.py | 106 +++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 ethology/annotations/io.py

diff --git a/ethology/annotations/io.py b/ethology/annotations/io.py
new file mode 100644
index 0000000..09339ba
--- /dev/null
+++ b/ethology/annotations/io.py
@@ -0,0 +1,106 @@
+"""Module for reading and writing manually labelled annotations."""
+
+import json
+from pathlib import Path
+
+import pandas as pd
+from movement.validators.files import ValidFile
+
+from ethology.annotations.validators import ValidJSON, ValidVIAUntrackedJSON
+
+STANDARD_DF_COLUMNS = [
+    "annotation_id",
+    "image_filename",
+    "image_id",
+    "x_min",
+    "y_min",
+    "width",
+    "height",
+    "superclass",
+    "class",
+]
+
+
+def df_from_via_json_file(file_path: Path):
+    """Validate and read untracked VIA JSON file.
+
+    The data is formatted as an untracked annotations DataFrame.
+    """
+    # General file validation
+    file = ValidFile(
+        file_path, expected_permission="r", expected_suffix=[".json"]
+    )
+
+    # JSON file validation
+    json_file = ValidJSON(file.path)
+
+    # VIA Untracked JSON schema validation
+    via_untracked_file = ValidVIAUntrackedJSON(json_file.path)
+
+    # Read as standard dataframe
+    return _df_from_validated_via_json_file(via_untracked_file.path)
+
+
+def _df_from_validated_via_json_file(file_path):
+    """Read VIA JSON file as standard untracked annotations DataFrame."""
+    # Read validated json as dict
+    with open(file_path) as file:
+        data_dict = json.load(file)
+
+    # Get relevant fields
+    image_metadata_dict = data_dict["_via_img_metadata"]
+    via_image_id_list = data_dict[
+        "_via_image_id_list"
+    ]  # ordered list of the keys in image_metadata_dict
+
+    # map filename to keys in image_metadata_dict
+    map_filename_to_via_img_id = {
+        img_dict["filename"]: ky
+        for ky, img_dict in image_metadata_dict.items()
+    }
+
+    # Build standard dataframe
+    list_rows = []
+    # loop thru images
+    for _, img_dict in image_metadata_dict.items():
+        # loop thru annotations in the image
+        for region in img_dict["regions"]:
+            region_shape = region["shape_attributes"]
+            region_attributes = region["region_attributes"]
+
+            # append annotations to df
+            list_rows.append(
+                {
+                    "image_filename": img_dict["filename"],
+                    "x_min": region_shape["x"],
+                    "y_min": region_shape["y"],
+                    "width": region_shape["width"],
+                    "height": region_shape["height"],
+                    "superclass": list(region_attributes.keys())[
+                        0
+                    ],  # takes first key as superclass
+                    "class": region_attributes[
+                        list(region_attributes.keys())[0]
+                    ],
+                },
+            )
+
+    df = pd.DataFrame(
+        list_rows,
+        columns=[
+            col for col in STANDARD_DF_COLUMNS if not col.endswith("_id")
+        ],
+    )
+
+    # add image_id column
+    df["image_id"] = df["image_filename"].apply(
+        lambda x: via_image_id_list.index(map_filename_to_via_img_id[x])
+    )
+
+    # add annotation_id column based on index
+    df["annotation_id"] = df.index
+
+    # reorder columns to match standard
+    df = df.reindex(columns=STANDARD_DF_COLUMNS)
+
+    return df

From 0945291a26afad9a0e154945d88f6993f80ba9bf Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Wed, 11 Dec 2024 13:21:41 +0000
Subject: [PATCH 05/36] Add validators for manual annotation files

---
 ethology/annotations/validators.py | 144 +++++++++++++++++++++++++++++
 1 file changed, 144 insertions(+)
 create mode 100644 ethology/annotations/validators.py

diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py
new file mode 100644
index 0000000..a73f9a6
--- /dev/null
+++ b/ethology/annotations/validators.py
@@ -0,0 +1,144 @@
+import json
+from pathlib import Path
+
+import jsonschema
+import jsonschema.exceptions
+from attrs import define, field, validators
+
+# 
+
+
+@define
+class ValidJSON:
+    """Class for validating JSON files.
+
+    Attributes
+    ----------
+    path : pathlib.Path
+        Path to the JSON file.
+
+    Raises
+    ------
+    ValueError
+        If the file cannot be found or if its content cannot be decoded
+        as JSON.
+
+    """
+
+    path: Path = field(validator=validators.instance_of(Path))
+
+    @path.validator
+    def _file_is_json(self, attribute, value):
+        """Ensure that the file is a JSON file."""
+        try:
+            with open(value) as file:
+                json.load(file)
+        except FileNotFoundError as not_found_error:
+            raise ValueError(f"File not found: {value}") from not_found_error
+        except json.JSONDecodeError as decode_error:
+            raise ValueError(
+                f"Error decoding JSON data from file: {value}"
+            ) from decode_error
+
+
+@define
+class ValidVIAUntrackedJSON:
+    """Class for validating VIA JSON files for untracked data.
+
+    The validator ensures that the file matches the expected schema.
+
+    https://json-schema.org/understanding-json-schema/reference/object#additional-properties
+
+    Attributes
+    ----------
+    path : pathlib.Path
+        Path to the JSON file.
+
+    Raises
+    ------
+    ValueError
+        If the JSON file does not match the expected schema.
+
+    """
+
+    path: Path = field(validator=validators.instance_of(Path))
+    # expected_schema: dict = field(factory=dict, kw_only=True)
+    # https://stackoverflow.com/questions/16222633/how-would-you-design-json-schema-for-an-arbitrary-key
+
+    @path.validator
+    def _file_macthes_VIA_JSON_schema(self, attribute, value):
+        """Ensure that the JSON file matches the expected schema."""
+        # should the schema be an attribute?
+        VIA_JSON_schema = {
+            "type": "object",
+            "properties": {
+                "_via_settings": {
+                    "type": "object",
+                    "properties": {
+                        "ui": {"type": "object"},
+                        "core": {"type": "object"},
+                        "project": {"type": "object"},
+                    },
+                },
+                "_via_img_metadata": {
+                    "type": "object",
+                    "additionalProperties": {  # ---- does this work?
+                        "type": "object",
+                        "properties": {
+                            "filename": {"type": "string"},
+                            "size": {"type": "integer"},
+                            "regions": {
+                                "type": "list",  # does this work?
+                                "properties": {
+                                    "shape_attributes": {
+                                        "type": "object",
+                                        "properties": {
+                                            "name": {"type": "string"},
+                                            "x": {"type": "integer"},
+                                            "y": {"type": "integer"},
+                                            "width": {"type": "integer"},
+                                            "height": {"type": "integer"},
+                                        },
+                                        "region_attributes": {
+                                            "type": "object"
+                                        },
+                                    },
+                                },
+                            },
+                            "file_attributes": {"type": "object"},
+                        },
+                    },
+                },
+                "_via_attributes": {
+                    "type": "dict",
+                    "properties": {
+                        "region": {"type": "dict"},
+                        "file": {"type": "dict"},
+                    },
+                },
+                "_via_data_format_version": {"type": "string"},
+                "_via_image_id_list": {"type": "list"},
+            },
+        }
+
+        # should have been validated with ValidJSON
+        with open(value) as file:
+            data = json.load(file)
+
+        # check schema
+        try:
+            jsonschema.validate(instance=data, schema=VIA_JSON_schema)
+        except jsonschema.exceptions.ValidationError as val_err:
+            raise ValueError(
+                "The JSON data does not match "
+                f"the provided schema: {VIA_JSON_schema}"
+            ) from val_err
+        # except jsonschema.exceptions.SchemaError as schema_err:
+        #     raise ValueError(
+        #         f"Invalid schema provided: {VIA_JSON_schema}"
+        #     ) from schema_err
+
+
+@define
+class ValidCOCOUntrackedJSON:
+    pass

From fd2e30c14ad8b4baf3945c32dc7cc438cfc91725 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Wed, 11 Dec 2024 13:36:58 +0000
Subject: [PATCH 06/36] Remove notebook for xarray

---
 .../annotations/notebook_annots_as_xarray.py  | 133 ------------------
 1 file changed, 133 deletions(-)
 delete mode 100644 ethology/annotations/notebook_annots_as_xarray.py

diff --git a/ethology/annotations/notebook_annots_as_xarray.py b/ethology/annotations/notebook_annots_as_xarray.py
deleted file mode 100644
index 08e0a5c..0000000
--- a/ethology/annotations/notebook_annots_as_xarray.py
+++ /dev/null
@@ -1,133 +0,0 @@
-"""Explore formatting COCO annotations as an xarray Dataset.
-
-The dataset is made up from the following data variables:
-- bbox:  a 3D array with bounding box coordinates and shape
-        (max_n_bboxes_per_image, n_images, 4).
-        The four coordinates represent (x, y, h, w) per annotation.
-- global_id: a 2D array of shape (max_n_bboxes_per_image, n_images) with
-        the global ID of each annotation.
-
-To add:
-- category: a 2D array of shape (max_n_bboxes_per_image, n_images) with
-        the category ID / str of each annotation.
-- split bbox into position and shape.
-- keep track of image filename?
-
-"""
-
-# %%%%%%%%%%%%%%%%%%%%
-# imports
-
-import numpy as np
-import xarray as xr
-from utils import read_json_file_as_dict
-
-# %%%%%%%%%%%%%%%%%%%
-# input data
-via_file_path = (
-    "/home/sminano/swc/project_ethology/sample_VIA_annotations/VIA_JSON_1.json"
-)
-coco_file_path = (
-    "/home/sminano/swc/project_ethology/sample_COCO_annotations/sample_annotations_1.json"
-)
-
-# via_data = read_via_json_file_as_dict(via_file_path)
-# print(via_data.keys())  # _via_img_metadata, _via_image_id_list
-
-# %%%%%%%%%%%%%%%%%%%%
-# read input json as dict
-coco_data = read_json_file_as_dict(coco_file_path)
-
-print(
-    coco_data.keys()
-)  # dict_keys(['annotations', 'categories', 'images', 'info', 'licenses'])
-
-
-# %%%%%%%%%%%%%%%%%%%%
-# helper fn to format data as homogeneous arrays
-def compute_homog_data_array_per_image_id(data_str, axis_image_id_in_output):
-    # pair up data with image id
-    pair_data = []
-    for annot in coco_data["annotations"]:
-        if isinstance(annot[data_str], list):
-            pair_data.append(annot[data_str] + [annot["image_id"]])
-        else:
-            pair_data.append([annot[data_str], annot["image_id"]])
-
-    data_and_image_id_array = np.array(pair_data)
-
-    # split
-    data_array_per_image_id = np.split(
-        data_and_image_id_array[:, : data_and_image_id_array.shape[1] - 1],
-        np.where(np.diff(data_and_image_id_array[:, -1]))[0] + 1,
-        axis=0,
-    )
-
-    # pad missing annotation-image IDs
-    max_bboxes_per_image = max([d.shape[0] for d in data_array_per_image_id])
-    data_array_per_image_id_with_nans = np.stack(
-        [
-            np.concat(
-                (
-                    d,
-                    np.full(
-                        (max_bboxes_per_image - d.shape[0], d.shape[1]), np.nan
-                    ),
-                )
-            ).squeeze()
-            for d in data_array_per_image_id
-        ],
-        axis=axis_image_id_in_output,  # 1, -1
-    )  # annotation_image_id, image_id, space
-
-    return data_array_per_image_id_with_nans
-
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Format data
-
-# define bboxes data array
-bboxes_data = compute_homog_data_array_per_image_id(
-    "bbox", axis_image_id_in_output=1
-)
-
-# define annot ID data array
-annot_ID_data = compute_homog_data_array_per_image_id(
-    "id", axis_image_id_in_output=-1
-)
-
-# %%%%%%%%%%%%%%%%%%%%
-# Create xarray Dataset
-ds = xr.Dataset(
-    data_vars=dict(
-        bbox=(["annotation_image_id", "image_id", "space"], bboxes_data),
-        global_id=(
-            ["annotation_image_id", "image_id"],
-            annot_ID_data,
-        ),
-    ),
-    coords=dict(
-        annotation_image_id=list(range(bboxes_data.shape[0])),
-        image_id=np.unique(
-            [annot["image_id"] for annot in coco_data["annotations"]]
-        ),
-        space=["x", "y", "width", "height"],
-    ),
-)
-
-# %%%%%%%%%%%%%%%%%%%%
-# Inspect the dataset
-
-print(ds)
-
-# get all annotations in image 4
-ds.bbox.sel(image_id=4)
-
-
-# get the bbox coordinates of the annotation with global ID = 2
-ds.bbox.where(ds.global_id == 2, drop=True)
-
-# get the global ID of the third annotation per image
-ds.global_id.sel(annotation_image_id=3)
-
-# %%

From c69fea9f4c789305ad7addc775cc25f2374e89a4 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Wed, 11 Dec 2024 13:37:11 +0000
Subject: [PATCH 07/36] Fix schema for validation

---
 ethology/annotations/validators.py | 66 ++++++++++++++++++------------
 1 file changed, 40 insertions(+), 26 deletions(-)

diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py
index a73f9a6..88a5c03 100644
--- a/ethology/annotations/validators.py
+++ b/ethology/annotations/validators.py
@@ -1,3 +1,5 @@
+"""Validators for annotation files."""
+
 import json
 from pathlib import Path
 
@@ -5,8 +7,6 @@
 import jsonschema.exceptions
 from attrs import define, field, validators
 
-# 
-
 
 @define
 class ValidJSON:
@@ -63,15 +63,16 @@ class ValidVIAUntrackedJSON:
 
     path: Path = field(validator=validators.instance_of(Path))
     # expected_schema: dict = field(factory=dict, kw_only=True)
-    # https://stackoverflow.com/questions/16222633/how-would-you-design-json-schema-for-an-arbitrary-key
 
     @path.validator
     def _file_macthes_VIA_JSON_schema(self, attribute, value):
         """Ensure that the JSON file matches the expected schema."""
-        # should the schema be an attribute?
+        # Define schema for VIA JSON file for untracked
+        # (aka manually labelled) data
         VIA_JSON_schema = {
             "type": "object",
             "properties": {
+                # settings for browser UI
                 "_via_settings": {
                     "type": "object",
                     "properties": {
@@ -80,27 +81,33 @@ def _file_macthes_VIA_JSON_schema(self, attribute, value):
                         "project": {"type": "object"},
                     },
                 },
+                # annotation data
                 "_via_img_metadata": {
                     "type": "object",
-                    "additionalProperties": {  # ---- does this work?
+                    "additionalProperties": {
+                        # "additionalProperties" to allow any key,
+                        # see https://stackoverflow.com/a/69811612/24834957
                         "type": "object",
                         "properties": {
                             "filename": {"type": "string"},
                             "size": {"type": "integer"},
                             "regions": {
-                                "type": "list",  # does this work?
-                                "properties": {
-                                    "shape_attributes": {
-                                        "type": "object",
-                                        "properties": {
-                                            "name": {"type": "string"},
-                                            "x": {"type": "integer"},
-                                            "y": {"type": "integer"},
-                                            "width": {"type": "integer"},
-                                            "height": {"type": "integer"},
-                                        },
-                                        "region_attributes": {
-                                            "type": "object"
+                                "type": "array",  # a list of dicts
+                                "items": {
+                                    "type": "object",
+                                    "properties": {
+                                        "shape_attributes": {
+                                            "type": "object",
+                                            "properties": {
+                                                "name": {"type": "string"},
+                                                "x": {"type": "integer"},
+                                                "y": {"type": "integer"},
+                                                "width": {"type": "integer"},
+                                                "height": {"type": "integer"},
+                                            },
+                                            "region_attributes": {
+                                                "type": "object"
+                                            },  # we just check it's a dict
                                         },
                                     },
                                 },
@@ -109,15 +116,22 @@ def _file_macthes_VIA_JSON_schema(self, attribute, value):
                         },
                     },
                 },
+                # ordered list of image keys
+                # - the position defines the image ID
+                "_via_image_id_list": {
+                    "type": "array",
+                    "items": {"type": "string"},
+                },
+                # region (aka annotation) and file attributes for VIA UI
                 "_via_attributes": {
-                    "type": "dict",
+                    "type": "object",
                     "properties": {
-                        "region": {"type": "dict"},
-                        "file": {"type": "dict"},
+                        "region": {"type": "object"},
+                        "file": {"type": "object"},
                     },
                 },
+                # version of the VIA data format
                 "_via_data_format_version": {"type": "string"},
-                "_via_image_id_list": {"type": "list"},
             },
         }
 
@@ -125,7 +139,7 @@ def _file_macthes_VIA_JSON_schema(self, attribute, value):
         with open(value) as file:
             data = json.load(file)
 
-        # check schema
+        # check against schema
         try:
             jsonschema.validate(instance=data, schema=VIA_JSON_schema)
         except jsonschema.exceptions.ValidationError as val_err:
@@ -139,6 +153,6 @@ def _file_macthes_VIA_JSON_schema(self, attribute, value):
         #     ) from schema_err
 
 
-@define
-class ValidCOCOUntrackedJSON:
-    pass
+# @define
+# class ValidCOCOUntrackedJSON:
+#     pass

From 0a1d64e6c3464a5c73d8c13adc0ce40465f8ce70 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Wed, 11 Dec 2024 14:27:55 +0000
Subject: [PATCH 08/36] Add validation and loading for COCO style file

---
 ethology/annotations/io.py         | 119 +++++++++++++++++++++++------
 ethology/annotations/validators.py | 118 +++++++++++++++++++++++++---
 2 files changed, 203 insertions(+), 34 deletions(-)

diff --git a/ethology/annotations/io.py b/ethology/annotations/io.py
index 09339ba..700b92f 100644
--- a/ethology/annotations/io.py
+++ b/ethology/annotations/io.py
@@ -6,7 +6,11 @@
 import pandas as pd
 from movement.validators.files import ValidFile
 
-from ethology.annotations.validators import ValidJSON, ValidVIAUntrackedJSON
+from ethology.annotations.validators import (
+    ValidCOCOUntrackedJSON,
+    ValidJSON,
+    ValidVIAUntrackedJSON,
+)
 
 STANDARD_DF_COLUMNS = [
     "annotation_id",
@@ -16,15 +20,15 @@
     "y_min",
     "width",
     "height",
-    "superclass",
-    "class",
+    "supercategory",
+    "category",
 ]
 
 
-def df_from_via_json_file(file_path: Path):
+def df_from_via_json_file(file_path: Path) -> pd.DataFrame:
     """Validate and read untracked VIA JSON file.
 
-    The data is formated as an untracked annotations DataFrame.
+    The data is formatted as an untracked annotations DataFrame.
     """
     # General file validation
     file = ValidFile(
@@ -41,13 +45,30 @@ def df_from_via_json_file(file_path: Path):
     return _df_from_validated_via_json_file(via_untracked_file.path)
 
 
+def df_from_coco_json_file(file_path: Path) -> pd.DataFrame:
+    """Validate and read COCO JSON file."""
+    # General file validation
+    file = ValidFile(
+        file_path, expected_permission="r", expected_suffix=[".json"]
+    )
+
+    # JSON file validation
+    json_file = ValidJSON(file.path)
+
+    # COCO Untracked JSON schema validation
+    coco_untracked_file = ValidCOCOUntrackedJSON(json_file.path)
+
+    # Read as standard dataframe
+    return _df_from_validated_coco_json_file(coco_untracked_file.path)
+
+
 def _df_from_validated_via_json_file(file_path):
     """Read VIA JSON file as standard untracked annotations DataFrame."""
     # Read validated json as dict
     with open(file_path) as file:
         data_dict = json.load(file)
 
-    # Get relevant fields
+    # Prepare data
     image_metadata_dict = data_dict["_via_img_metadata"]
     via_image_id_list = data_dict[
         "_via_image_id_list"
@@ -68,28 +89,26 @@ def _df_from_validated_via_json_file(file_path):
             region_shape = region["shape_attributes"]
             region_attributes = region["region_attributes"]
 
+            row = {
+                "image_filename": img_dict["filename"],
+                "x_min": region_shape["x"],
+                "y_min": region_shape["y"],
+                "width": region_shape["width"],
+                "height": region_shape["height"],
+                "supercategory": list(region_attributes.keys())[
+                    0
+                ],  # takes first key as supercategory
+                "category": region_attributes[
+                    list(region_attributes.keys())[0]
+                ],
+            }
+
             # append annotations to df
-            list_rows.append(
-                {
-                    "image_filename": img_dict["filename"],
-                    "x_min": region_shape["x"],
-                    "y_min": region_shape["y"],
-                    "width": region_shape["width"],
-                    "height": region_shape["height"],
-                    "superclass": list(region_attributes.keys())[
-                        0
-                    ],  # takes first key as superclass
-                    "class": region_attributes[
-                        list(region_attributes.keys())[0]
-                    ],
-                },
-            )
+            list_rows.append(row)
 
     df = pd.DataFrame(
         list_rows,
-        columns=[
-            col for col in STANDARD_DF_COLUMNS if not col.endswith("_id")
-        ],
+        # columns=list(row.keys()),  # do I need this?
     )
 
     # add image_id column
@@ -104,3 +123,55 @@ def _df_from_validated_via_json_file(file_path):
     df = df.reindex(columns=STANDARD_DF_COLUMNS)
 
     return df
+
+
+def _df_from_validated_coco_json_file(file_path: Path) -> pd.DataFrame:
+    """Read COCO JSON file as standard untracked annotations DataFrame."""
+    # Read validated json as dict
+    with open(file_path) as file:
+        data_dict = json.load(file)
+
+    # Lookup tables keyed by the IDs that each annotation refers to
+    map_image_id_to_filename = {
+        img_dict["id"]: img_dict["file_name"]
+        for img_dict in data_dict["images"]
+    }
+
+    map_category_id_to_category_data = {
+        cat_dict["id"]: (cat_dict["name"], cat_dict["supercategory"])
+        for cat_dict in data_dict["categories"]
+    }
+
+    # Build standard dataframe: one row per annotation
+    list_rows = []
+    for annot_dict in data_dict["annotations"]:
+        annotation_id = annot_dict["id"]
+        # image data
+        image_id = annot_dict["image_id"]
+        image_filename = map_image_id_to_filename[image_id]
+
+        # bbox data, unpacked in the order x_min, y_min, width, height
+        x_min, y_min, width, height = annot_dict["bbox"]
+
+        # category data (name and supercategory)
+        category_id = annot_dict["category_id"]
+        category, supercategory = map_category_id_to_category_data[category_id]
+
+        row = {
+            "annotation_id": annotation_id,
+            "image_filename": image_filename,
+            "image_id": image_id,
+            "x_min": x_min,
+            "y_min": y_min,
+            "width": width,
+            "height": height,
+            "supercategory": supercategory,
+            "category": category,
+        }
+
+        list_rows.append(row)
+
+    df = pd.DataFrame(list_rows)
+    df = df.reindex(columns=STANDARD_DF_COLUMNS)  # reindex is not in-place
+
+    return df
diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py
index 88a5c03..6a2b998 100644
--- a/ethology/annotations/validators.py
+++ b/ethology/annotations/validators.py
@@ -46,8 +46,9 @@ class ValidVIAUntrackedJSON:
     """Class for validating VIA JSON files for untracked data.
 
     The validator ensures that the file matches the expected schema.
+    The schema validation only checks the type for each specified
+    key if it exists. It does not check for the presence of the keys.
 
-    https://json-schema.org/understanding-json-schema/reference/object#additional-properties
 
     Attributes
     ----------
@@ -59,10 +60,16 @@ class ValidVIAUntrackedJSON:
     ValueError
         If the JSON file does not match the expected schema.
 
+    Notes
+    -----
+    https://json-schema.org/understanding-json-schema/
+
     """
 
+    # TODO: add a check for the presence of the keys
+    # that I use in loading the data
+
     path: Path = field(validator=validators.instance_of(Path))
-    # expected_schema: dict = field(factory=dict, kw_only=True)
 
     @path.validator
     def _file_macthes_VIA_JSON_schema(self, attribute, value):
@@ -135,7 +142,8 @@ def _file_macthes_VIA_JSON_schema(self, attribute, value):
             },
         }
 
-        # should have been validated with ValidVIAUntrackedJSON
+        # should have been validated with ValidJSON
+        # already so this should work fine
         with open(value) as file:
             data = json.load(file)
 
@@ -147,12 +155,102 @@ def _file_macthes_VIA_JSON_schema(self, attribute, value):
                 "The JSON data does not match "
                 f"the provided schema: {VIA_JSON_schema}"
             ) from val_err
-        # except jsonschema.exceptions.SchemaError as schema_err:
-        #     raise ValueError(
-        #         f"Invalid schema provided: {VIA_JSON_schema}"
-        #     ) from schema_err
 
 
-# @define
-# class ValidCOCOUntrackedJSON:
-#     pass
+@define
+class ValidCOCOUntrackedJSON:
+    """Class for validating COCO JSON files for untracked data.
+
+    The validator ensures that the file matches the expected schema.
+    The schema validation only checks the type for each specified
+    key if it exists. It does not check for the presence of the keys.
+
+    Attributes
+    ----------
+    path : pathlib.Path
+        Path to the JSON file.
+
+    Raises
+    ------
+    ValueError
+        If the JSON file does not match the expected schema.
+
+    Notes
+    -----
+    https://json-schema.org/understanding-json-schema/
+
+    """
+
+    path: Path = field(validator=validators.instance_of(Path))
+
+    # TODO: add a check for the presence of the keys
+    # that I use in loading the data
+
+    @path.validator
+    def _file_macthes_COCO_JSON_schema(self, attribute, value):
+        """Ensure that the JSON file matches the expected schema."""
+        # Define schema for COCO JSON file for untracked
+        # (aka manually labelled) data
+        COCO_JSON_schema = {
+            "type": "object",
+            "properties": {
+                "info": {"type": "object"},
+                "licenses": {
+                    "type": "array",
+                },
+                "images": {
+                    "type": "array",
+                    "items": {
+                        "type": "object",
+                        "properties": {
+                            "file_name": {"type": "string"},
+                            "id": {"type": "integer"},
+                            "width": {"type": "integer"},
+                            "height": {"type": "integer"},
+                        },
+                    },
+                },
+                "annotations": {
+                    "type": "array",
+                    "items": {
+                        "type": "object",
+                        "properties": {
+                            "id": {"type": "integer"},  # annotation global ID
+                            "image_id": {"type": "integer"},
+                            "bbox": {
+                                "type": "array",
+                                "items": {"type": "integer"},
+                            },
+                            "category_id": {"type": "integer"},
+                            "area": {"type": "integer"},
+                            "iscrowd": {"type": "integer"},
+                        },
+                    },
+                },
+                "categories": {
+                    "type": "array",
+                    "items": {
+                        "type": "object",
+                        "properties": {
+                            "id": {"type": "integer"},
+                            "name": {"type": "string"},
+                            "supercategory": {"type": "string"},
+                        },
+                    },
+                },
+            },
+        }
+
+        # should have been validated with ValidJSON
+        # already so this should work fine
+        with open(value) as file:
+            data = json.load(file)
+
+        # check against schema
+        try:
+            jsonschema.validate(instance=data, schema=COCO_JSON_schema)
+        except jsonschema.exceptions.ValidationError as val_err:
+            raise ValueError(
+                "The JSON data does not match "
+                f"the provided schema: {COCO_JSON_schema}"
+            ) from val_err

From 57c94ece89c1bab772ab4a10a9614642c9185a29 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Wed, 11 Dec 2024 19:21:24 +0000
Subject: [PATCH 09/36] Keys check draft

---
 ethology/annotations/validators.py | 65 ++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py
index 6a2b998..a5e1898 100644
--- a/ethology/annotations/validators.py
+++ b/ethology/annotations/validators.py
@@ -156,6 +156,71 @@ def _file_macthes_VIA_JSON_schema(self, attribute, value):
                 f"the provided schema: {VIA_JSON_schema}"
             ) from val_err
 
+    @path.validator
+    def _file_contains_required_keys(self, attribute, value):
+        """Ensure that the JSON file contains the required keys."""
+        required_keys_main = [
+            "_via_img_metadata",
+            "_via_image_id_list",
+        ]
+
+        required_keys_img_metadata_dicts = [
+            "filename",
+            "regions",
+        ]
+
+        required_keys_region_dicts = [
+            "shape_attributes",
+            "region_attributes",
+        ]
+
+        required_keys_shape_attributes_dicts = [
+            "x",
+            "y",
+            "width",
+            "height",
+        ]
+
+        with open(value) as file:
+            data = json.load(file)
+
+        # check keys first level
+        for key in required_keys_main:
+            if key not in data:
+                raise ValueError(
+                    f"Key '{key}' not found in first level "
+                    f"of the JSON input file: {value}"
+                )
+
+        # check keys in each of the _via_img_metadata dicts
+        for key in required_keys_img_metadata_dicts:
+            for img_str, img_dict in data["_via_img_metadata"]:
+                if key not in img_dict:
+                    raise ValueError(
+                        f"Key '{key}' not found for {img_str}"
+                        " under _via_img_metadata"
+                    )
+
+        # check keys under regions
+        for key in required_keys_region_dicts:
+            for img_str, img_dict in data["_via_img_metadata"]:
+                for region in img_dict["regions"]:
+                    if key not in region:
+                        raise ValueError(
+                            f"Key '{key}' not found for region"
+                            f" under {img_str}"
+                        )
+
+        # check keys under shape_attributes
+        for key in required_keys_shape_attributes_dicts:
+            for img_str, img_dict in data["_via_img_metadata"]:
+                for region in img_dict["regions"]:
+                    if key not in region["shape_attributes"]:
+                        raise ValueError(
+                            f"Key 'shape_attributes > {key}' not found "
+                            f"for region under {img_str}"
+                        )
+
 
 @define
 class ValidCOCOUntrackedJSON:

From 1d19eb984bed26f003cc0dcd3c8d743581f232a1 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Wed, 11 Dec 2024 19:49:20 +0000
Subject: [PATCH 10/36] Improve keys check

---
 ethology/annotations/validators.py | 97 +++++++++++++-----------------
 1 file changed, 41 insertions(+), 56 deletions(-)

diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py
index a5e1898..862a336 100644
--- a/ethology/annotations/validators.py
+++ b/ethology/annotations/validators.py
@@ -159,68 +159,53 @@ def _file_macthes_VIA_JSON_schema(self, attribute, value):
     @path.validator
     def _file_contains_required_keys(self, attribute, value):
         """Ensure that the JSON file contains the required keys."""
-        required_keys_main = [
-            "_via_img_metadata",
-            "_via_image_id_list",
-        ]
-
-        required_keys_img_metadata_dicts = [
-            "filename",
-            "regions",
-        ]
-
-        required_keys_region_dicts = [
-            "shape_attributes",
-            "region_attributes",
-        ]
-
-        required_keys_shape_attributes_dicts = [
-            "x",
-            "y",
-            "width",
-            "height",
-        ]
+        required_keys = {
+            "main": ["_via_img_metadata", "_via_image_id_list"],
+            "image_keys": ["filename", "regions"],
+            "region_keys": ["shape_attributes", "region_attributes"],
+            "shape_attributes_keys": ["x", "y", "width", "height"],
+        }
+
+        def _check_keys(
+            list_required_keys: list[str],
+            data_dict: dict,
+            additional_error_message: str = "",
+        ):
+            missing_keys = set(list_required_keys) - data_dict.keys()
+            if missing_keys:
+                raise ValueError(
+                    f"Required key(s) {missing_keys} not "
+                    f"found in {list(data_dict.keys())} "
+                    + additional_error_message
+                    + "."
+                )
 
+        # Read data as dict
         with open(value) as file:
             data = json.load(file)
 
-        # check keys first level
-        for key in required_keys_main:
-            if key not in data:
-                raise ValueError(
-                    f"Key '{key}' not found in first level "
-                    f"of the JSON input file: {value}"
+        # Check first level keys
+        _check_keys(required_keys["main"], data)
+
+        # Check keys in nested dicts
+        for img_str, img_dict in data["_via_img_metadata"].items():
+            # Check keys for each image dictionary
+            _check_keys(
+                required_keys["image_keys"],
+                img_dict,
+                additional_error_message=f"for {img_str}",
+            )
+            # Check keys for each region
+            for region in img_dict["regions"]:
+                _check_keys(required_keys["region_keys"], region)
+
+                # Check keys under shape_attributes
+                _check_keys(
+                    required_keys["shape_attributes_keys"],
+                    region["shape_attributes"],
+                    additional_error_message=f"for region under {img_str}",
                 )
 
-        # check keys in each of the _via_img_metadata dicts
-        for key in required_keys_img_metadata_dicts:
-            for img_str, img_dict in data["_via_img_metadata"]:
-                if key not in img_dict:
-                    raise ValueError(
-                        f"Key '{key}' not found for {img_str}"
-                        " under _via_img_metadata"
-                    )
-
-        # check keys under regions
-        for key in required_keys_region_dicts:
-            for img_str, img_dict in data["_via_img_metadata"]:
-                for region in img_dict["regions"]:
-                    if key not in region:
-                        raise ValueError(
-                            f"Key '{key}' not found for region"
-                            f" under {img_str}"
-                        )
-
-        # check keys under shape_attributes
-        for key in required_keys_shape_attributes_dicts:
-            for img_str, img_dict in data["_via_img_metadata"]:
-                for region in img_dict["regions"]:
-                    if key not in region["shape_attributes"]:
-                        raise ValueError(
-                            f"Key 'shape_attributes > {key}' not found "
-                            f"for region under {img_str}"
-                        )
-
 
 @define
 class ValidCOCOUntrackedJSON:

From 97df63c1eedac3296da50fcc9fa886f1f41fd93e Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Wed, 11 Dec 2024 19:57:25 +0000
Subject: [PATCH 11/36] Add keys check for COCO untracked json validator

---
 ethology/annotations/validators.py | 73 ++++++++++++++++++++++++------
 1 file changed, 59 insertions(+), 14 deletions(-)

diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py
index 862a336..20b1317 100644
--- a/ethology/annotations/validators.py
+++ b/ethology/annotations/validators.py
@@ -166,20 +166,6 @@ def _file_contains_required_keys(self, attribute, value):
             "shape_attributes_keys": ["x", "y", "width", "height"],
         }
 
-        def _check_keys(
-            list_required_keys: list[str],
-            data_dict: dict,
-            additional_error_message: str = "",
-        ):
-            missing_keys = set(list_required_keys) - data_dict.keys()
-            if missing_keys:
-                raise ValueError(
-                    f"Required key(s) {missing_keys} not "
-                    f"found in {list(data_dict.keys())} "
-                    + additional_error_message
-                    + "."
-                )
-
         # Read data as dict
         with open(value) as file:
             data = json.load(file)
@@ -304,3 +290,62 @@ def _file_macthes_COCO_JSON_schema(self, attribute, value):
                 "The JSON data does not match "
                 f"the provided schema: {COCO_JSON_schema}"
             ) from val_err
+
+    @path.validator
+    def _file_contains_required_keys(self, attribute, value):
+        """Ensure that the JSON file contains the required keys."""
+        required_keys = {
+            "main": ["images", "annotations", "categories"],
+            "image_keys": [
+                "id",
+                "file_name",
+            ],  # add height and width of image?
+            "annotations_keys": ["id", "image_id", "bbox", "category_id"],
+            "categories_keys": ["id", "name", "supercategory"],
+        }
+
+        # Read data as dict
+        with open(value) as file:
+            data = json.load(file)
+
+        # Check first level keys
+        _check_keys(required_keys["main"], data)
+
+        # Check keys in images dicts
+        for img_dict in data["images"]:
+            _check_keys(
+                required_keys["image_keys"],
+                img_dict,
+                additional_error_message=f"for image dict {img_dict}",
+            )
+
+        # Check keys in annotations dicts
+        for annot_dict in data["annotations"]:
+            _check_keys(
+                required_keys["annotations_keys"],
+                annot_dict,
+                additional_error_message=f"for annotation dict {annot_dict}",
+            )
+
+        # Check keys in categories dicts
+        for cat_dict in data["categories"]:
+            _check_keys(
+                required_keys["categories_keys"],
+                cat_dict,
+                additional_error_message=f"for category dict {cat_dict}",
+            )
+
+
+def _check_keys(
+    list_required_keys: list[str],
+    data_dict: dict,
+    additional_error_message: str = "",
+):
+    missing_keys = set(list_required_keys) - data_dict.keys()
+    if missing_keys:
+        raise ValueError(
+            f"Required key(s) {missing_keys} not "
+            f"found in {list(data_dict.keys())} "
+            + additional_error_message
+            + "."
+        )

From 029bb8b921eed28e4fef25da3e4f6361a4aab1f5 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Wed, 11 Dec 2024 20:02:20 +0000
Subject: [PATCH 12/36] Remove some comments

---
 ethology/annotations/io.py | 39 +++++++++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/ethology/annotations/io.py b/ethology/annotations/io.py
index 700b92f..129f241 100644
--- a/ethology/annotations/io.py
+++ b/ethology/annotations/io.py
@@ -29,16 +29,23 @@ def df_from_via_json_file(file_path: Path) -> pd.DataFrame:
     """Validate and read untracked VIA JSON file.
 
     The data is formatted as an untracked annotations DataFrame.
+
+    Parameters
+    ----------
+    file_path : Path
+        Path to the untracked VIA JSON file.
+
+    Returns
+    -------
+    pd.DataFrame
+        Untracked annotations DataFrame.
+
     """
-    # General file validation
+    # Run validators
     file = ValidFile(
         file_path, expected_permission="r", expected_suffix=[".json"]
     )
-
-    # JSON file validation
     json_file = ValidJSON(file.path)
-
-    # VIA Untracked JSON schema validation
     via_untracked_file = ValidVIAUntrackedJSON(json_file.path)
 
     # Read as standard dataframe
@@ -46,16 +53,26 @@ def df_from_via_json_file(file_path: Path) -> pd.DataFrame:
 
 
 def df_from_coco_json_file(file_path: Path) -> pd.DataFrame:
-    """Validate and read COCO JSON file."""
-    # General file validation
+    """Validate and read untracked COCO JSON file.
+
+    The data is formatted as an untracked annotations DataFrame.
+
+    Parameters
+    ----------
+    file_path : Path
+        Path to the untracked COCO JSON file.
+
+    Returns
+    -------
+    pd.DataFrame
+        Untracked annotations DataFrame.
+
+    """
+    # Run validators
     file = ValidFile(
         file_path, expected_permission="r", expected_suffix=[".json"]
     )
-
-    # JSON file validation
     json_file = ValidJSON(file.path)
-
-    # COCO Untracked JSON schema validation
     coco_untracked_file = ValidCOCOUntrackedJSON(json_file.path)
 
     # Read as standard dataframe

From 23d5259bd9287807f6f169a805dd951ceba57955 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 12 Dec 2024 10:17:10 +0000
Subject: [PATCH 13/36] Delete utils

---
 ethology/annotations/utils.py | 72 -----------------------------------
 1 file changed, 72 deletions(-)
 delete mode 100644 ethology/annotations/utils.py

diff --git a/ethology/annotations/utils.py b/ethology/annotations/utils.py
deleted file mode 100644
index 925489a..0000000
--- a/ethology/annotations/utils.py
+++ /dev/null
@@ -1,72 +0,0 @@
-"""Utility functions to work with annotations in JSON format."""
-
-import json
-from pathlib import Path
-
-
-def read_json_file_as_dict(
-    file_path: Path,
-) -> dict:
-    """Read JSON file as dict.
-
-    Parameters
-    ----------
-    file_path : str
-        Path to the JSON file
-
-    Returns
-    -------
-    dict
-        Dictionary with the JSON data
-
-    """
-    try:
-        with open(file_path) as file:
-            return json.load(file)
-    except FileNotFoundError as not_found_error:
-        msg = f"File not found: {file_path}"
-        raise ValueError(msg) from not_found_error
-    except json.JSONDecodeError as decode_error:
-        msg = f"Error decoding JSON data from file: {file_path}"
-        raise ValueError(msg) from decode_error
-
-
-def read_via_json_file_as_dict(file_path: Path) -> dict:
-    """Read VIA JSON file as dict.
-
-    Parameters
-    ----------
-    file_path : str
-        Path to the VIA JSON file
-
-    Returns
-    -------
-    dict
-        Dictionary with the JSON data
-
-    """
-    # Read data
-    data = read_json_file_as_dict(file_path)
-
-    # Check the expected keys are defined in the JSON file
-    expected_keys = [
-        "_via_settings",
-        "_via_img_metadata",
-        "_via_attributes",
-        "_via_data_format_version",
-        "_via_image_id_list",
-    ]
-
-    for ky in expected_keys:
-        if ky not in data:
-            raise ValueError(
-                f"Expected key '{ky}' not found in file: {file_path}"
-            )
-
-    return data
-
-
-# def read_via_json_file_as_xarray(file_path: Path):
-
-
-#     via_dict = read_via_json_file_as_dict(file_path)

From 9fbc4f70b4e81fea1e547d26ebd980962dd9c256 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 12 Dec 2024 10:36:04 +0000
Subject: [PATCH 14/36] Add module for fixtures

---
 .pre-commit-config.yaml       | 1 +
 tests/conftest.py             | 5 +++++
 tests/fixtures/__init__.py    | 0
 tests/fixtures/annotations.py | 1 +
 4 files changed, 7 insertions(+)
 create mode 100644 tests/conftest.py
 create mode 100644 tests/fixtures/__init__.py
 create mode 100644 tests/fixtures/annotations.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8c8a2be..99263f3 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -19,6 +19,7 @@ repos:
           args: [--fix=lf]
         - id: name-tests-test
           args: ["--pytest-test-first"]
+          exclude: ^tests/fixtures
         - id: requirements-txt-fixer
         - id: trailing-whitespace
   - repo: https://github.com/pre-commit/pygrep-hooks
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..9b6026a
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,5 @@
+"""Pytest configuration file."""
+
+pytest_plugins = [
+    "tests.fixtures.annotations",
+]
diff --git a/tests/fixtures/__init__.py b/tests/fixtures/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/fixtures/annotations.py b/tests/fixtures/annotations.py
new file mode 100644
index 0000000..11894a8
--- /dev/null
+++ b/tests/fixtures/annotations.py
@@ -0,0 +1 @@
+"""Pytest fixtures for annotation tests."""

From d3809e0dd2465e8fcc7d41261e1e2f93ca9a5b2a Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 12 Dec 2024 12:59:22 +0000
Subject: [PATCH 15/36] Change JSON error to FileNotFound

---
 ethology/annotations/validators.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py
index 20b1317..633c341 100644
--- a/ethology/annotations/validators.py
+++ b/ethology/annotations/validators.py
@@ -17,12 +17,6 @@ class ValidJSON:
     path : pathlib.Path
         Path to the JSON file.
 
-    Raises
-    ------
-    ValueError
-        If the file is not in JSON format or if it does not contain the
-        expected keys.
-
     """
 
     path: Path = field(validator=validators.instance_of(Path))
@@ -34,7 +28,9 @@ def _file_is_json(self, attribute, value):
             with open(value) as file:
                 json.load(file)
         except FileNotFoundError as not_found_error:
-            raise ValueError(f"File not found: {value}") from not_found_error
+            raise FileNotFoundError(
+                f"File not found: {value}"
+            ) from not_found_error
         except json.JSONDecodeError as decode_error:
             raise ValueError(
                 f"Error decoding JSON data from file: {value}"

From 5866d106057abcde43ab9b834e94e1cdd3575bba Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 12 Dec 2024 12:59:44 +0000
Subject: [PATCH 16/36] Add shared fixtures across all tests to conftest

---
 tests/conftest.py | 90 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 89 insertions(+), 1 deletion(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 9b6026a..13e4c9d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,5 +1,93 @@
-"""Pytest configuration file."""
+"""Pytest configuration file with shared fixtures across all tests."""
+
+from pathlib import Path
+
+import pooch
+import pytest
+
+GIN_TEST_DATA_REPO = (
+    "https://gin.g-node.org/neuroinformatics/ethology-test-data"
+)
 
 pytest_plugins = [
     "tests.fixtures.annotations",
 ]
+
+
+@pytest.fixture(scope="session")
+def pooch_registry() -> pooch.Pooch:
+    """Pooch registry for the test data.
+
+    This fixture is common to the entire test session. The
+    file registry is downloaded fresh for every test session.
+
+    Returns
+    -------
+    pooch.Pooch
+        Registry with the base URL and file hashes of the GIN test data
+
+    """
+    # Cache the test data in the user's home directory
+    test_data_dir = Path.home() / ".ethology-test-data"
+
+    # Initialise pooch registry
+    registry = pooch.create(
+        test_data_dir,
+        base_url=f"{GIN_TEST_DATA_REPO}/raw/master/test_data",
+    )
+
+    # Download only the registry file from GIN
+    # if known_hash = None, the file is always downloaded.
+    file_registry = pooch.retrieve(
+        url=f"{GIN_TEST_DATA_REPO}/raw/master/files-registry.txt",
+        known_hash=None,
+        fname="files-registry.txt",
+        path=test_data_dir,
+    )
+
+    # Load registry file onto pooch registry
+    registry.load_registry(file_registry)
+
+    return registry
+
+
+@pytest.fixture()
+def get_paths_test_data():
+    """Define a factory fixture to get the paths of the data files
+    under a specific zip.
+
+    The name of the zip file is intended to match a testing module. For
+    example, to get the paths to the test files for the annotations
+    tests module, we would call `get_paths_test_data(pooch_registry,
+    "test_annotations")` in a test. This assumes in the GIN repository
+    there is a zip file named `test_annotations.zip` under the `test_data`
+    directory containing the relevant test files.
+    """
+
+    def _get_paths_test_data(pooch_registry, zip_filename: str) -> dict:
+        """Return the paths of the test files under the specified zip filename.
+
+        The zip filename is expected to match a testing module.
+        """
+        # Fetch the test data for the annotations module
+        list_files_in_local_storage = pooch_registry.fetch(
+            f"{zip_filename}.zip",
+            processor=pooch.Unzip(extract_dir=""),
+            progressbar=True,
+        )
+
+        # Filter out files not under `test_annotations` directory
+        list_files_annotations = [
+            f
+            for f in list_files_in_local_storage
+            if (zip_filename in f) and (not f.endswith(".zip"))
+        ]
+
+        # return paths as dict
+        input_data_dict = {}
+        for f in list_files_annotations:
+            input_data_dict[Path(f).name] = Path(f)
+
+        return input_data_dict
+
+    return _get_paths_test_data

From 8504a59db17695acfa06b06cc20a38348445876d Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 12 Dec 2024 12:59:58 +0000
Subject: [PATCH 17/36] Add annotations_test_data fixture to its module

---
 tests/fixtures/annotations.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tests/fixtures/annotations.py b/tests/fixtures/annotations.py
index 11894a8..dced3eb 100644
--- a/tests/fixtures/annotations.py
+++ b/tests/fixtures/annotations.py
@@ -1 +1,8 @@
-"""Pytest fixtures for annotation tests."""
+"""Pytest fixtures shared across annotation tests."""
+
+import pytest
+
+
+@pytest.fixture()
+def annotations_test_data(pooch_registry, get_paths_test_data):
+    return get_paths_test_data(pooch_registry, "test_annotations")

From a22bfc47330d1babd6ee0d0a9b7d1c1fedb40686 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 12 Dec 2024 13:00:34 +0000
Subject: [PATCH 18/36] Add test for JSON file validator

---
 tests/test_unit/test_annotations/__init__.py  |  0
 .../test_annotations/test_validators.py       | 47 +++++++++++++++++++
 tests/test_unit/test_placeholder.py           |  2 -
 3 files changed, 47 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_unit/test_annotations/__init__.py
 create mode 100644 tests/test_unit/test_annotations/test_validators.py
 delete mode 100644 tests/test_unit/test_placeholder.py

diff --git a/tests/test_unit/test_annotations/__init__.py b/tests/test_unit/test_annotations/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_unit/test_annotations/test_validators.py b/tests/test_unit/test_annotations/test_validators.py
new file mode 100644
index 0000000..1341459
--- /dev/null
+++ b/tests/test_unit/test_annotations/test_validators.py
@@ -0,0 +1,47 @@
+# import json
+from contextlib import nullcontext as does_not_raise
+
+# import pooch
+import pytest
+
+from ethology.annotations.validators import (
+    ValidJSON,
+)
+
+
+@pytest.mark.parametrize(
+    "valid_json_file",
+    [
+        "VIA_JSON_sample_1.json",
+        "VIA_JSON_sample_2.json",
+    ],
+)
+def test_valid_json(valid_json_file, annotations_test_data):
+    """Test the ValidJSON validator on valid data."""
+    input_json_file = annotations_test_data[valid_json_file]
+    with does_not_raise():
+        ValidJSON(input_json_file)
+
+
+# @pytest.mark.parametrize(
+#     "invalid_json_file, expected_exception, log_message",
+#     [
+#         (
+#             "invalid_VIA_JSON_sample_1.json",
+#             FileNotFoundError,
+#             "File not found: invalid_VIA_JSON_sample_1.json.",
+#         ),
+#         (
+#             "invalid_VIA_JSON_sample_2.json",
+#             ValueError,
+#             "Error decoding JSON data from file: invalid_VIA_JSON_sample_2.",
+#         ),
+#     ],
+# )
+# def test_valid_json_errors(invalid_json_file,
+# expected_exception, log_message):
+#     """Test the ValidJSON validator on invalid data."""
+#     with pytest.raises(expected_exception) as excinfo:
+#         ValidJSON(invalid_json_file)
+
+#     assert str(excinfo.value) == log_message
diff --git a/tests/test_unit/test_placeholder.py b/tests/test_unit/test_placeholder.py
deleted file mode 100644
index 3ada1ee..0000000
--- a/tests/test_unit/test_placeholder.py
+++ /dev/null
@@ -1,2 +0,0 @@
-def test_placeholder():
-    assert True

From 6edbfc452cf6e7e46ddffcf8d4a73691d04fa9f0 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 12 Dec 2024 14:05:52 +0000
Subject: [PATCH 19/36] Fix fresh download of files-registry

---
 tests/conftest.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 13e4c9d..f759045 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -30,6 +30,12 @@ def pooch_registry() -> dict:
     # Cache the test data in the user's home directory
     test_data_dir = Path.home() / ".ethology-test-data"
 
+    # Remove the file registry if it exists
+    # otherwise it is not downloaded from scratch every time
+    file_registry_path = test_data_dir / "files-registry.txt"
+    if file_registry_path.is_file():
+        Path(file_registry_path).unlink()
+
     # Initialise pooch registry
     registry = pooch.create(
         test_data_dir,
@@ -37,12 +43,11 @@ def pooch_registry() -> dict:
     )
 
     # Download only the registry file from GIN
-    # if known_hash = None, the file is always downloaded.
     file_registry = pooch.retrieve(
         url=f"{GIN_TEST_DATA_REPO}/raw/master/files-registry.txt",
         known_hash=None,
-        fname="files-registry.txt",
-        path=test_data_dir,
+        fname=file_registry_path.name,
+        path=file_registry_path.parent,
     )
 
     # Load registry file onto pooch registry

From 9976715a0416d250ddd53a3424d7ce2fb2a5bbc7 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 12 Dec 2024 14:24:03 +0000
Subject: [PATCH 20/36] Fix tests to work with unzipped subdirectories

---
 tests/conftest.py                             | 38 ++++++++-----------
 .../test_annotations/test_validators.py       | 27 ++++++++-----
 2 files changed, 34 insertions(+), 31 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index f759045..37d1041 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -69,30 +69,24 @@ def get_paths_test_data():
     directory containing the relevant test files.
     """
 
-    def _get_paths_test_data(pooch_registry, zip_filename: str) -> dict:
+    def _get_paths_test_data(pooch_registry, subdir_name: str) -> dict:
         """Return the paths of the test files under the specified zip filename.
 
-        The zip filename is expected to match a testing module.
+        subdir_name is the name of the subdirectory under `test_data`.
         """
-        # Fetch the test data for the annotations module
-        list_files_in_local_storage = pooch_registry.fetch(
-            f"{zip_filename}.zip",
-            processor=pooch.Unzip(extract_dir=""),
-            progressbar=True,
-        )
-
-        # Filter out files not under `test_annotations` directory
-        list_files_annotations = [
-            f
-            for f in list_files_in_local_storage
-            if (zip_filename in f) and (not f.endswith(".zip"))
-        ]
-
-        # return paths as dict
-        input_data_dict = {}
-        for f in list_files_annotations:
-            input_data_dict[Path(f).name] = Path(f)
-
-        return input_data_dict
+        test_filename_to_path = {}
+        for relative_filepath in pooch_registry.registry:
+            # relative to test_data
+            if relative_filepath.startswith(f"{subdir_name}/"):
+                # fetch file from pooch registry
+                fetched_filepath = pooch_registry.fetch(
+                    relative_filepath,  # under test_data
+                    progressbar=True,
+                )
+
+                test_filename_to_path[Path(fetched_filepath).name] = Path(
+                    fetched_filepath
+                )
+        return test_filename_to_path
 
     return _get_paths_test_data
diff --git a/tests/test_unit/test_annotations/test_validators.py b/tests/test_unit/test_annotations/test_validators.py
index 1341459..a0612df 100644
--- a/tests/test_unit/test_annotations/test_validators.py
+++ b/tests/test_unit/test_annotations/test_validators.py
@@ -4,24 +4,33 @@
 # import pooch
 import pytest
 
-from ethology.annotations.validators import (
-    ValidJSON,
-)
+from ethology.annotations.validators import ValidJSON
+
+# @pytest.fixture()
+# def
 
 
 @pytest.mark.parametrize(
-    "valid_json_file",
+    "input_json_file, expected_exception, log_message",
     [
-        "VIA_JSON_sample_1.json",
-        "VIA_JSON_sample_2.json",
+        ("VIA_JSON_sample_1.json", does_not_raise(), ""),
+        ("VIA_JSON_sample_2.json", does_not_raise(), ""),
     ],
 )
-def test_valid_json(valid_json_file, annotations_test_data):
+def test_valid_json(
+    annotations_test_data,
+    input_json_file,
+    expected_exception,
+    log_message,
+):
     """Test the ValidJSON validator on valid data."""
-    input_json_file = annotations_test_data[valid_json_file]
-    with does_not_raise():
+    input_json_file = annotations_test_data[input_json_file]
+    with expected_exception as excinfo:
         ValidJSON(input_json_file)
 
+    if log_message:
+        assert str(excinfo.value) == log_message
+
 
 # @pytest.mark.parametrize(
 #     "invalid_json_file, expected_exception, log_message",

From a4ffb4411cd39dfa407f1222fd67053feb49c266 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 16 Dec 2024 11:29:25 +0000
Subject: [PATCH 21/36] Add test for JSON validator

---
 ethology/annotations/validators.py            |  4 +-
 .../test_annotations/test_validators.py       | 55 ++++++++++++++++---
 2 files changed, 50 insertions(+), 9 deletions(-)

diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py
index 633c341..ebb0d58 100644
--- a/ethology/annotations/validators.py
+++ b/ethology/annotations/validators.py
@@ -29,11 +29,11 @@ def _file_is_json(self, attribute, value):
                 json.load(file)
         except FileNotFoundError as not_found_error:
             raise FileNotFoundError(
-                f"File not found: {value}"
+                f"File not found: {value}."
             ) from not_found_error
         except json.JSONDecodeError as decode_error:
             raise ValueError(
-                f"Error decoding JSON data from file: {value}"
+                f"Error decoding JSON data from file: {value}."
             ) from decode_error
 
 
diff --git a/tests/test_unit/test_annotations/test_validators.py b/tests/test_unit/test_annotations/test_validators.py
index a0612df..b71bdd5 100644
--- a/tests/test_unit/test_annotations/test_validators.py
+++ b/tests/test_unit/test_annotations/test_validators.py
@@ -6,15 +6,55 @@
 
 from ethology.annotations.validators import ValidJSON
 
-# @pytest.fixture()
-# def
+
+@pytest.fixture()
+def via_json_valid_file(annotations_test_data):
+    return annotations_test_data["VIA_JSON_sample_1.json"]
+
+
+@pytest.fixture()
+def coco_json_valid_file(annotations_test_data):
+    return annotations_test_data["COCO_JSON_sample_1.json"]
+
+
+@pytest.fixture()
+def json_with_decode_error(tmp_path):
+    """Return the path to a JSON file with a decoding error."""
+    json_file = tmp_path / "JSON_decode_error.json"
+    with open(json_file, "w") as f:
+        f.write("invalid_json")
+    return json_file
+
+
+@pytest.fixture()
+def json_file_not_found(tmp_path):
+    """Return the path to a JSON file that does not exist."""
+    return tmp_path / "JSON_file_not_found.json"
 
 
 @pytest.mark.parametrize(
     "input_json_file, expected_exception, log_message",
     [
-        ("VIA_JSON_sample_1.json", does_not_raise(), ""),
-        ("VIA_JSON_sample_2.json", does_not_raise(), ""),
+        (
+            "via_json_valid_file",
+            does_not_raise(),
+            "",
+        ),
+        (
+            "coco_json_valid_file",
+            does_not_raise(),
+            "",
+        ),
+        (
+            "json_with_decode_error",
+            pytest.raises(ValueError),
+            "Error decoding JSON data from file: {}.",
+        ),
+        (
+            "json_file_not_found",
+            pytest.raises(FileNotFoundError),
+            "File not found: {}.",
+        ),
     ],
 )
 def test_valid_json(
@@ -22,14 +62,15 @@ def test_valid_json(
     input_json_file,
     expected_exception,
     log_message,
+    request,
 ):
-    """Test the ValidJSON validator on valid data."""
-    input_json_file = annotations_test_data[input_json_file]
+    """Test the ValidJSON validator."""
+    input_json_file = request.getfixturevalue(input_json_file)
     with expected_exception as excinfo:
         ValidJSON(input_json_file)
 
     if log_message:
-        assert str(excinfo.value) == log_message
+        assert str(excinfo.value) == log_message.format(input_json_file)
 
 
 # @pytest.mark.parametrize(

From b901658a80ccc4ac859058964ec724c5a5df6146 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 16 Dec 2024 12:55:30 +0000
Subject: [PATCH 22/36] Factor out schemas and include schema validation in
 ValidJSON

---
 ethology/annotations/io.py                    |   8 +-
 ethology/annotations/json_schemas.py          | 118 ++++++++++++
 ethology/annotations/validators.py            | 142 +++++---------
 .../test_annotations/test_validators.py       | 177 +++++++++++++-----
 4 files changed, 292 insertions(+), 153 deletions(-)
 create mode 100644 ethology/annotations/json_schemas.py

diff --git a/ethology/annotations/io.py b/ethology/annotations/io.py
index 129f241..e62d9f0 100644
--- a/ethology/annotations/io.py
+++ b/ethology/annotations/io.py
@@ -6,6 +6,10 @@
 import pandas as pd
 from movement.validators.files import ValidFile
 
+from ethology.annotations.json_schemas import (
+    COCO_UNTRACKED_SCHEMA,
+    VIA_UNTRACKED_SCHEMA,
+)
 from ethology.annotations.validators import (
     ValidCOCOUntrackedJSON,
     ValidJSON,
@@ -45,7 +49,7 @@ def df_from_via_json_file(file_path: Path) -> pd.DataFrame:
     file = ValidFile(
         file_path, expected_permission="r", expected_suffix=[".json"]
     )
-    json_file = ValidJSON(file.path)
+    json_file = ValidJSON(path=file.path, schema=VIA_UNTRACKED_SCHEMA)
     via_untracked_file = ValidVIAUntrackedJSON(json_file.path)
 
     # Read as standard dataframe
@@ -72,7 +76,7 @@ def df_from_coco_json_file(file_path: Path) -> pd.DataFrame:
     file = ValidFile(
         file_path, expected_permission="r", expected_suffix=[".json"]
     )
-    json_file = ValidJSON(file.path)
+    json_file = ValidJSON(path=file.path, schema=COCO_UNTRACKED_SCHEMA)
     coco_untracked_file = ValidCOCOUntrackedJSON(json_file.path)
 
     # Read as standard dataframe
diff --git a/ethology/annotations/json_schemas.py b/ethology/annotations/json_schemas.py
new file mode 100644
index 0000000..7e9d265
--- /dev/null
+++ b/ethology/annotations/json_schemas.py
@@ -0,0 +1,118 @@
+"""JSON schemas for VIA and COCO annotations."""
+
+VIA_UNTRACKED_SCHEMA = {
+    "type": "object",
+    "properties": {
+        # settings for browser UI
+        "_via_settings": {
+            "type": "object",
+            "properties": {
+                "ui": {"type": "object"},
+                "core": {"type": "object"},
+                "project": {"type": "object"},
+            },
+        },
+        # annotation data
+        "_via_img_metadata": {
+            "type": "object",
+            "additionalProperties": {
+                # "additionalProperties" to allow any key,
+                # see https://stackoverflow.com/a/69811612/24834957
+                "type": "object",
+                "properties": {
+                    "filename": {"type": "string"},
+                    "size": {"type": "integer"},
+                    "regions": {
+                        "type": "array",  # a list of dicts
+                        "items": {
+                            "type": "object",
+                            "properties": {
+                                "shape_attributes": {
+                                    "type": "object",
+                                    "properties": {
+                                        "name": {"type": "string"},
+                                        "x": {"type": "integer"},
+                                        "y": {"type": "integer"},
+                                        "width": {"type": "integer"},
+                                        "height": {"type": "integer"},
+                                    },
+                                },
+                                # sibling of "shape_attributes";
+                                # we just check it's a dict
+                                "region_attributes": {"type": "object"},
+                            },
+                        },
+                    },
+                    "file_attributes": {"type": "object"},
+                },
+            },
+        },
+        # ordered list of image keys
+        # - the position defines the image ID
+        "_via_image_id_list": {
+            "type": "array",
+            "items": {"type": "string"},
+        },
+        # region (aka annotation) and file attributes for VIA UI
+        "_via_attributes": {
+            "type": "object",
+            "properties": {
+                "region": {"type": "object"},
+                "file": {"type": "object"},
+            },
+        },
+        # version of the VIA data format
+        "_via_data_format_version": {"type": "string"},
+    },
+}
+
+
+COCO_UNTRACKED_SCHEMA = {
+    "type": "object",
+    "properties": {
+        "info": {"type": "object"},
+        "licenses": {
+            "type": "array",
+        },
+        "images": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "file_name": {"type": "string"},
+                    "id": {"type": "integer"},
+                    "width": {"type": "integer"},
+                    "height": {"type": "integer"},
+                },
+            },
+        },
+        "annotations": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "id": {"type": "integer"},  # annotation global ID
+                    "image_id": {"type": "integer"},
+                    "bbox": {
+                        "type": "array",
+                        "items": {"type": "number"},  # COCO allows floats
+                    },
+                    "category_id": {"type": "integer"},
+                    "area": {"type": "number"},  # float in the COCO format
+                    "iscrowd": {"type": "integer"},
+                },
+            },
+        },
+        "categories": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "id": {"type": "integer"},
+                    "name": {"type": "string"},
+                    "supercategory": {"type": "string"},
+                },
+            },
+        },
+    },
+}
diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py
index ebb0d58..b328d97 100644
--- a/ethology/annotations/validators.py
+++ b/ethology/annotations/validators.py
@@ -17,9 +17,28 @@ class ValidJSON:
     path : pathlib.Path
         Path to the JSON file.
 
+    schema : dict
+        JSON schema to validate the file against.
+
+    Raises
+    ------
+    FileNotFoundError
+        If the file does not exist.
+    ValueError
+        If the JSON file cannot be decoded, or
+        if the type of any of its keys does not match those
+        specified in the schema.
+
+
+    Notes
+    -----
+    https://json-schema.org/understanding-json-schema/
+
     """
 
+    # Required attributes
     path: Path = field(validator=validators.instance_of(Path))
+    schema: dict = field()
 
     @path.validator
     def _file_is_json(self, attribute, value):
@@ -36,122 +55,45 @@ def _file_is_json(self, attribute, value):
                 f"Error decoding JSON data from file: {value}."
             ) from decode_error
 
+    @path.validator
+    def _file_matches_JSON_schema(self, attribute, value):
+        """Ensure that the JSON file matches the expected schema.
+
+        The schema validation only checks the type for each specified
+        key if it exists. It does not check for the presence of the keys.
+        """
+        # read json file
+        with open(value) as file:
+            data = json.load(file)
+
+        # check against schema
+        try:
+            jsonschema.validate(instance=data, schema=self.schema)
+        except jsonschema.exceptions.ValidationError as val_err:
+            raise ValueError(
+                "The JSON data does not match "
+                f"the provided schema: {self.schema}."
+            ) from val_err
+
 
 @define
 class ValidVIAUntrackedJSON:
     """Class for validating VIA JSON files for untracked data.
 
-    The validator ensures that the file matches the expected schema.
-    The schema validation only checks the type for each specified
-    key if it exists. It does not check for the presence of the keys.
+    Checks that the VIA JSON file for untracked data has the required keys.
 
+    Note that the validation against the schema does not check the existence
+    of the keys, only the type of their values if they exist.
 
     Attributes
     ----------
     path : pathlib.Path
         Path to the JSON file.
 
-    Raises
-    ------
-    ValueError
-        If the JSON file does not match the expected schema.
-
-    Notes
-    -----
-    https://json-schema.org/understanding-json-schema/
-
     """
 
-    # TODO: add a check for the presence of the keys
-    # that I use in loading the data
-
     path: Path = field(validator=validators.instance_of(Path))
 
-    @path.validator
-    def _file_macthes_VIA_JSON_schema(self, attribute, value):
-        """Ensure that the JSON file matches the expected schema."""
-        # Define schema for VIA JSON file for untracked
-        # (aka manually labelled) data
-        VIA_JSON_schema = {
-            "type": "object",
-            "properties": {
-                # settings for browser UI
-                "_via_settings": {
-                    "type": "object",
-                    "properties": {
-                        "ui": {"type": "object"},
-                        "core": {"type": "object"},
-                        "project": {"type": "object"},
-                    },
-                },
-                # annotation data
-                "_via_img_metadata": {
-                    "type": "object",
-                    "additionalProperties": {
-                        # "additionalProperties" to allow any key,
-                        # see https://stackoverflow.com/a/69811612/24834957
-                        "type": "object",
-                        "properties": {
-                            "filename": {"type": "string"},
-                            "size": {"type": "integer"},
-                            "regions": {
-                                "type": "array",  # a list of dicts
-                                "items": {
-                                    "type": "object",
-                                    "properties": {
-                                        "shape_attributes": {
-                                            "type": "object",
-                                            "properties": {
-                                                "name": {"type": "string"},
-                                                "x": {"type": "integer"},
-                                                "y": {"type": "integer"},
-                                                "width": {"type": "integer"},
-                                                "height": {"type": "integer"},
-                                            },
-                                            "region_attributes": {
-                                                "type": "object"
-                                            },  # we just check it's a dict
-                                        },
-                                    },
-                                },
-                            },
-                            "file_attributes": {"type": "object"},
-                        },
-                    },
-                },
-                # ordered list of image keys
-                # - the position defines the image ID
-                "_via_image_id_list": {
-                    "type": "array",
-                    "items": {"type": "string"},
-                },
-                # region (aka annotation) and file attributes for VIA UI
-                "_via_attributes": {
-                    "type": "object",
-                    "properties": {
-                        "region": {"type": "object"},
-                        "file": {"type": "object"},
-                    },
-                },
-                # version of the VIA data format
-                "_via_data_format_version": {"type": "string"},
-            },
-        }
-
-        # should have been validated with ValidJSON
-        # already so this should work fine
-        with open(value) as file:
-            data = json.load(file)
-
-        # check against schema
-        try:
-            jsonschema.validate(instance=data, schema=VIA_JSON_schema)
-        except jsonschema.exceptions.ValidationError as val_err:
-            raise ValueError(
-                "The JSON data does not match "
-                f"the provided schema: {VIA_JSON_schema}"
-            ) from val_err
-
     @path.validator
     def _file_contains_required_keys(self, attribute, value):
         """Ensure that the JSON file contains the required keys."""
diff --git a/tests/test_unit/test_annotations/test_validators.py b/tests/test_unit/test_annotations/test_validators.py
index b71bdd5..4c9fd34 100644
--- a/tests/test_unit/test_annotations/test_validators.py
+++ b/tests/test_unit/test_annotations/test_validators.py
@@ -1,97 +1,172 @@
-# import json
+import json
 from contextlib import nullcontext as does_not_raise
 
-# import pooch
 import pytest
 
+from ethology.annotations.json_schemas import (
+    COCO_UNTRACKED_SCHEMA,
+    VIA_UNTRACKED_SCHEMA,
+)
 from ethology.annotations.validators import ValidJSON
 
 
 @pytest.fixture()
-def via_json_valid_file(annotations_test_data):
-    return annotations_test_data["VIA_JSON_sample_1.json"]
+def json_file_with_decode_error(tmp_path):
+    """Return the path to a JSON file with a decoding error."""
+    json_file = tmp_path / "JSON_decode_error.json"
+    with open(json_file, "w") as f:
+        f.write("just-a-string")
+    return json_file
 
 
 @pytest.fixture()
-def coco_json_valid_file(annotations_test_data):
-    return annotations_test_data["COCO_JSON_sample_1.json"]
+def json_file_with_not_found_error(tmp_path):
+    """Return the path to a JSON file that does not exist."""
+    return tmp_path / "JSON_file_not_found.json"
 
 
 @pytest.fixture()
-def json_with_decode_error(tmp_path):
-    """Return the path to a JSON file with a decoding error."""
-    json_file = tmp_path / "JSON_decode_error.json"
-    with open(json_file, "w") as f:
-        f.write("invalid_json")
-    return json_file
+def via_json_file_with_schema_error(tmp_path, annotations_test_data):
+    """Return path to a JSON file that doesn't match the expected schema."""
+    # read valid json file
+    via_json_valid_filepath = annotations_test_data["VIA_JSON_sample_1.json"]
+    with open(via_json_valid_filepath) as f:
+        data = json.load(f)
+
+    # change type of specific keys
+    # - change "_via_image_id_list" from list of strings to list of integers
+    # TODO: what if I change several?
+    data["_via_image_id_list"] = list(range(len(data["_via_image_id_list"])))
+
+    # save the modified data to a new file under tmp_path
+    out_json = tmp_path / "VIA_JSON_schema_error.json"
+    with open(out_json, "w") as f:
+        json.dump(data, f)
+    return out_json
 
 
 @pytest.fixture()
-def json_file_not_found(tmp_path):
-    """Return the path to a JSON file that does not exist."""
-    return tmp_path / "JSON_file_not_found.json"
+def coco_json_file_with_schema_error(
+    tmp_path,
+    annotations_test_data,
+):
+    """Return path to a JSON file that doesn't match the expected schema."""
+    # read valid json file
+    via_json_valid_filepath = annotations_test_data["COCO_JSON_sample_1.json"]
+    with open(via_json_valid_filepath) as f:
+        data = json.load(f)
+
+    # change "annotations" from list of dicts to list of lists
+    # TODO: what if I change several?
+    data["annotations"] = [[d] for d in data["annotations"]]
+
+    # save the modified data to a new file under tmp_path
+    out_json = tmp_path / "VIA_JSON_schema_error.json"
+    with open(out_json, "w") as f:
+        json.dump(data, f)
+    return out_json
+
+
+@pytest.mark.parametrize(
+    "input_json_file, input_schema",
+    [
+        ("VIA_JSON_sample_1.json", VIA_UNTRACKED_SCHEMA),
+        ("VIA_JSON_sample_2.json", VIA_UNTRACKED_SCHEMA),
+        ("COCO_JSON_sample_1.json", COCO_UNTRACKED_SCHEMA),
+        ("COCO_JSON_sample_2.json", COCO_UNTRACKED_SCHEMA),
+    ],
+)
+def test_valid_json(
+    annotations_test_data,
+    input_json_file,
+    input_schema,
+):
+    """Test the ValidJSON validator with valid files."""
+    input_json_file = annotations_test_data[input_json_file]
+    with does_not_raise():
+        ValidJSON(path=input_json_file, schema=input_schema)
 
 
 @pytest.mark.parametrize(
-    "input_json_file, expected_exception, log_message",
+    "invalid_json_file_str, input_schema, expected_exception, log_message",
     [
         (
-            "via_json_valid_file",
-            does_not_raise(),
-            "",
+            "json_file_with_decode_error",
+            VIA_UNTRACKED_SCHEMA,  # should be independent of schema
+            pytest.raises(ValueError),
+            "Error decoding JSON data from file: {}.",
         ),
         (
-            "coco_json_valid_file",
-            does_not_raise(),
-            "",
+            "json_file_with_not_found_error",
+            VIA_UNTRACKED_SCHEMA,  # should be independent of schema
+            pytest.raises(FileNotFoundError),
+            "File not found: {}.",
         ),
         (
-            "json_with_decode_error",
+            "via_json_file_with_schema_error",
+            VIA_UNTRACKED_SCHEMA,
             pytest.raises(ValueError),
-            "Error decoding JSON data from file: {}.",
+            "The JSON data does not match the provided schema: {}.",
         ),
         (
-            "json_file_not_found",
-            pytest.raises(FileNotFoundError),
-            "File not found: {}.",
+            "coco_json_file_with_schema_error",
+            COCO_UNTRACKED_SCHEMA,
+            pytest.raises(ValueError),
+            "The JSON data does not match the provided schema: {}.",
         ),
     ],
 )
-def test_valid_json(
-    annotations_test_data,
-    input_json_file,
+def test_valid_json_error(
+    invalid_json_file_str,
+    input_schema,
     expected_exception,
     log_message,
     request,
 ):
-    """Test the ValidJSON validator."""
-    input_json_file = request.getfixturevalue(input_json_file)
+    """Test the ValidJSON validator throws the expected error."""
+    invalid_json_file = request.getfixturevalue(invalid_json_file_str)
+
     with expected_exception as excinfo:
-        ValidJSON(input_json_file)
+        ValidJSON(path=invalid_json_file, schema=input_schema)
 
-    if log_message:
-        assert str(excinfo.value) == log_message.format(input_json_file)
+    if "schema" in invalid_json_file_str:
+        assert str(excinfo.value) == log_message.format(input_schema)
+    else:
+        assert str(excinfo.value) == log_message.format(invalid_json_file)
 
 
 # @pytest.mark.parametrize(
-#     "invalid_json_file, expected_exception, log_message",
+#     "valid_json_file, input_schema",
+#     [
+#         ("VIA_JSON_sample_1.json", VIA_UNTRACKED_SCHEMA),
+#         ("COCO_JSON_sample_1.json", COCO_UNTRACKED_SCHEMA),
+#     ],
+# )
+# @pytest.mark.parametrize(
+#     "invalid_json_factory, expected_exception, log_message",
 #     [
 #         (
-#             "invalid_VIA_JSON_sample_1.json",
-#             FileNotFoundError,
-#             "File not found: invalid_VIA_JSON_sample_1.json.",
-#         ),
-#         (
-#             "invalid_VIA_JSON_sample_2.json",
-#             ValueError,
-#             "Error decoding JSON data from file: invalid_VIA_JSON_sample_2.",
+#             "get_json_file_with_schema_error",
+#             pytest.raises(ValueError),
+#             "The JSON data does not match the provided schema: {}.",
 #         ),
 #     ],
 # )
-# def test_valid_json_errors(invalid_json_file,
-# expected_exception, log_message):
-#     """Test the ValidJSON validator on invalid data."""
-#     with pytest.raises(expected_exception) as excinfo:
-#         ValidJSON(invalid_json_file)
-
-#     assert str(excinfo.value) == log_message
+# def test_valid_json_schema_error(
+#     valid_json_file,
+#     input_schema,
+#     invalid_json_factory,
+#     expected_exception,
+#     log_message,
+#     tmp_path,
+#     request,
+# ):
+#     """Test the ValidJSON validator throws the expected error."""
+#     invalid_json_factory = request.getfixturevalue(invalid_json_factory)
+#     invalid_json_file = invalid_json_factory(valid_json_file)
+
+#     with expected_exception as excinfo:
+#         ValidJSON(path=invalid_json_file, schema=input_schema)
+
+#     if log_message:
+#         assert str(excinfo.value) == log_message.format(input_schema)

From 05696a7e4d0a6fd451c40584b6a400f2112cf6de Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 16 Dec 2024 13:14:40 +0000
Subject: [PATCH 23/36] Make schema optional

---
 ethology/annotations/validators.py            | 21 +++++++++++--------
 .../test_annotations/test_validators.py       | 20 ++++++++++++------
 2 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py
index b328d97..1a0d22f 100644
--- a/ethology/annotations/validators.py
+++ b/ethology/annotations/validators.py
@@ -38,7 +38,9 @@ class ValidJSON:
 
     # Required attributes
     path: Path = field(validator=validators.instance_of(Path))
-    schema: dict = field()
+
+    # Optional attributes
+    schema: dict | None = field(default=None)
 
     @path.validator
     def _file_is_json(self, attribute, value):
@@ -66,14 +68,15 @@ def _file_matches_JSON_schema(self, attribute, value):
         with open(value) as file:
             data = json.load(file)
 
-        # check against schema
-        try:
-            jsonschema.validate(instance=data, schema=self.schema)
-        except jsonschema.exceptions.ValidationError as val_err:
-            raise ValueError(
-                "The JSON data does not match "
-                f"the provided schema: {self.schema}."
-            ) from val_err
+        # check against schema if provided
+        if self.schema:
+            try:
+                jsonschema.validate(instance=data, schema=self.schema)
+            except jsonschema.exceptions.ValidationError as val_err:
+                raise ValueError(
+                    "The JSON data does not match "
+                    f"the provided schema: {self.schema}."
+                ) from val_err
 
 
 @define
diff --git a/tests/test_unit/test_annotations/test_validators.py b/tests/test_unit/test_annotations/test_validators.py
index 4c9fd34..7fe9820 100644
--- a/tests/test_unit/test_annotations/test_validators.py
+++ b/tests/test_unit/test_annotations/test_validators.py
@@ -68,20 +68,28 @@ def coco_json_file_with_schema_error(
 
 
 @pytest.mark.parametrize(
-    "input_json_file, input_schema",
+    "input_json_file_suffix",
+    ["1", "2"],
+)
+@pytest.mark.parametrize(
+    "input_file_standard, input_schema",
     [
-        ("VIA_JSON_sample_1.json", VIA_UNTRACKED_SCHEMA),
-        ("VIA_JSON_sample_2.json", VIA_UNTRACKED_SCHEMA),
-        ("COCO_JSON_sample_1.json", COCO_UNTRACKED_SCHEMA),
-        ("COCO_JSON_sample_2.json", COCO_UNTRACKED_SCHEMA),
+        ("VIA", VIA_UNTRACKED_SCHEMA),
+        ("VIA", None),
+        ("COCO", COCO_UNTRACKED_SCHEMA),
+        ("COCO", None),
     ],
 )
 def test_valid_json(
     annotations_test_data,
-    input_json_file,
+    input_file_standard,
+    input_json_file_suffix,
     input_schema,
 ):
     """Test the ValidJSON validator with valid files."""
+    input_json_file = (
+        f"{input_file_standard}_JSON_sample_{input_json_file_suffix}.json"
+    )
     input_json_file = annotations_test_data[input_json_file]
     with does_not_raise():
         ValidJSON(path=input_json_file, schema=input_schema)

From 8223701da977425cf803e66e06b093757b097396 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 16 Dec 2024 14:32:35 +0000
Subject: [PATCH 24/36] Add tests for schemas

---
 ethology/annotations/json_schemas.py          |   1 -
 ethology/annotations/validators.py            |   7 +-
 .../test_annotations/test_validators.py       | 167 +++++++++---------
 3 files changed, 87 insertions(+), 88 deletions(-)

diff --git a/ethology/annotations/json_schemas.py b/ethology/annotations/json_schemas.py
index 7e9d265..161c924 100644
--- a/ethology/annotations/json_schemas.py
+++ b/ethology/annotations/json_schemas.py
@@ -66,7 +66,6 @@
     },
 }
 
-
 COCO_UNTRACKED_SCHEMA = {
     "type": "object",
     "properties": {
diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py
index 1a0d22f..2ad2b4a 100644
--- a/ethology/annotations/validators.py
+++ b/ethology/annotations/validators.py
@@ -5,6 +5,7 @@
 
 import jsonschema
 import jsonschema.exceptions
+import jsonschema.validators
 from attrs import define, field, validators
 
 
@@ -73,10 +74,8 @@ def _file_matches_JSON_schema(self, attribute, value):
             try:
                 jsonschema.validate(instance=data, schema=self.schema)
             except jsonschema.exceptions.ValidationError as val_err:
-                raise ValueError(
-                    "The JSON data does not match "
-                    f"the provided schema: {self.schema}."
-                ) from val_err
+                # forward the error message as it is quite informative
+                raise val_err
 
 
 @define
diff --git a/tests/test_unit/test_annotations/test_validators.py b/tests/test_unit/test_annotations/test_validators.py
index 7fe9820..dd66ad9 100644
--- a/tests/test_unit/test_annotations/test_validators.py
+++ b/tests/test_unit/test_annotations/test_validators.py
@@ -1,6 +1,7 @@
 import json
 from contextlib import nullcontext as does_not_raise
 
+import jsonschema
 import pytest
 
 from ethology.annotations.json_schemas import (
@@ -25,74 +26,83 @@ def json_file_with_not_found_error(tmp_path):
     return tmp_path / "JSON_file_not_found.json"
 
 
-@pytest.fixture()
-def via_json_file_with_schema_error(tmp_path, annotations_test_data):
+def _json_file_with_schema_error(out_parent_path, json_valid_path):
     """Return path to a JSON file that doesn't match the expected schema."""
     # read valid json file
-    via_json_valid_filepath = annotations_test_data["VIA_JSON_sample_1.json"]
-    with open(via_json_valid_filepath) as f:
+    with open(json_valid_path) as f:
         data = json.load(f)
 
-    # change type of specific keys
-    # - change "_via_image_id_list" from list of strings to list of integers
-    # TODO: what if I change several?
-    data["_via_image_id_list"] = list(range(len(data["_via_image_id_list"])))
-
-    # save the modified data to a new file under tmp_path
-    out_json = tmp_path / "VIA_JSON_schema_error.json"
+    # modify so that it doesn't match the corresponding schema
+    if "VIA" in json_valid_path.name:
+        # change "width" of a bounding box from int to float
+        data["_via_img_metadata"][
+            "09.08_09.08.2023-01-Left_frame_001764.png15086122"
+        ]["regions"][0]["shape_attributes"]["width"] = 49.5
+    elif "COCO" in json_valid_path.name:
+        # change "annotations" from list of dicts to list of lists
+        data["annotations"] = [[d] for d in data["annotations"]]
+
+    # save the modified json to a new file
+    out_json = out_parent_path / f"{json_valid_path.name}_schema_error.json"
     with open(out_json, "w") as f:
         json.dump(data, f)
     return out_json
 
 
 @pytest.fixture()
-def coco_json_file_with_schema_error(
+def via_json_file_with_schema_error(
     tmp_path,
     annotations_test_data,
 ):
-    """Return path to a JSON file that doesn't match the expected schema."""
-    # read valid json file
-    via_json_valid_filepath = annotations_test_data["COCO_JSON_sample_1.json"]
-    with open(via_json_valid_filepath) as f:
-        data = json.load(f)
+    """Return path to a VIA JSON file that doesn't match its schema."""
+    return _json_file_with_schema_error(
+        tmp_path,
+        annotations_test_data["VIA_JSON_sample_1.json"],
+    )
 
-    # change "annotations" from list of dicts to list of lists
-    # TODO: what if I change several?
-    data["annotations"] = [[d] for d in data["annotations"]]
 
-    # save the modified data to a new file under tmp_path
-    out_json = tmp_path / "VIA_JSON_schema_error.json"
-    with open(out_json, "w") as f:
-        json.dump(data, f)
-    return out_json
+@pytest.fixture()
+def coco_json_file_with_schema_error(
+    tmp_path,
+    annotations_test_data,
+):
+    """Return path to a COCO JSON file that doesn't match its schema."""
+    return _json_file_with_schema_error(
+        tmp_path,
+        annotations_test_data["COCO_JSON_sample_1.json"],
+    )
 
 
-@pytest.mark.parametrize(
-    "input_json_file_suffix",
-    ["1", "2"],
-)
 @pytest.mark.parametrize(
     "input_file_standard, input_schema",
     [
-        ("VIA", VIA_UNTRACKED_SCHEMA),
         ("VIA", None),
-        ("COCO", COCO_UNTRACKED_SCHEMA),
+        ("VIA", VIA_UNTRACKED_SCHEMA),
         ("COCO", None),
+        ("COCO", COCO_UNTRACKED_SCHEMA),
     ],
 )
+@pytest.mark.parametrize(
+    "input_json_file_suffix",
+    ["JSON_sample_1.json", "JSON_sample_2.json"],
+)
 def test_valid_json(
-    annotations_test_data,
     input_file_standard,
     input_json_file_suffix,
     input_schema,
+    annotations_test_data,
 ):
     """Test the ValidJSON validator with valid files."""
-    input_json_file = (
-        f"{input_file_standard}_JSON_sample_{input_json_file_suffix}.json"
-    )
-    input_json_file = annotations_test_data[input_json_file]
+    # get path to file
+    filepath = annotations_test_data[
+        f"{input_file_standard}_{input_json_file_suffix}"
+    ]
+
     with does_not_raise():
-        ValidJSON(path=input_json_file, schema=input_schema)
+        ValidJSON(
+            path=filepath,
+            schema=input_schema,
+        )
 
 
 @pytest.mark.parametrize(
@@ -100,27 +110,55 @@ def test_valid_json(
     [
         (
             "json_file_with_decode_error",
-            VIA_UNTRACKED_SCHEMA,  # should be independent of schema
+            None,  # should be independent of schema
             pytest.raises(ValueError),
             "Error decoding JSON data from file: {}.",
         ),
         (
             "json_file_with_not_found_error",
-            VIA_UNTRACKED_SCHEMA,  # should be independent of schema
+            None,  # should be independent of schema
             pytest.raises(FileNotFoundError),
             "File not found: {}.",
         ),
         (
             "via_json_file_with_schema_error",
             VIA_UNTRACKED_SCHEMA,
-            pytest.raises(ValueError),
-            "The JSON data does not match the provided schema: {}.",
+            pytest.raises(jsonschema.exceptions.ValidationError),
+            "49.5 is not of type 'integer'\n\n"
+            "Failed validating 'type' in "
+            "schema['properties']['_via_img_metadata']['additionalProperties']"
+            "['properties']['regions']['items']['properties']"
+            "['shape_attributes']['properties']['width']:\n"
+            "    {'type': 'integer'}\n\n"
+            "On instance['_via_img_metadata']"
+            "['09.08_09.08.2023-01-Left_frame_001764.png15086122']['regions']"
+            "[0]['shape_attributes']['width']:\n"
+            "    49.5",
         ),
         (
             "coco_json_file_with_schema_error",
             COCO_UNTRACKED_SCHEMA,
-            pytest.raises(ValueError),
-            "The JSON data does not match the provided schema: {}.",
+            pytest.raises(jsonschema.exceptions.ValidationError),
+            "[{'area': 432, 'bbox': [1278, 556, 16, 27], 'category_id': 1, "
+            "'id': 8917, 'image_id': 199, 'iscrowd': 0}] is not of type "
+            "'object'\n\n"
+            "Failed validating 'type' in "
+            "schema['properties']['annotations']['items']:\n"
+            "    {'type': 'object',\n"
+            "     'properties': {'id': {'type': 'integer'},\n"
+            "                    'image_id': {'type': 'integer'},\n"
+            "                    'bbox': {'type': 'array', 'items': "
+            "{'type': 'integer'}},\n"
+            "                    'category_id': {'type': 'integer'},\n"
+            "                    'area': {'type': 'integer'},\n"
+            "                    'iscrowd': {'type': 'integer'}}}\n\n"
+            "On instance['annotations'][4343]:\n"
+            "    [{'area': 432,\n"
+            "      'bbox': [1278, 556, 16, 27],\n"
+            "      'category_id': 1,\n"
+            "      'id': 8917,\n"
+            "      'image_id': 199,\n"
+            "      'iscrowd': 0}]",
         ),
     ],
 )
@@ -131,50 +169,13 @@ def test_valid_json_error(
     log_message,
     request,
 ):
-    """Test the ValidJSON validator throws the expected error."""
+    """Test the ValidJSON validator throws the expected errors."""
     invalid_json_file = request.getfixturevalue(invalid_json_file_str)
 
     with expected_exception as excinfo:
         ValidJSON(path=invalid_json_file, schema=input_schema)
 
-    if "schema" in invalid_json_file_str:
-        assert str(excinfo.value) == log_message.format(input_schema)
+    if input_schema:
+        assert str(excinfo.value) == log_message
     else:
         assert str(excinfo.value) == log_message.format(invalid_json_file)
-
-
-# @pytest.mark.parametrize(
-#     "valid_json_file, input_schema",
-#     [
-#         ("VIA_JSON_sample_1.json", VIA_UNTRACKED_SCHEMA),
-#         ("COCO_JSON_sample_1.json", COCO_UNTRACKED_SCHEMA),
-#     ],
-# )
-# @pytest.mark.parametrize(
-#     "invalid_json_factory, expected_exception, log_message",
-#     [
-#         (
-#             "get_json_file_with_schema_error",
-#             pytest.raises(ValueError),
-#             "The JSON data does not match the provided schema: {}.",
-#         ),
-#     ],
-# )
-# def test_valid_json_schema_error(
-#     valid_json_file,
-#     input_schema,
-#     invalid_json_factory,
-#     expected_exception,
-#     log_message,
-#     tmp_path,
-#     request,
-# ):
-#     """Test the ValidJSON validator throws the expected error."""
-#     invalid_json_factory = request.getfixturevalue(invalid_json_factory)
-#     invalid_json_file = invalid_json_factory(valid_json_file)
-
-#     with expected_exception as excinfo:
-#         ValidJSON(path=invalid_json_file, schema=input_schema)
-
-#     if log_message:
-#         assert str(excinfo.value) == log_message.format(input_schema)

From 92d6b74e59838c1ebc3d382726fa3e21cca660bd Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 16 Dec 2024 14:34:26 +0000
Subject: [PATCH 25/36] Reduce error message check for schema validation

---
 .../test_annotations/test_validators.py       | 81 +++++++------------
 1 file changed, 27 insertions(+), 54 deletions(-)

diff --git a/tests/test_unit/test_annotations/test_validators.py b/tests/test_unit/test_annotations/test_validators.py
index dd66ad9..00a8662 100644
--- a/tests/test_unit/test_annotations/test_validators.py
+++ b/tests/test_unit/test_annotations/test_validators.py
@@ -26,29 +26,6 @@ def json_file_with_not_found_error(tmp_path):
     return tmp_path / "JSON_file_not_found.json"
 
 
-def _json_file_with_schema_error(out_parent_path, json_valid_path):
-    """Return path to a JSON file that doesn't match the expected schema."""
-    # read valid json file
-    with open(json_valid_path) as f:
-        data = json.load(f)
-
-    # modify so that it doesn't match the corresponding schema
-    if "VIA" in json_valid_path.name:
-        # change "width" of a bounding box from int to float
-        data["_via_img_metadata"][
-            "09.08_09.08.2023-01-Left_frame_001764.png15086122"
-        ]["regions"][0]["shape_attributes"]["width"] = 49.5
-    elif "COCO" in json_valid_path.name:
-        # change "annotations" from list of dicts to list of lists
-        data["annotations"] = [[d] for d in data["annotations"]]
-
-    # save the modified json to a new file
-    out_json = out_parent_path / f"{json_valid_path.name}_schema_error.json"
-    with open(out_json, "w") as f:
-        json.dump(data, f)
-    return out_json
-
-
 @pytest.fixture()
 def via_json_file_with_schema_error(
     tmp_path,
@@ -73,6 +50,29 @@ def coco_json_file_with_schema_error(
     )
 
 
+def _json_file_with_schema_error(out_parent_path, json_valid_path):
+    """Return path to a JSON file that doesn't match the expected schema."""
+    # read valid json file
+    with open(json_valid_path) as f:
+        data = json.load(f)
+
+    # modify so that it doesn't match the corresponding schema
+    if "VIA" in json_valid_path.name:
+        # change "width" of a bounding box from int to float
+        data["_via_img_metadata"][
+            "09.08_09.08.2023-01-Left_frame_001764.png15086122"
+        ]["regions"][0]["shape_attributes"]["width"] = 49.5
+    elif "COCO" in json_valid_path.name:
+        # change "annotations" from list of dicts to list of lists
+        data["annotations"] = [[d] for d in data["annotations"]]
+
+    # save the modified json to a new file
+    out_json = out_parent_path / f"{json_valid_path.name}_schema_error.json"
+    with open(out_json, "w") as f:
+        json.dump(data, f)
+    return out_json
+
+
 @pytest.mark.parametrize(
     "input_file_standard, input_schema",
     [
@@ -93,7 +93,6 @@ def test_valid_json(
     annotations_test_data,
 ):
     """Test the ValidJSON validator with valid files."""
-    # get path to file
     filepath = annotations_test_data[
         f"{input_file_standard}_{input_json_file_suffix}"
     ]
@@ -124,16 +123,7 @@ def test_valid_json(
             "via_json_file_with_schema_error",
             VIA_UNTRACKED_SCHEMA,
             pytest.raises(jsonschema.exceptions.ValidationError),
-            "49.5 is not of type 'integer'\n\n"
-            "Failed validating 'type' in "
-            "schema['properties']['_via_img_metadata']['additionalProperties']"
-            "['properties']['regions']['items']['properties']"
-            "['shape_attributes']['properties']['width']:\n"
-            "    {'type': 'integer'}\n\n"
-            "On instance['_via_img_metadata']"
-            "['09.08_09.08.2023-01-Left_frame_001764.png15086122']['regions']"
-            "[0]['shape_attributes']['width']:\n"
-            "    49.5",
+            "49.5 is not of type 'integer'\n\n",
         ),
         (
             "coco_json_file_with_schema_error",
@@ -141,24 +131,7 @@ def test_valid_json(
             pytest.raises(jsonschema.exceptions.ValidationError),
             "[{'area': 432, 'bbox': [1278, 556, 16, 27], 'category_id': 1, "
             "'id': 8917, 'image_id': 199, 'iscrowd': 0}] is not of type "
-            "'object'\n\n"
-            "Failed validating 'type' in "
-            "schema['properties']['annotations']['items']:\n"
-            "    {'type': 'object',\n"
-            "     'properties': {'id': {'type': 'integer'},\n"
-            "                    'image_id': {'type': 'integer'},\n"
-            "                    'bbox': {'type': 'array', 'items': "
-            "{'type': 'integer'}},\n"
-            "                    'category_id': {'type': 'integer'},\n"
-            "                    'area': {'type': 'integer'},\n"
-            "                    'iscrowd': {'type': 'integer'}}}\n\n"
-            "On instance['annotations'][4343]:\n"
-            "    [{'area': 432,\n"
-            "      'bbox': [1278, 556, 16, 27],\n"
-            "      'category_id': 1,\n"
-            "      'id': 8917,\n"
-            "      'image_id': 199,\n"
-            "      'iscrowd': 0}]",
+            "'object'\n\n",
         ),
     ],
 )
@@ -176,6 +149,6 @@ def test_valid_json_error(
         ValidJSON(path=invalid_json_file, schema=input_schema)
 
     if input_schema:
-        assert str(excinfo.value) == log_message
+        assert log_message in str(excinfo.value)
     else:
-        assert str(excinfo.value) == log_message.format(invalid_json_file)
+        assert log_message.format(invalid_json_file) == str(excinfo.value)

From a6523e1650226734aa3b9b94c5ce9ec3cf9a488f Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 16 Dec 2024 16:27:24 +0000
Subject: [PATCH 26/36] Add tests for keys check WIP

---
 ethology/annotations/validators.py            |  87 +-----------
 .../test_annotations/test_validators.py       | 125 +++++++++++++++++-
 2 files changed, 129 insertions(+), 83 deletions(-)

diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py
index 2ad2b4a..e297978 100644
--- a/ethology/annotations/validators.py
+++ b/ethology/annotations/validators.py
@@ -84,9 +84,6 @@ class ValidVIAUntrackedJSON:
 
     Checks the VIA JSON file for untracked data contains the required keys.
 
-    Note that the validation against the schema does not check the existence
-    of the keys, only the type of their values if they exist.
-
     Attributes
     ----------
     path : pathlib.Path
@@ -119,7 +116,7 @@ def _file_contains_required_keys(self, attribute, value):
             _check_keys(
                 required_keys["image_keys"],
                 img_dict,
-                additional_error_message=f"for {img_str}",
+                additional_error_message=f" for {img_str}",
             )
             # Check keys for each region
             for region in img_dict["regions"]:
@@ -129,7 +126,7 @@ def _file_contains_required_keys(self, attribute, value):
                 _check_keys(
                     required_keys["shape_attributes_keys"],
                     region["shape_attributes"],
-                    additional_error_message=f"for region under {img_str}",
+                    additional_error_message=f" for region under {img_str}",
                 )
 
 
@@ -159,78 +156,6 @@ class ValidCOCOUntrackedJSON:
 
     path: Path = field(validator=validators.instance_of(Path))
 
-    # TODO: add a check for the presence of the keys
-    # that I use in loading the data
-
-    @path.validator
-    def _file_macthes_COCO_JSON_schema(self, attribute, value):
-        """Ensure that the JSON file matches the expected schema."""
-        # Define schema for VIA JSON file for untracked
-        # (aka manually labelled) data
-        COCO_JSON_schema = {
-            "type": "object",
-            "properties": {
-                "info": {"type": "object"},
-                "licenses": {
-                    "type": "array",
-                },
-                "images": {
-                    "type": "array",
-                    "items": {
-                        "type": "object",
-                        "properties": {
-                            "file_name": {"type": "string"},
-                            "id": {"type": "integer"},
-                            "width": {"type": "integer"},
-                            "height": {"type": "integer"},
-                        },
-                    },
-                },
-                "annotations": {
-                    "type": "array",
-                    "items": {
-                        "type": "object",
-                        "properties": {
-                            "id": {"type": "integer"},  # annotation global ID
-                            "image_id": {"type": "integer"},
-                            "bbox": {
-                                "type": "array",
-                                "items": {"type": "integer"},
-                            },
-                            "category_id": {"type": "integer"},
-                            "area": {"type": "integer"},
-                            "iscrowd": {"type": "integer"},
-                        },
-                    },
-                },
-                "categories": {
-                    "type": "array",
-                    "items": {
-                        "type": "object",
-                        "properties": {
-                            "id": {"type": "integer"},
-                            "name": {"type": "string"},
-                            "supercategory": {"type": "string"},
-                        },
-                    },
-                },
-            },
-        }
-
-        # should have been validated with ValidJSON
-        # already so this should work fine
-        with open(value) as file:
-            data = json.load(file)
-
-        # check against schema
-        try:
-            jsonschema.validate(instance=data, schema=COCO_JSON_schema)
-        except jsonschema.exceptions.ValidationError as val_err:
-            raise ValueError(
-                "The JSON data does not match "
-                f"the provided schema: {COCO_JSON_schema}"
-            ) from val_err
-
     @path.validator
     def _file_contains_required_keys(self, attribute, value):
         """Ensure that the JSON file contains the required keys."""
@@ -256,7 +181,7 @@ def _file_contains_required_keys(self, attribute, value):
             _check_keys(
                 required_keys["image_keys"],
                 img_dict,
-                additional_error_message=f"for image dict {img_dict}",
+                additional_error_message=f" for image dict {img_dict}",
             )
 
         # Check keys in annotations dicts
@@ -264,7 +189,7 @@ def _file_contains_required_keys(self, attribute, value):
             _check_keys(
                 required_keys["annotations_keys"],
                 annot_dict,
-                additional_error_message=f"for annotation dict {annot_dict}",
+                additional_error_message=f" for annotation dict {annot_dict}",
             )
 
         # Check keys in categories dicts
@@ -272,7 +197,7 @@ def _file_contains_required_keys(self, attribute, value):
             _check_keys(
                 required_keys["categories_keys"],
                 cat_dict,
-                additional_error_message=f"for category dict {cat_dict}",
+                additional_error_message=f" for category dict {cat_dict}",
             )
 
 
@@ -285,7 +210,7 @@ def _check_keys(
     if missing_keys:
         raise ValueError(
             f"Required key(s) {missing_keys} not "
-            f"found in {list(data_dict.keys())} "
+            f"found in {list(data_dict.keys())}"
             + additional_error_message
             + "."
         )
diff --git a/tests/test_unit/test_annotations/test_validators.py b/tests/test_unit/test_annotations/test_validators.py
index 00a8662..0f6b7d3 100644
--- a/tests/test_unit/test_annotations/test_validators.py
+++ b/tests/test_unit/test_annotations/test_validators.py
@@ -8,7 +8,10 @@
     COCO_UNTRACKED_SCHEMA,
     VIA_UNTRACKED_SCHEMA,
 )
-from ethology.annotations.validators import ValidJSON
+from ethology.annotations.validators import (
+    ValidJSON,
+    ValidVIAUntrackedJSON,
+)
 
 
 @pytest.fixture()
@@ -73,6 +76,47 @@ def _json_file_with_schema_error(out_parent_path, json_valid_path):
     return out_json
 
 
+@pytest.fixture()
+def via_json_file_with_missing_keys(tmp_path, annotations_test_data):
+    def _via_json_file_with_missing_keys(
+        valid_json_filename, required_keys_to_pop
+    ):
+        """Return path to a JSON file that is missing required keys."""
+        # read valid json file
+        valid_json_path = annotations_test_data[valid_json_filename]
+        with open(valid_json_path) as f:
+            data = json.load(f)
+
+        # remove any keys in the first level
+        for key in required_keys_to_pop.get("main", []):
+            data.pop(key)
+
+        # remove keys in nested dicts
+        for _, img_dict in data["_via_img_metadata"].items():
+            # remove keys for each image dictionary
+            for key in required_keys_to_pop.get("image_keys", []):
+                img_dict.pop(key)
+
+            for region in img_dict["regions"]:
+                # remove keys for each region
+                for key in required_keys_to_pop.get("region_keys", []):
+                    region.pop(key)
+
+                # remove keys under shape_attributes
+                for key in required_keys_to_pop.get(
+                    "shape_attributes_keys", []
+                ):
+                    region["shape_attributes"].pop(key)
+
+        # save the modified json to a new file
+        out_json = tmp_path / f"{valid_json_path.name}_missing_keys.json"
+        with open(out_json, "w") as f:
+            json.dump(data, f)
+        return out_json
+
+    return _via_json_file_with_missing_keys
+
+
 @pytest.mark.parametrize(
     "input_file_standard, input_schema",
     [
@@ -135,7 +179,7 @@ def test_valid_json(
         ),
     ],
 )
-def test_valid_json_error(
+def test_valid_json_errors(
     invalid_json_file_str,
     input_schema,
     expected_exception,
@@ -152,3 +196,80 @@ def test_valid_json_error(
         assert log_message in str(excinfo.value)
     else:
         assert log_message.format(invalid_json_file) == str(excinfo.value)
+
+
+@pytest.mark.parametrize(
+    "input_json_file",
+    [
+        "VIA_JSON_sample_1.json",
+        "VIA_JSON_sample_2.json",
+    ],
+)
+def test_valid_via_untracked_json(annotations_test_data, input_json_file):
+    filepath = annotations_test_data[input_json_file]
+    with does_not_raise():
+        ValidVIAUntrackedJSON(
+            path=filepath,
+        )
+
+
+@pytest.mark.parametrize(
+    "valid_json_file",
+    [
+        "VIA_JSON_sample_1.json",
+        "VIA_JSON_sample_2.json",
+    ],
+)
+@pytest.mark.parametrize(
+    "missing_keys, expected_exception, log_message",
+    [
+        (
+            {"main": ["_via_image_id_list"]},
+            pytest.raises(ValueError),
+            "Required key(s) {'_via_image_id_list'} not found "
+            "in ['_via_settings', '_via_img_metadata', '_via_attributes', "
+            "'_via_data_format_version'].",
+        ),
+        (
+            {"image_keys": ["filename"]},
+            pytest.raises(ValueError),
+            "Required key(s) {'filename'} not found "
+            "in ['size', 'regions', 'file_attributes'] "
+            "for 09.08_09.08.2023-01-Left_frame_001764.png15086122.",
+        ),
+        (
+            {"region_keys": ["shape_attributes"]},
+            pytest.raises(ValueError),
+            "The JSON data does not contain the required keys: annotations.",
+        ),
+        (
+            {"shape_attributes_keys": ["x"]},
+            pytest.raises(ValueError),
+            "The JSON data does not contain the required keys: annotations.",
+        ),
+    ],
+)
+def test_valid_via_untracked_json_missing_keys(
+    valid_json_file,
+    missing_keys,
+    via_json_file_with_missing_keys,
+    expected_exception,
+    log_message,
+):
+    # create invalid json file with missing keys
+    invalid_json_file = via_json_file_with_missing_keys(
+        valid_json_file, missing_keys
+    )
+
+    # run validatio
+    with expected_exception as excinfo:
+        ValidVIAUntrackedJSON(
+            path=invalid_json_file,
+        )
+
+    assert str(excinfo.value) == log_message
+
+
+# def test_valid_via_untracked_json ---> checks required keys
+# def test_valid_coco_untracked_json ---> checks required keys
+# def test_check_keys?

From ab65d95802acb4ef8b9badb372054688c753d510 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 16 Dec 2024 17:15:28 +0000
Subject: [PATCH 27/36] Add test for VIA JSON untracked validator

---
 ethology/annotations/validators.py            | 24 +++----
 .../test_annotations/test_validators.py       | 66 +++++++++++--------
 2 files changed, 53 insertions(+), 37 deletions(-)

diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py
index e297978..74b1462 100644
--- a/ethology/annotations/validators.py
+++ b/ethology/annotations/validators.py
@@ -116,17 +116,21 @@ def _file_contains_required_keys(self, attribute, value):
             _check_keys(
                 required_keys["image_keys"],
                 img_dict,
-                additional_error_message=f" for {img_str}",
+                additional_message=f" for {img_str}",
             )
             # Check keys for each region
-            for region in img_dict["regions"]:
-                _check_keys(required_keys["region_keys"], region)
+            for i, region in enumerate(img_dict["regions"]):
+                _check_keys(
+                    required_keys["region_keys"],
+                    region,
+                    additional_message=f" for region {i} under {img_str}",
+                )
 
                 # Check keys under shape_attributes
                 _check_keys(
                     required_keys["shape_attributes_keys"],
                     region["shape_attributes"],
-                    additional_error_message=f" for region under {img_str}",
+                    additional_message=f" for region {i} under {img_str}",
                 )
 
 
@@ -181,7 +185,7 @@ def _file_contains_required_keys(self, attribute, value):
             _check_keys(
                 required_keys["image_keys"],
                 img_dict,
-                additional_error_message=f" for image dict {img_dict}",
+                additional_message=f" for image dict {img_dict}",
             )
 
         # Check keys in annotations dicts
@@ -189,7 +193,7 @@ def _file_contains_required_keys(self, attribute, value):
             _check_keys(
                 required_keys["annotations_keys"],
                 annot_dict,
-                additional_error_message=f" for annotation dict {annot_dict}",
+                additional_message=f" for annotation dict {annot_dict}",
             )
 
         # Check keys in categories dicts
@@ -197,20 +201,18 @@ def _file_contains_required_keys(self, attribute, value):
             _check_keys(
                 required_keys["categories_keys"],
                 cat_dict,
-                additional_error_message=f" for category dict {cat_dict}",
+                additional_message=f" for category dict {cat_dict}",
             )
 
 
 def _check_keys(
     list_required_keys: list[str],
     data_dict: dict,
-    additional_error_message: str = "",
+    additional_message: str = "",
 ):
     missing_keys = set(list_required_keys) - data_dict.keys()
     if missing_keys:
         raise ValueError(
             f"Required key(s) {missing_keys} not "
-            f"found in {list(data_dict.keys())}"
-            + additional_error_message
-            + "."
+            f"found in {list(data_dict.keys())}" + additional_message + "."
         )
diff --git a/tests/test_unit/test_annotations/test_validators.py b/tests/test_unit/test_annotations/test_validators.py
index 0f6b7d3..4c96511 100644
--- a/tests/test_unit/test_annotations/test_validators.py
+++ b/tests/test_unit/test_annotations/test_validators.py
@@ -8,10 +8,7 @@
     COCO_UNTRACKED_SCHEMA,
     VIA_UNTRACKED_SCHEMA,
 )
-from ethology.annotations.validators import (
-    ValidJSON,
-    ValidVIAUntrackedJSON,
-)
+from ethology.annotations.validators import ValidJSON, ValidVIAUntrackedJSON
 
 
 @pytest.fixture()
@@ -81,7 +78,12 @@ def via_json_file_with_missing_keys(tmp_path, annotations_test_data):
     def _via_json_file_with_missing_keys(
         valid_json_filename, required_keys_to_pop
     ):
-        """Return path to a JSON file that is missing required keys."""
+        """Return path to a JSON file that is missing required keys.
+
+        If a key to pop refers to a nested dictionary, it is removed from
+        the first element.
+
+        """
         # read valid json file
         valid_json_path = annotations_test_data[valid_json_filename]
         with open(valid_json_path) as f:
@@ -91,28 +93,35 @@ def _via_json_file_with_missing_keys(
         for key in required_keys_to_pop.get("main", []):
             data.pop(key)
 
-        # remove keys in nested dicts
-        for _, img_dict in data["_via_img_metadata"].items():
-            # remove keys for each image dictionary
+        # remove keys in nested dictionaries
+        edited_image_dicts = {}
+        if "_via_img_metadata" in data:
+            # remove image keys for first image dictionary
+            img_str, img_dict = list(data["_via_img_metadata"].items())[
+                0
+            ]  # list(data["_via_img_metadata"].values())[0]
             for key in required_keys_to_pop.get("image_keys", []):
                 img_dict.pop(key)
+                edited_image_dicts["image_keys"] = img_str
 
-            for region in img_dict["regions"]:
-                # remove keys for each region
-                for key in required_keys_to_pop.get("region_keys", []):
-                    region.pop(key)
+            # remove region keys for first region under second image dictionary
+            img_str, img_dict = list(data["_via_img_metadata"].items())[1]
+            for key in required_keys_to_pop.get("region_keys", []):
+                img_dict["regions"][0].pop(key)
+                edited_image_dicts["region_keys"] = img_str
 
-                # remove keys under shape_attributes
-                for key in required_keys_to_pop.get(
-                    "shape_attributes_keys", []
-                ):
-                    region["shape_attributes"].pop(key)
+            # remove shape_attributes keys for first region under third image
+            # dictionary
+            img_str, img_dict = list(data["_via_img_metadata"].items())[2]
+            for key in required_keys_to_pop.get("shape_attributes_keys", []):
+                img_dict["regions"][0]["shape_attributes"].pop(key)
+                edited_image_dicts["shape_attributes_keys"] = img_str
 
         # save the modified json to a new file
         out_json = tmp_path / f"{valid_json_path.name}_missing_keys.json"
         with open(out_json, "w") as f:
             json.dump(data, f)
-        return out_json
+        return out_json, edited_image_dicts
 
     return _via_json_file_with_missing_keys
 
@@ -226,26 +235,28 @@ def test_valid_via_untracked_json(annotations_test_data, input_json_file):
         (
             {"main": ["_via_image_id_list"]},
             pytest.raises(ValueError),
-            "Required key(s) {'_via_image_id_list'} not found "
+            "Required key(s) {{'_via_image_id_list'}} not found "
             "in ['_via_settings', '_via_img_metadata', '_via_attributes', "
             "'_via_data_format_version'].",
         ),
         (
             {"image_keys": ["filename"]},
             pytest.raises(ValueError),
-            "Required key(s) {'filename'} not found "
+            "Required key(s) {{'filename'}} not found "
             "in ['size', 'regions', 'file_attributes'] "
-            "for 09.08_09.08.2023-01-Left_frame_001764.png15086122.",
+            "for {}.",
         ),
         (
             {"region_keys": ["shape_attributes"]},
             pytest.raises(ValueError),
-            "The JSON data does not contain the required keys: annotations.",
+            "Required key(s) {{'shape_attributes'}} not found in "
+            "['region_attributes'] for region 0 under {}.",
         ),
         (
             {"shape_attributes_keys": ["x"]},
             pytest.raises(ValueError),
-            "The JSON data does not contain the required keys: annotations.",
+            "Required key(s) {{'x'}} not found in "
+            "['name', 'y', 'width', 'height'] for region 0 under {}.",
         ),
     ],
 )
@@ -257,17 +268,20 @@ def test_valid_via_untracked_json_missing_keys(
     log_message,
 ):
     # create invalid json file with missing keys
-    invalid_json_file = via_json_file_with_missing_keys(
+    invalid_json_file, edited_image_dicts = via_json_file_with_missing_keys(
         valid_json_file, missing_keys
     )
 
-    # run validatio
+    # get key of affected image in _via_img_metadata
+    img_key_str = edited_image_dicts.get(list(missing_keys.keys())[0], None)
+
+    # run validation
     with expected_exception as excinfo:
         ValidVIAUntrackedJSON(
             path=invalid_json_file,
         )
 
-    assert str(excinfo.value) == log_message
+    assert str(excinfo.value) == log_message.format(img_key_str)
 
 
 # def test_valid_via_untracked_json ---> checks required keys

From 3e241865d0cd6e7a72442824a04317c7302568e2 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 16 Dec 2024 17:58:02 +0000
Subject: [PATCH 28/36] Add test for valid_coco_untracked_json

---
 ethology/annotations/validators.py            |   7 +-
 .../test_annotations/test_validators.py       | 162 +++++++++++++++---
 2 files changed, 142 insertions(+), 27 deletions(-)

diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py
index 74b1462..f433577 100644
--- a/ethology/annotations/validators.py
+++ b/ethology/annotations/validators.py
@@ -165,10 +165,7 @@ def _file_contains_required_keys(self, attribute, value):
         """Ensure that the JSON file contains the required keys."""
         required_keys = {
             "main": ["images", "annotations", "categories"],
-            "image_keys": [
-                "id",
-                "file_name",
-            ],  # add height and width of image?
+            "image_keys": ["id", "file_name"],  # "height", "width"?
             "annotations_keys": ["id", "image_id", "bbox", "category_id"],
             "categories_keys": ["id", "name", "supercategory"],
         }
@@ -213,6 +210,6 @@ def _check_keys(
     missing_keys = set(list_required_keys) - data_dict.keys()
     if missing_keys:
         raise ValueError(
-            f"Required key(s) {missing_keys} not "
+            f"Required key(s) {sorted(missing_keys)} not "
             f"found in {list(data_dict.keys())}" + additional_message + "."
         )
diff --git a/tests/test_unit/test_annotations/test_validators.py b/tests/test_unit/test_annotations/test_validators.py
index 4c96511..a3fb678 100644
--- a/tests/test_unit/test_annotations/test_validators.py
+++ b/tests/test_unit/test_annotations/test_validators.py
@@ -8,7 +8,11 @@
     COCO_UNTRACKED_SCHEMA,
     VIA_UNTRACKED_SCHEMA,
 )
-from ethology.annotations.validators import ValidJSON, ValidVIAUntrackedJSON
+from ethology.annotations.validators import (
+    ValidCOCOUntrackedJSON,
+    ValidJSON,
+    ValidVIAUntrackedJSON,
+)
 
 
 @pytest.fixture()
@@ -75,15 +79,12 @@ def _json_file_with_schema_error(out_parent_path, json_valid_path):
 
 @pytest.fixture()
 def via_json_file_with_missing_keys(tmp_path, annotations_test_data):
+    """Return factory of paths to VIA JSON files with missing required keys."""
+
     def _via_json_file_with_missing_keys(
         valid_json_filename, required_keys_to_pop
     ):
-        """Return path to a JSON file that is missing required keys.
-
-        If a key to pop refers to a nested dictionary, it is removed from
-        the first element.
-
-        """
+        """Return path to JSON file with missing keys and edited image keys."""
         # read valid json file
         valid_json_path = annotations_test_data[valid_json_filename]
         with open(valid_json_path) as f:
@@ -96,26 +97,26 @@ def _via_json_file_with_missing_keys(
         # remove keys in nested dictionaries
         edited_image_dicts = {}
         if "_via_img_metadata" in data:
+            list_img_metadata_tuples = list(data["_via_img_metadata"].items())
+
             # remove image keys for first image dictionary
-            img_str, img_dict = list(data["_via_img_metadata"].items())[
-                0
-            ]  # list(data["_via_img_metadata"].values())[0]
+            img_str, img_dict = list_img_metadata_tuples[0]
+            edited_image_dicts["image_keys"] = img_str
             for key in required_keys_to_pop.get("image_keys", []):
                 img_dict.pop(key)
-                edited_image_dicts["image_keys"] = img_str
 
             # remove region keys for first region under second image dictionary
-            img_str, img_dict = list(data["_via_img_metadata"].items())[1]
+            img_str, img_dict = list_img_metadata_tuples[1]
+            edited_image_dicts["region_keys"] = img_str
             for key in required_keys_to_pop.get("region_keys", []):
                 img_dict["regions"][0].pop(key)
-                edited_image_dicts["region_keys"] = img_str
 
             # remove shape_attributes keys for first region under third image
             # dictionary
-            img_str, img_dict = list(data["_via_img_metadata"].items())[2]
+            img_str, img_dict = list_img_metadata_tuples[2]
+            edited_image_dicts["shape_attributes_keys"] = img_str
             for key in required_keys_to_pop.get("shape_attributes_keys", []):
                 img_dict["regions"][0]["shape_attributes"].pop(key)
-                edited_image_dicts["shape_attributes_keys"] = img_str
 
         # save the modified json to a new file
         out_json = tmp_path / f"{valid_json_path.name}_missing_keys.json"
@@ -126,6 +127,54 @@ def _via_json_file_with_missing_keys(
     return _via_json_file_with_missing_keys
 
 
+@pytest.fixture()
+def coco_json_file_with_missing_keys(tmp_path, annotations_test_data):
+    """Return factory of paths to COCO JSON files with missing required
+    keys.
+    """
+
+    def _coco_json_file_with_missing_keys(
+        valid_json_filename, required_keys_to_pop
+    ):
+        """Return path to JSON file with missing keys, and the edited dicts."""
+        # read valid json file
+        valid_json_path = annotations_test_data[valid_json_filename]
+        with open(valid_json_path) as f:
+            data = json.load(f)
+
+        # remove any keys in the first level
+        for key in required_keys_to_pop.get("main", []):
+            data.pop(key)
+
+        edited_image_dicts = {}
+
+        # remove required keys in first images dictionary
+        if "images" in data:
+            edited_image_dicts["image_keys"] = data["images"][0]
+            for key in required_keys_to_pop.get("image_keys", []):
+                data["images"][0].pop(key)
+
+        # remove required keys in first annotations dictionary
+        if "annotations" in data:
+            edited_image_dicts["annotations_keys"] = data["annotations"][0]
+            for key in required_keys_to_pop.get("annotations_keys", []):
+                data["annotations"][0].pop(key)
+
+        # remove required keys in first categories dictionary
+        if "categories" in data:
+            edited_image_dicts["categories_keys"] = data["categories"][0]
+            for key in required_keys_to_pop.get("categories_keys", []):
+                data["categories"][0].pop(key)
+
+        # save the modified json to a new file
+        out_json = tmp_path / f"{valid_json_path.name}_missing_keys.json"
+        with open(out_json, "w") as f:
+            json.dump(data, f)
+        return out_json, edited_image_dicts
+
+    return _coco_json_file_with_missing_keys
+
+
 @pytest.mark.parametrize(
     "input_file_standard, input_schema",
     [
@@ -235,27 +284,34 @@ def test_valid_via_untracked_json(annotations_test_data, input_json_file):
         (
             {"main": ["_via_image_id_list"]},
             pytest.raises(ValueError),
-            "Required key(s) {{'_via_image_id_list'}} not found "
+            "Required key(s) ['_via_image_id_list'] not found "
             "in ['_via_settings', '_via_img_metadata', '_via_attributes', "
             "'_via_data_format_version'].",
         ),
+        (
+            {"main": ["_via_image_id_list", "_via_img_metadata"]},
+            pytest.raises(ValueError),
+            "Required key(s) ['_via_image_id_list', '_via_img_metadata'] "
+            "not found in ['_via_settings', '_via_attributes', "
+            "'_via_data_format_version'].",
+        ),
         (
             {"image_keys": ["filename"]},
             pytest.raises(ValueError),
-            "Required key(s) {{'filename'}} not found "
+            "Required key(s) ['filename'] not found "
             "in ['size', 'regions', 'file_attributes'] "
             "for {}.",
         ),
         (
             {"region_keys": ["shape_attributes"]},
             pytest.raises(ValueError),
-            "Required key(s) {{'shape_attributes'}} not found in "
+            "Required key(s) ['shape_attributes'] not found in "
             "['region_attributes'] for region 0 under {}.",
         ),
         (
             {"shape_attributes_keys": ["x"]},
             pytest.raises(ValueError),
-            "Required key(s) {{'x'}} not found in "
+            "Required key(s) ['x'] not found in "
             "['name', 'y', 'width', 'height'] for region 0 under {}.",
         ),
     ],
@@ -284,6 +340,68 @@ def test_valid_via_untracked_json_missing_keys(
     assert str(excinfo.value) == log_message.format(img_key_str)
 
 
-# def test_valid_via_untracked_json ---> checks required keys
-# def test_valid_coco_untracked_json ---> checks required keys
-# def test_check_keys?
+@pytest.mark.parametrize(
+    "valid_json_file",
+    [
+        "COCO_JSON_sample_1.json",
+        "COCO_JSON_sample_2.json",
+    ],
+)
+@pytest.mark.parametrize(
+    "missing_keys, expected_exception, log_message",
+    [
+        (
+            {"main": ["categories"]},
+            pytest.raises(ValueError),
+            "Required key(s) ['categories'] not found "
+            "in ['annotations', 'images', 'info', 'licenses'].",
+        ),
+        (
+            {"main": ["categories", "images"]},
+            pytest.raises(ValueError),
+            "Required key(s) ['categories', 'images'] not found "
+            "in ['annotations', 'info', 'licenses'].",
+        ),
+        (
+            {"image_keys": ["file_name"]},
+            pytest.raises(ValueError),
+            "Required key(s) ['file_name'] not found in "
+            "['height', 'id', 'width'] for image dict {}.",
+        ),
+        (
+            {"annotations_keys": ["category_id"]},
+            pytest.raises(ValueError),
+            "Required key(s) ['category_id'] not found in "
+            "['area', 'bbox', 'id', 'image_id', 'iscrowd'] for "
+            "annotation dict {}.",
+        ),
+        (
+            {"categories_keys": ["id"]},
+            pytest.raises(ValueError),
+            "Required key(s) ['id'] not found in "
+            "['name', 'supercategory'] for category dict {}.",
+        ),
+    ],
+)
+def test_valid_coco_untracked_json(
+    valid_json_file,
+    missing_keys,
+    coco_json_file_with_missing_keys,
+    expected_exception,
+    log_message,
+):
+    # create invalid json file with missing keys
+    invalid_json_file, edited_image_dicts = coco_json_file_with_missing_keys(
+        valid_json_file, missing_keys
+    )
+
+    # get the edited image, annotation or category dict (if any)
+    img_dict = edited_image_dicts.get(list(missing_keys.keys())[0], None)
+
+    # run validation
+    with expected_exception as excinfo:
+        ValidCOCOUntrackedJSON(
+            path=invalid_json_file,
+        )
+
+    assert str(excinfo.value) == log_message.format(img_dict)

From e8fcb64ba29db7c657f04e6fbbeb7b49cd2b6d6e Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 16 Dec 2024 18:15:27 +0000
Subject: [PATCH 29/36] Add test for check_keys

---
 ethology/annotations/validators.py            |  2 +-
 .../test_annotations/test_validators.py       | 54 +++++++++++++++++--
 2 files changed, 50 insertions(+), 6 deletions(-)

diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py
index f433577..13deb1b 100644
--- a/ethology/annotations/validators.py
+++ b/ethology/annotations/validators.py
@@ -211,5 +211,5 @@ def _check_keys(
     if missing_keys:
         raise ValueError(
             f"Required key(s) {sorted(missing_keys)} not "
-            f"found in {list(data_dict.keys())}" + additional_message + "."
+            f"found in {list(data_dict.keys())}{additional_message}."
         )
diff --git a/tests/test_unit/test_annotations/test_validators.py b/tests/test_unit/test_annotations/test_validators.py
index a3fb678..d290de4 100644
--- a/tests/test_unit/test_annotations/test_validators.py
+++ b/tests/test_unit/test_annotations/test_validators.py
@@ -12,6 +12,7 @@
     ValidCOCOUntrackedJSON,
     ValidJSON,
     ValidVIAUntrackedJSON,
+    _check_keys,
 )
 
 
@@ -61,13 +62,12 @@ def _json_file_with_schema_error(out_parent_path, json_valid_path):
         data = json.load(f)
 
     # modify so that it doesn't match the corresponding schema
+    # if VIA, change "width" of a bounding box from int to float
+    # if COCO, change "annotations" from list of dicts to list of lists
     if "VIA" in json_valid_path.name:
-        # change "width" of a bounding box from int to float
-        data["_via_img_metadata"][
-            "09.08_09.08.2023-01-Left_frame_001764.png15086122"
-        ]["regions"][0]["shape_attributes"]["width"] = 49.5
+        _, img_dict = list(data["_via_img_metadata"].items())[0]
+        img_dict["regions"][0]["shape_attributes"]["width"] = 49.5
     elif "COCO" in json_valid_path.name:
-        # change "annotations" from list of dicts to list of lists
         data["annotations"] = [[d] for d in data["annotations"]]
 
     # save the modified json to a new file
@@ -405,3 +405,47 @@ def test_valid_coco_untracked_json(
         )
 
     assert str(excinfo.value) == log_message.format(img_dict)
+
+
+@pytest.mark.parametrize(
+    "list_required_keys, data_dict, additional_message, expected_exception",
+    [
+        (
+            ["images", "annotations", "categories"],
+            {"images": "", "annotations": "", "categories": ""},
+            "",
+            does_not_raise(),
+        ),
+        (
+            ["images", "annotations", "categories"],
+            {"annotations": "", "categories": ""},
+            "",
+            pytest.raises(ValueError),
+        ),  # one missing key
+        (
+            ["images", "annotations", "categories"],
+            {"annotations": ""},
+            "",
+            pytest.raises(ValueError),
+        ),  # two missing keys
+        (
+            ["images", "annotations", "categories"],
+            {"annotations": "", "categories": ""},
+            "TEST",
+            pytest.raises(ValueError),
+        ),  # one missing key, with additional message
+    ],
+)
+def test_check_keys(
+    list_required_keys, data_dict, additional_message, expected_exception
+):
+    """Test the _check_keys helper function."""
+    with expected_exception as excinfo:
+        _check_keys(list_required_keys, data_dict, additional_message)
+
+    if excinfo:
+        missing_keys = set(list_required_keys) - data_dict.keys()
+        assert str(excinfo.value) == (
+            f"Required key(s) {sorted(missing_keys)} not "
+            f"found in {list(data_dict.keys())}{additional_message}."
+        )

From 06b47c122d93c2a93a2b5b61e36ea44b893d77cc Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 16 Dec 2024 18:30:22 +0000
Subject: [PATCH 30/36] Remove untracked from names

---
 ethology/annotations/io.py                    | 17 +++---
 ethology/annotations/json_schemas.py          |  4 +-
 ethology/annotations/validators.py            |  4 +-
 tests/conftest.py                             | 10 ++--
 .../test_annotations/test_validators.py       | 56 +++++++++----------
 5 files changed, 44 insertions(+), 47 deletions(-)

diff --git a/ethology/annotations/io.py b/ethology/annotations/io.py
index e62d9f0..1cf137e 100644
--- a/ethology/annotations/io.py
+++ b/ethology/annotations/io.py
@@ -6,14 +6,11 @@
 import pandas as pd
 from movement.validators.files import ValidFile
 
-from ethology.annotations.json_schemas import (
-    COCO_UNTRACKED_SCHEMA,
-    VIA_UNTRACKED_SCHEMA,
-)
+from ethology.annotations.json_schemas import COCO_SCHEMA, VIA_SCHEMA
 from ethology.annotations.validators import (
-    ValidCOCOUntrackedJSON,
+    ValidCOCOJSON,
     ValidJSON,
-    ValidVIAUntrackedJSON,
+    ValidVIAJSON,
 )
 
 STANDARD_DF_COLUMNS = [
@@ -49,8 +46,8 @@ def df_from_via_json_file(file_path: Path) -> pd.DataFrame:
     file = ValidFile(
         file_path, expected_permission="r", expected_suffix=[".json"]
     )
-    json_file = ValidJSON(path=file.path, schema=VIA_UNTRACKED_SCHEMA)
-    via_untracked_file = ValidVIAUntrackedJSON(json_file.path)
+    json_file = ValidJSON(path=file.path, schema=VIA_SCHEMA)
+    via_untracked_file = ValidVIAJSON(json_file.path)
 
     # Read as standard dataframe
     return _df_from_validated_via_json_file(via_untracked_file.path)
@@ -76,8 +73,8 @@ def df_from_coco_json_file(file_path: Path) -> pd.DataFrame:
     file = ValidFile(
         file_path, expected_permission="r", expected_suffix=[".json"]
     )
-    json_file = ValidJSON(path=file.path, schema=COCO_UNTRACKED_SCHEMA)
-    coco_untracked_file = ValidCOCOUntrackedJSON(json_file.path)
+    json_file = ValidJSON(path=file.path, schema=COCO_SCHEMA)
+    coco_untracked_file = ValidCOCOJSON(json_file.path)
 
     # Read as standard dataframe
     return _df_from_validated_coco_json_file(coco_untracked_file.path)
diff --git a/ethology/annotations/json_schemas.py b/ethology/annotations/json_schemas.py
index 161c924..79a6773 100644
--- a/ethology/annotations/json_schemas.py
+++ b/ethology/annotations/json_schemas.py
@@ -1,6 +1,6 @@
 """JSON schemas for VIA and COCO annotations."""
 
-VIA_UNTRACKED_SCHEMA = {
+VIA_SCHEMA = {
     "type": "object",
     "properties": {
         # settings for browser UI
@@ -66,7 +66,7 @@
     },
 }
 
-COCO_UNTRACKED_SCHEMA = {
+COCO_SCHEMA = {
     "type": "object",
     "properties": {
         "info": {"type": "object"},
diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py
index 13deb1b..f03bb55 100644
--- a/ethology/annotations/validators.py
+++ b/ethology/annotations/validators.py
@@ -79,7 +79,7 @@ def _file_matches_JSON_schema(self, attribute, value):
 
 
 @define
-class ValidVIAUntrackedJSON:
+class ValidVIAJSON:
     """Class for validating VIA JSON files for untracked data.
 
     Checks the VIA JSON file for untracked data contains the required keys.
@@ -135,7 +135,7 @@ def _file_contains_required_keys(self, attribute, value):
 
 
 @define
-class ValidCOCOUntrackedJSON:
+class ValidCOCOJSON:
     """Class for validating COCO JSON files for untracked data.
 
     The validator ensures that the file matches the expected schema.
diff --git a/tests/conftest.py b/tests/conftest.py
index 37d1041..28d0ec7 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -59,14 +59,14 @@ def pooch_registry() -> dict:
 @pytest.fixture()
 def get_paths_test_data():
     """Define a factory fixture to get the paths of the data files
-    under a specific zip.
+    under a specific subdirectory in the GIN repository.
 
-    The name of the zip file is intended to match a testing module. For
+    The name of the subdirectory is intended to match a testing module. For
     example, to get the paths to the test files for the annotations
-    tests module, we would call `get_paths_test_data(pooch_registry,
+    module, we would call `get_paths_test_data(pooch_registry,
     "test_annotations")` in a test. This assumes in the GIN repository
-    there is a zip file named `test_annotations.zip` under the `test_data`
-    directory containing the relevant test files.
+    there is a subdirectory named `test_annotations` under the `test_data`
+    directory with the relevant test files.
     """
 
     def _get_paths_test_data(pooch_registry, subdir_name: str) -> dict:
diff --git a/tests/test_unit/test_annotations/test_validators.py b/tests/test_unit/test_annotations/test_validators.py
index d290de4..604c7c4 100644
--- a/tests/test_unit/test_annotations/test_validators.py
+++ b/tests/test_unit/test_annotations/test_validators.py
@@ -4,14 +4,11 @@
 import jsonschema
 import pytest
 
-from ethology.annotations.json_schemas import (
-    COCO_UNTRACKED_SCHEMA,
-    VIA_UNTRACKED_SCHEMA,
-)
+from ethology.annotations.json_schemas import COCO_SCHEMA, VIA_SCHEMA
 from ethology.annotations.validators import (
-    ValidCOCOUntrackedJSON,
+    ValidCOCOJSON,
     ValidJSON,
-    ValidVIAUntrackedJSON,
+    ValidVIAJSON,
     _check_keys,
 )
 
@@ -179,9 +176,9 @@ def _coco_json_file_with_missing_keys(
     "input_file_standard, input_schema",
     [
         ("VIA", None),
-        ("VIA", VIA_UNTRACKED_SCHEMA),
+        ("VIA", VIA_SCHEMA),
         ("COCO", None),
-        ("COCO", COCO_UNTRACKED_SCHEMA),
+        ("COCO", COCO_SCHEMA),
     ],
 )
 @pytest.mark.parametrize(
@@ -194,7 +191,7 @@ def test_valid_json(
     input_schema,
     annotations_test_data,
 ):
-    """Test the ValidJSON validator with valid files."""
+    """Test the ValidJSON validator with valid inputs."""
     filepath = annotations_test_data[
         f"{input_file_standard}_{input_json_file_suffix}"
     ]
@@ -223,13 +220,13 @@ def test_valid_json(
         ),
         (
             "via_json_file_with_schema_error",
-            VIA_UNTRACKED_SCHEMA,
+            VIA_SCHEMA,
             pytest.raises(jsonschema.exceptions.ValidationError),
             "49.5 is not of type 'integer'\n\n",
         ),
         (
             "coco_json_file_with_schema_error",
-            COCO_UNTRACKED_SCHEMA,
+            COCO_SCHEMA,
             pytest.raises(jsonschema.exceptions.ValidationError),
             "[{'area': 432, 'bbox': [1278, 556, 16, 27], 'category_id': 1, "
             "'id': 8917, 'image_id': 199, 'iscrowd': 0}] is not of type "
@@ -263,16 +260,17 @@ def test_valid_json_errors(
         "VIA_JSON_sample_2.json",
     ],
 )
-def test_valid_via_untracked_json(annotations_test_data, input_json_file):
+def test_valid_via_json(annotations_test_data, input_json_file):
+    """Test the ValidVIAJSON validator with valid inputs."""
     filepath = annotations_test_data[input_json_file]
     with does_not_raise():
-        ValidVIAUntrackedJSON(
+        ValidVIAJSON(
             path=filepath,
         )
 
 
 @pytest.mark.parametrize(
-    "valid_json_file",
+    "valid_via_json_file",
     [
         "VIA_JSON_sample_1.json",
         "VIA_JSON_sample_2.json",
@@ -316,24 +314,25 @@ def test_valid_via_untracked_json(annotations_test_data, input_json_file):
         ),
     ],
 )
-def test_valid_via_untracked_json_missing_keys(
-    valid_json_file,
+def test_valid_via_json_missing_keys(
+    valid_via_json_file,
     missing_keys,
     via_json_file_with_missing_keys,
     expected_exception,
     log_message,
 ):
-    # create invalid json file with missing keys
+    """Test the ValidVIAJSON when input has missing keys."""
+    # create invalid VIA json file with missing keys
     invalid_json_file, edited_image_dicts = via_json_file_with_missing_keys(
-        valid_json_file, missing_keys
+        valid_via_json_file, missing_keys
     )
 
-    # get key of affected image in _via_img_metadata
+    # get key of affected images in _via_img_metadata
     img_key_str = edited_image_dicts.get(list(missing_keys.keys())[0], None)
 
     # run validation
     with expected_exception as excinfo:
-        ValidVIAUntrackedJSON(
+        ValidVIAJSON(
             path=invalid_json_file,
         )
 
@@ -341,7 +340,7 @@ def test_valid_via_untracked_json_missing_keys(
 
 
 @pytest.mark.parametrize(
-    "valid_json_file",
+    "valid_coco_json_file",
     [
         "COCO_JSON_sample_1.json",
         "COCO_JSON_sample_2.json",
@@ -383,16 +382,17 @@ def test_valid_via_untracked_json_missing_keys(
         ),
     ],
 )
-def test_valid_coco_untracked_json(
-    valid_json_file,
+def test_valid_coco_json_missing_keys(
+    valid_coco_json_file,
     missing_keys,
     coco_json_file_with_missing_keys,
     expected_exception,
     log_message,
 ):
+    """Test the ValidCOCOJSON when input has missing keys."""
     # create invalid json file with missing keys
     invalid_json_file, edited_image_dicts = coco_json_file_with_missing_keys(
-        valid_json_file, missing_keys
+        valid_coco_json_file, missing_keys
     )
 
     # get key of affected image in _via_img_metadata
@@ -400,7 +400,7 @@ def test_valid_coco_untracked_json(
 
     # run validation
     with expected_exception as excinfo:
-        ValidCOCOUntrackedJSON(
+        ValidCOCOJSON(
             path=invalid_json_file,
         )
 
@@ -415,7 +415,7 @@ def test_valid_coco_untracked_json(
             {"images": "", "annotations": "", "categories": ""},
             "",
             does_not_raise(),
-        ),
+        ),  # zero missing keys
         (
             ["images", "annotations", "categories"],
             {"annotations": "", "categories": ""},
@@ -431,9 +431,9 @@ def test_valid_coco_untracked_json(
         (
             ["images", "annotations", "categories"],
             {"annotations": "", "categories": ""},
-            "TEST",
+            "FOO",
             pytest.raises(ValueError),
-        ),  # one missing key
+        ),  # one missing key with additional message
     ],
 )
 def test_check_keys(

From 147eb11be7fa1cb05c524e3525241e79ce854ec2 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 16 Dec 2024 18:45:08 +0000
Subject: [PATCH 31/36] Remove dataloader (now in separate PR)

---
 ethology/annotations/io.py | 195 -------------------------------------
 1 file changed, 195 deletions(-)
 delete mode 100644 ethology/annotations/io.py

diff --git a/ethology/annotations/io.py b/ethology/annotations/io.py
deleted file mode 100644
index 1cf137e..0000000
--- a/ethology/annotations/io.py
+++ /dev/null
@@ -1,195 +0,0 @@
-"""Module for reading and writing manually labelled annotations."""
-
-import json
-from pathlib import Path
-
-import pandas as pd
-from movement.validators.files import ValidFile
-
-from ethology.annotations.json_schemas import COCO_SCHEMA, VIA_SCHEMA
-from ethology.annotations.validators import (
-    ValidCOCOJSON,
-    ValidJSON,
-    ValidVIAJSON,
-)
-
-STANDARD_DF_COLUMNS = [
-    "annotation_id",
-    "image_filename",
-    "image_id",
-    "x_min",
-    "y_min",
-    "width",
-    "height",
-    "supercategory",
-    "category",
-]
-
-
-def df_from_via_json_file(file_path: Path) -> pd.DataFrame:
-    """Validate and read untracked VIA JSON file.
-
-    The data is formatted as an untracked annotations DataFrame.
-
-    Parameters
-    ----------
-    file_path : Path
-        Path to the untracked VIA JSON file.
-
-    Returns
-    -------
-    pd.DataFrame
-        Untracked annotations DataFrame.
-
-    """
-    # Run validators
-    file = ValidFile(
-        file_path, expected_permission="r", expected_suffix=[".json"]
-    )
-    json_file = ValidJSON(path=file.path, schema=VIA_SCHEMA)
-    via_untracked_file = ValidVIAJSON(json_file.path)
-
-    # Read as standard dataframe
-    return _df_from_validated_via_json_file(via_untracked_file.path)
-
-
-def df_from_coco_json_file(file_path: Path) -> pd.DataFrame:
-    """Validate and read untracked COCO JSON file.
-
-    The data is formatted as an untracked annotations DataFrame.
-
-    Parameters
-    ----------
-    file_path : Path
-        Path to the untracked COCO JSON file.
-
-    Returns
-    -------
-    pd.DataFrame
-        Untracked annotations DataFrame.
-
-    """
-    # Run validators
-    file = ValidFile(
-        file_path, expected_permission="r", expected_suffix=[".json"]
-    )
-    json_file = ValidJSON(path=file.path, schema=COCO_SCHEMA)
-    coco_untracked_file = ValidCOCOJSON(json_file.path)
-
-    # Read as standard dataframe
-    return _df_from_validated_coco_json_file(coco_untracked_file.path)
-
-
-def _df_from_validated_via_json_file(file_path):
-    """Read VIA JSON file as standard untracked annotations DataFrame."""
-    # Read validated json as dict
-    with open(file_path) as file:
-        data_dict = json.load(file)
-
-    # Prepare data
-    image_metadata_dict = data_dict["_via_img_metadata"]
-    via_image_id_list = data_dict[
-        "_via_image_id_list"
-    ]  # ordered list of the keys in image_metadata_dict
-
-    # map filename to keys in image_metadata_dict
-    map_filename_to_via_img_id = {
-        img_dict["filename"]: ky
-        for ky, img_dict in image_metadata_dict.items()
-    }
-
-    # Build standard dataframe
-    list_rows = []
-    # loop thru images
-    for _, img_dict in image_metadata_dict.items():
-        # loop thru annotations in the image
-        for region in img_dict["regions"]:
-            region_shape = region["shape_attributes"]
-            region_attributes = region["region_attributes"]
-
-            row = {
-                "image_filename": img_dict["filename"],
-                "x_min": region_shape["x"],
-                "y_min": region_shape["y"],
-                "width": region_shape["width"],
-                "height": region_shape["height"],
-                "supercategory": list(region_attributes.keys())[
-                    0
-                ],  # takes first key as supercategory
-                "category": region_attributes[
-                    list(region_attributes.keys())[0]
-                ],
-            }
-
-            # append annotations to df
-            list_rows.append(row)
-
-    df = pd.DataFrame(
-        list_rows,
-        # columns=list(row.keys()),  # do I need this?
-    )
-
-    # add image_id column
-    df["image_id"] = df["image_filename"].apply(
-        lambda x: via_image_id_list.index(map_filename_to_via_img_id[x])
-    )
-
-    # add annotation_id column based on index
-    df["annotation_id"] = df.index
-
-    # reorder columns to match standard
-    df = df.reindex(columns=STANDARD_DF_COLUMNS)
-
-    return df
-
-
-def _df_from_validated_coco_json_file(file_path: Path) -> pd.DataFrame:
-    """Read COCO JSON file as standard untracked annotations DataFrame."""
-    # Read validated json as dict
-    with open(file_path) as file:
-        data_dict = json.load(file)
-
-    # Prepare data
-    map_image_id_to_filename = {
-        img_dict["id"]: img_dict["file_name"]
-        for img_dict in data_dict["images"]
-    }
-
-    map_category_id_to_category_data = {
-        cat_dict["id"]: (cat_dict["name"], cat_dict["supercategory"])
-        for cat_dict in data_dict["categories"]
-    }
-
-    # Build standard dataframe
-    list_rows = []
-    for annot_dict in data_dict["annotations"]:
-        annotation_id = annot_dict["id"]
-        # image data
-        image_id = annot_dict["image_id"]
-        image_filename = map_image_id_to_filename[image_id]
-
-        # bbox data
-        x_min, y_min, width, height = annot_dict["bbox"]
-
-        # class data
-        category_id = annot_dict["category_id"]
-        category, supercategory = map_category_id_to_category_data[category_id]
-
-        row = {
-            "annotation_id": annotation_id,
-            "image_filename": image_filename,
-            "image_id": image_id,
-            "x_min": x_min,
-            "y_min": y_min,
-            "width": width,
-            "height": height,
-            "supercategory": supercategory,
-            "category": category,
-        }
-
-        list_rows.append(row)
-
-    df = pd.DataFrame(list_rows)
-    df.reindex(columns=STANDARD_DF_COLUMNS)
-
-    return df

From 3af9b4e159865a641a0cfea56b7fded48ef4ec7c Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 16 Dec 2024 18:55:59 +0000
Subject: [PATCH 32/36] Add references to JSON schemas

---
 ethology/annotations/json_schemas.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/ethology/annotations/json_schemas.py b/ethology/annotations/json_schemas.py
index 79a6773..99d9387 100644
--- a/ethology/annotations/json_schemas.py
+++ b/ethology/annotations/json_schemas.py
@@ -1,4 +1,11 @@
-"""JSON schemas for VIA and COCO annotations."""
+"""JSON schemas for VIA and COCO annotations.
+
+References
+----------
+- https://github.com/python-jsonschema/jsonschema
+- https://json-schema.org/understanding-json-schema/
+
+"""
 
 VIA_SCHEMA = {
     "type": "object",

From 6d9cafff9a8a44f6a4cab06ad2975f269bb89713 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 16 Dec 2024 19:22:38 +0000
Subject: [PATCH 33/36] Fix duplicates

---
 pyproject.toml | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 0e4914f..3fb07d7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,17 +17,9 @@ classifiers = [
   "Programming Language :: Python :: 3.12",
   "Operating System :: OS Independent",
   "License :: OSI Approved :: BSD License",
-  "Development Status :: 2 - Pre-Alpha",
-  "Programming Language :: Python",
-  "Programming Language :: Python :: 3",
-  "Programming Language :: Python :: 3.10",
-  "Programming Language :: Python :: 3.11",
-  "Programming Language :: Python :: 3.12",
-  "Operating System :: OS Independent",
-  "License :: OSI Approved :: BSD License",
 ]
 dependencies = [
-  "movement"
+  "movement",
 ]
 
 [project.urls]

From d44be8bbc74a83afccf8c9f482debc2dd1d31404 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Tue, 17 Dec 2024 17:54:03 +0000
Subject: [PATCH 34/36] Add comments and references to schemas module

---
 ethology/annotations/json_schemas.py | 51 ++++++++++++++++++++--------
 1 file changed, 36 insertions(+), 15 deletions(-)

diff --git a/ethology/annotations/json_schemas.py b/ethology/annotations/json_schemas.py
index 99d9387..4cbdd5a 100644
--- a/ethology/annotations/json_schemas.py
+++ b/ethology/annotations/json_schemas.py
@@ -1,16 +1,28 @@
-"""JSON schemas for VIA and COCO annotations.
+"""JSON schemas for manual annotations files.
+
+We use JSON schemas to check the structure of a supported
+annotation file via validators.
+
+Note that the schema validation only checks the type of a key
+if that key is present. It does not check for the presence of
+the keys.
 
 References
 ----------
 - https://github.com/python-jsonschema/jsonschema
 - https://json-schema.org/understanding-json-schema/
+- https://cocodataset.org/#format-data
+- https://gitlab.com/vgg/via/-/blob/master/via-2.x.y/CodeDoc.md?ref_type=heads#description-of-via-project-json-file
 
 """
 
+# The VIA schema corresponds to the
+# format exported by VGG Image Annotator 2.x.y
+# for manual labels
 VIA_SCHEMA = {
     "type": "object",
     "properties": {
-        # settings for browser UI
+        # settings for the browser-based UI of VIA
         "_via_settings": {
             "type": "object",
             "properties": {
@@ -19,18 +31,20 @@
                 "project": {"type": "object"},
             },
         },
-        # annotation data
+        # annotations data per image
         "_via_img_metadata": {
             "type": "object",
             "additionalProperties": {
-                # "additionalProperties" to allow any key,
-                # see https://stackoverflow.com/a/69811612/24834957
+                # Each image under _via_img_metadata is indexed
+                # using a unique key: FILENAME-FILESIZE.
+                # We use "additionalProperties" to allow for any
+                # key name, see https://stackoverflow.com/a/69811612/24834957
                 "type": "object",
                 "properties": {
                     "filename": {"type": "string"},
                     "size": {"type": "integer"},
                     "regions": {
-                        "type": "array",  # a list of dicts
+                        "type": "array",  # 'regions' is a list of dicts
                         "items": {
                             "type": "object",
                             "properties": {
@@ -43,9 +57,7 @@
                                         "width": {"type": "integer"},
                                         "height": {"type": "integer"},
                                     },
-                                    "region_attributes": {
-                                        "type": "object"
-                                    },  # we just check it's a dict
+                                    "region_attributes": {"type": "object"},
                                 },
                             },
                         },
@@ -54,13 +66,15 @@
                 },
             },
         },
-        # ordered list of image keys
-        # - the position defines the image ID
+        # _via_image_id_list contains an ordered list of the
+        # image keys (each key is a unique FILENAME-FILESIZE string);
+        # the position of a key in the list defines the image ID
         "_via_image_id_list": {
             "type": "array",
             "items": {"type": "string"},
         },
-        # region (aka annotation) and file attributes for VIA UI
+        # region attributes and file attributes, to
+        # display in VIA's UI and to classify the data
         "_via_attributes": {
             "type": "object",
             "properties": {
@@ -68,11 +82,14 @@
                 "file": {"type": "object"},
             },
         },
-        # version of the VIA data format
+        # version of the VIA tool used
         "_via_data_format_version": {"type": "string"},
     },
 }
 
+# The COCO schema follows the COCO dataset
+# format for object detection
+# See https://cocodataset.org/#format-data
 COCO_SCHEMA = {
     "type": "object",
     "properties": {
@@ -97,15 +114,19 @@
             "items": {
                 "type": "object",
                 "properties": {
-                    "id": {"type": "integer"},  # annotation global ID
+                    "id": {"type": "integer"},
                     "image_id": {"type": "integer"},
                     "bbox": {
                         "type": "array",
                         "items": {"type": "integer"},
                     },
+                    # (box coordinates are measured from the
+                    # top left image corner and are 0-indexed)
                     "category_id": {"type": "integer"},
-                    "area": {"type": "integer"},
+                    "area": {"type": "number"},
+                    # float according to the official schema
                     "iscrowd": {"type": "integer"},
+                    # 0 or 1 according to the official schema
                 },
             },
         },

From 316779e2e3b7474d7736c1bdba7cf265eec84715 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Tue, 17 Dec 2024 18:01:23 +0000
Subject: [PATCH 35/36] Add comments to the validators

---
 ethology/annotations/validators.py | 49 ++++++++++++++++--------------
 1 file changed, 27 insertions(+), 22 deletions(-)

diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py
index f03bb55..888defc 100644
--- a/ethology/annotations/validators.py
+++ b/ethology/annotations/validators.py
@@ -11,14 +11,17 @@
 
 @define
 class ValidJSON:
-    """Class for validating JSON files.
+    """Class for valid JSON files.
+
+    It checks the JSON file exists, can be decoded, and optionally
+    validates the file against a JSON schema.
 
     Attributes
     ----------
     path : pathlib.Path
         Path to the JSON file.
 
-    schema : dict
+    schema : dict, optional
         JSON schema to validate the file against.
 
     Raises
@@ -26,9 +29,10 @@ class ValidJSON:
     FileNotFoundError
         If the file does not exist.
     ValueError
-        If the JSON file cannot be decoded, or
-        if the type of any of its keys does not match those
-        specified in the schema.
+        If the JSON file cannot be decoded.
+    jsonschema.exceptions.ValidationError
+        If the type of any of the keys in the JSON file
+        does not match the type specified in the schema.
 
 
     Notes
@@ -63,7 +67,8 @@ def _file_matches_JSON_schema(self, attribute, value):
         """Ensure that the JSON file matches the expected schema.
 
         The schema validation only checks the type for each specified
-        key if it exists. It does not check for the presence of the keys.
+        key if the key exists. It does not check for the presence of
+        the keys.
         """
         # read json file
         with open(value) as file:
@@ -80,14 +85,19 @@ def _file_matches_JSON_schema(self, attribute, value):
 
 @define
 class ValidVIAJSON:
-    """Class for validating VIA JSON files for untracked data.
+    """Class for valid VIA JSON files for untracked data.
 
-    Checks the VIA JSON file for untracked data contains the required keys.
+    It checks the input VIA JSON file contains the required keys.
 
     Attributes
     ----------
     path : pathlib.Path
-        Path to the JSON file.
+        Path to the VIA JSON file.
+
+    Raises
+    ------
+    ValueError
+        If the VIA JSON file is missing any of the required keys.
 
     """
 
@@ -95,7 +105,7 @@ class ValidVIAJSON:
 
     @path.validator
     def _file_contains_required_keys(self, attribute, value):
-        """Ensure that the JSON file contains the required keys."""
+        """Ensure that the VIA JSON file contains the required keys."""
         required_keys = {
             "main": ["_via_img_metadata", "_via_image_id_list"],
             "image_keys": ["filename", "regions"],
@@ -136,25 +146,19 @@ def _file_contains_required_keys(self, attribute, value):
 
 @define
 class ValidCOCOJSON:
-    """Class for validating COCO JSON files for untracked data.
+    """Class valid COCO JSON files for untracked data.
 
-    The validator ensures that the file matches the expected schema.
-    The schema validation only checks the type for each specified
-    key if it exists. It does not check for the presence of the keys.
+    It checks the input COCO JSON file contains the required keys.
 
     Attributes
     ----------
     path : pathlib.Path
-        Path to the JSON file.
+        Path to the COCO JSON file.
 
     Raises
     ------
     ValueError
-        If the JSON file does not match the expected schema.
-
-    Notes
-    -----
-    https://json-schema.org/understanding-json-schema/
+        If the COCO JSON file is missing any of the required keys.
 
     """
 
@@ -162,10 +166,10 @@ class ValidCOCOJSON:
 
     @path.validator
     def _file_contains_required_keys(self, attribute, value):
-        """Ensure that the JSON file contains the required keys."""
+        """Ensure that the COCO JSON file contains the required keys."""
         required_keys = {
             "main": ["images", "annotations", "categories"],
-            "image_keys": ["id", "file_name"],  # "height", "width"?
+            "image_keys": ["id", "file_name"],  # add "height" and "width"?
             "annotations_keys": ["id", "image_id", "bbox", "category_id"],
             "categories_keys": ["id", "name", "supercategory"],
         }
@@ -207,6 +211,7 @@ def _check_keys(
     data_dict: dict,
     additional_message: str = "",
 ):
+    """Check if the required keys are present in the input data_dict."""
     missing_keys = set(list_required_keys) - data_dict.keys()
     if missing_keys:
         raise ValueError(

From e9dbda40ed2e791c14db23f1fac0c50edb37e2d4 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Tue, 17 Dec 2024 18:57:42 +0000
Subject: [PATCH 36/36] Make ValidCOCOJSON and ValidVIAJSON inherit from
 ValidJSON

---
 ethology/annotations/validators.py | 32 ++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py
index 888defc..4ff1179 100644
--- a/ethology/annotations/validators.py
+++ b/ethology/annotations/validators.py
@@ -3,11 +3,14 @@
 import json
 from pathlib import Path
 
+import attrs
 import jsonschema
 import jsonschema.exceptions
 import jsonschema.validators
 from attrs import define, field, validators
 
+from ethology.annotations.json_schemas import COCO_SCHEMA, VIA_SCHEMA
+
 
 @define
 class ValidJSON:
@@ -84,7 +87,7 @@ def _file_matches_JSON_schema(self, attribute, value):
 
 
 @define
-class ValidVIAJSON:
+class ValidVIAJSON(ValidJSON):
     """Class for valid VIA JSON files for untracked data.
 
     It checks the input VIA JSON file contains the required keys.
@@ -94,6 +97,9 @@ class ValidVIAJSON:
     path : pathlib.Path
         Path to the VIA JSON file.
 
+    schema : dict, optional
+        JSON schema to validate the file against. Default is VIA_SCHEMA.
+
     Raises
     ------
     ValueError
@@ -101,8 +107,17 @@ class ValidVIAJSON:
 
     """
 
-    path: Path = field(validator=validators.instance_of(Path))
+    # run the parent's validators first
+    path: Path = field(validator=attrs.fields(ValidJSON).path.validator)
+    schema: dict = field(
+        validator=attrs.fields(ValidJSON).schema.validator,  # type: ignore
+        default=VIA_SCHEMA,
+    )
 
+    # TODO: add a validator to check the schema defines types
+    # for the required keys
+
+    # run additional validators
     @path.validator
     def _file_contains_required_keys(self, attribute, value):
         """Ensure that the VIA JSON file contains the required keys."""
@@ -145,7 +160,7 @@ def _file_contains_required_keys(self, attribute, value):
 
 
 @define
-class ValidCOCOJSON:
+class ValidCOCOJSON(ValidJSON):
     """Class valid COCO JSON files for untracked data.
 
     It checks the input COCO JSON file contains the required keys.
@@ -162,8 +177,17 @@ class ValidCOCOJSON:
 
     """
 
-    path: Path = field(validator=validators.instance_of(Path))
+    # run the parent's validators first
+    path: Path = field(validator=attrs.fields(ValidJSON).path.validator)
+    schema: dict = field(
+        validator=attrs.fields(ValidJSON).schema.validator,  # type: ignore
+        default=COCO_SCHEMA,
+    )
+
+    # TODO: add a validator to check the schema defines types
+    # for the required keys
 
+    # run additional validators
     @path.validator
     def _file_contains_required_keys(self, attribute, value):
         """Ensure that the COCO JSON file contains the required keys."""