Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Validate VIA and COCO files for untracked data #20

Draft
wants to merge 36 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
00c309e
add notebook to read COCO annotations as xarray
sfmig Dec 10, 2024
261e03c
Simplify
sfmig Dec 10, 2024
57dc2b9
Add movement as dependency, fix indentation
sfmig Dec 11, 2024
5f16511
Add io module for reading manual annotations
sfmig Dec 11, 2024
0945291
Add validators for manual annotation files
sfmig Dec 11, 2024
fd2e30c
Remove notebook for xarray
sfmig Dec 11, 2024
c69fea9
Fix schema for validation
sfmig Dec 11, 2024
0a1d64e
Add validation and loading for COCO style file
sfmig Dec 11, 2024
57c94ec
Keys check draft
sfmig Dec 11, 2024
1d19eb9
Improve keys check
sfmig Dec 11, 2024
97df63c
Add keys check for COCO untracked json validator
sfmig Dec 11, 2024
029bb8b
Remove some comments
sfmig Dec 11, 2024
23d5259
Delete utils
sfmig Dec 12, 2024
9fbc4f7
Add module for fixtures
sfmig Dec 12, 2024
d3809e0
Change JSON error to FileNotFound
sfmig Dec 12, 2024
5866d10
Add shared fixtures across all tests to conftest
sfmig Dec 12, 2024
8504a59
Add annotations_test_data fixture to its module
sfmig Dec 12, 2024
a22bfc4
Add test for JSON file validator
sfmig Dec 12, 2024
6edbfc4
Fix fresh download of files-registry
sfmig Dec 12, 2024
9976715
Fix tests to work with unzipped subdirectories
sfmig Dec 12, 2024
a4ffb44
Add test for JSON validator
sfmig Dec 16, 2024
b901658
Factor out schemas and include schema validation in ValidJSON
sfmig Dec 16, 2024
05696a7
Make schema optional
sfmig Dec 16, 2024
8223701
Add tests for schemas
sfmig Dec 16, 2024
92d6b74
Reduce error message check for schema validation
sfmig Dec 16, 2024
a6523e1
Add tests for keys check WIP
sfmig Dec 16, 2024
ab65d95
Add test for VIA JSON untracked validator
sfmig Dec 16, 2024
3e24186
Add test for valid_coco_untracked_json
sfmig Dec 16, 2024
e8fcb64
Add test for check_keys
sfmig Dec 16, 2024
06b47c1
Remove untracked from names
sfmig Dec 16, 2024
147eb11
Remove dataloader (now in separate PR)
sfmig Dec 16, 2024
3af9b4e
Add references to JSON schemas
sfmig Dec 16, 2024
6d9caff
Fix duplicates
sfmig Dec 16, 2024
d44be8b
Add comments and references to schemas module
sfmig Dec 17, 2024
316779e
Add comments to the validators
sfmig Dec 17, 2024
e9dbda4
Make ValidCOCOJSON and ValidVIAJSON inherit from ValidJSON
sfmig Dec 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ repos:
args: [--fix=lf]
- id: name-tests-test
args: ["--pytest-test-first"]
exclude: ^tests/fixtures
- id: requirements-txt-fixer
- id: trailing-whitespace
- repo: https://github.com/pre-commit/pygrep-hooks
Expand Down
145 changes: 145 additions & 0 deletions ethology/annotations/json_schemas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
"""JSON schemas for manual annotations files.

We use JSON schemas to check the structure of a supported
annotation file via validators.

Note that the schema validation only checks the type of a key
if that key is present. It does not check for the presence of
the keys.

References
----------
- https://github.com/python-jsonschema/jsonschema
- https://json-schema.org/understanding-json-schema/
- https://cocodataset.org/#format-data
- https://gitlab.com/vgg/via/-/blob/master/via-2.x.y/CodeDoc.md?ref_type=heads#description-of-via-project-json-file

"""

# The VIA schema corresponds to the
# format exported by VGG Image Annotator 2.x.y
# for manual labels
VIA_SCHEMA = {
    "type": "object",
    "properties": {
        # settings for the browser-based UI of VIA
        "_via_settings": {
            "type": "object",
            "properties": {
                "ui": {"type": "object"},
                "core": {"type": "object"},
                "project": {"type": "object"},
            },
        },
        # annotations data per image
        "_via_img_metadata": {
            "type": "object",
            "additionalProperties": {
                # Each image under _via_img_metadata is indexed
                # using a unique key: FILENAME-FILESIZE.
                # We use "additionalProperties" to allow for any
                # key name, see https://stackoverflow.com/a/69811612/24834957
                "type": "object",
                "properties": {
                    "filename": {"type": "string"},
                    "size": {"type": "integer"},
                    "regions": {
                        "type": "array",  # 'regions' is a list of dicts
                        "items": {
                            "type": "object",
                            "properties": {
                                "shape_attributes": {
                                    "type": "object",
                                    "properties": {
                                        "name": {"type": "string"},
                                        "x": {"type": "integer"},
                                        "y": {"type": "integer"},
                                        "width": {"type": "integer"},
                                        "height": {"type": "integer"},
                                    },
                                },
                                # "region_attributes" is a property of each
                                # region, NOT of "shape_attributes" — placing
                                # it inside the shape_attributes subschema
                                # would make jsonschema silently ignore it
                                # as an unknown keyword.
                                "region_attributes": {"type": "object"},
                            },
                        },
                    },
                    "file_attributes": {"type": "object"},
                },
            },
        },
        # _via_image_id_list contains an
        # ordered list of image keys using a unique key: FILENAME-FILESIZE,
        # the position in the list defines the image ID
        "_via_image_id_list": {
            "type": "array",
            "items": {"type": "string"},
        },
        # region attributes and file attributes, to
        # display in VIA's UI and to classify the data
        "_via_attributes": {
            "type": "object",
            "properties": {
                "region": {"type": "object"},
                "file": {"type": "object"},
            },
        },
        # version of the VIA tool used
        "_via_data_format_version": {"type": "string"},
    },
}

# The COCO schema follows the COCO dataset
# format for object detection
# See https://cocodataset.org/#format-data
COCO_SCHEMA = {
    "type": "object",
    "properties": {
        "info": {"type": "object"},
        "licenses": {
            "type": "array",
        },
        "images": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "file_name": {"type": "string"},
                    "id": {"type": "integer"},
                    "width": {"type": "integer"},
                    "height": {"type": "integer"},
                },
            },
        },
        "annotations": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "id": {"type": "integer"},
                    "image_id": {"type": "integer"},
                    "bbox": {
                        "type": "array",
                        # bbox coordinates are floats in the official COCO
                        # format; "number" also accepts integers, whereas
                        # "integer" would reject valid float-valued boxes
                        "items": {"type": "number"},
                    },
                    # (box coordinates are measured from the
                    # top left image corner and are 0-indexed)
                    "category_id": {"type": "integer"},
                    "area": {"type": "number"},
                    # float according to the official schema
                    "iscrowd": {"type": "integer"},
                    # 0 or 1 according to the official schema
                },
            },
        },
        "categories": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "id": {"type": "integer"},
                    "name": {"type": "string"},
                    "supercategory": {"type": "string"},
                },
            },
        },
    },
}
244 changes: 244 additions & 0 deletions ethology/annotations/validators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
"""Validators for annotation files."""

import json
from pathlib import Path

import attrs
import jsonschema
import jsonschema.exceptions
import jsonschema.validators
from attrs import define, field, validators

from ethology.annotations.json_schemas import COCO_SCHEMA, VIA_SCHEMA


@define
class ValidJSON:
    """Class for valid JSON files.

    It checks the JSON file exists, can be decoded, and optionally
    validates the file against a JSON schema.

    Attributes
    ----------
    path : pathlib.Path
        Path to the JSON file.

    schema : dict, optional
        JSON schema to validate the file against.

    Raises
    ------
    FileNotFoundError
        If the file does not exist.
    ValueError
        If the JSON file cannot be decoded.
    jsonschema.exceptions.ValidationError
        If the type of any of the keys in the JSON file
        does not match the type specified in the schema.


    Notes
    -----
    https://json-schema.org/understanding-json-schema/

    """

    # Required attributes
    path: Path = field(validator=validators.instance_of(Path))

    # Optional attributes
    schema: dict | None = field(default=None)

    @path.validator
    def _file_is_json(self, attribute, value):
        """Ensure that the file is a JSON file."""
        try:
            with open(value) as file:
                json.load(file)
        except FileNotFoundError as not_found_error:
            raise FileNotFoundError(
                f"File not found: {value}."
            ) from not_found_error
        except json.JSONDecodeError as decode_error:
            raise ValueError(
                f"Error decoding JSON data from file: {value}."
            ) from decode_error

    @path.validator
    def _file_matches_JSON_schema(self, attribute, value):
        """Ensure that the JSON file matches the expected schema.

        The schema validation only checks the type for each specified
        key if the key exists. It does not check for the presence of
        the keys.
        """
        # Skip the file read entirely when there is no schema to
        # check against (avoids decoding the file a second time
        # for nothing).
        if not self.schema:
            return

        # read json file
        with open(value) as file:
            data = json.load(file)

        # jsonschema.validate raises a ValidationError whose message
        # is quite informative, so we let it propagate as is (no need
        # to catch and re-raise it).
        jsonschema.validate(instance=data, schema=self.schema)


@define
class ValidVIAJSON(ValidJSON):
    """Class for valid VIA JSON files for untracked data.

    It checks the input VIA JSON file contains the required keys.

    Attributes
    ----------
    path : pathlib.Path
        Path to the VIA JSON file.

    schema : dict, optional
        JSON schema to validate the file against. Default is VIA_SCHEMA.

    Raises
    ------
    ValueError
        If the VIA JSON file misses any of the required keys.

    """

    # run the parent's validators first
    path: Path = field(validator=attrs.fields(ValidJSON).path.validator)
    schema: dict = field(
        validator=attrs.fields(ValidJSON).schema.validator,  # type: ignore
        default=VIA_SCHEMA,
    )

    # TODO: add a validator to check the schema defines types
    # for the required keys

    # run additional validators
    @path.validator
    def _file_contains_required_keys(self, attribute, value):
        """Ensure that the VIA JSON file contains the required keys."""
        # Keys expected at each nesting level of the VIA file
        main_keys = ["_via_img_metadata", "_via_image_id_list"]
        image_keys = ["filename", "regions"]
        region_keys = ["shape_attributes", "region_attributes"]
        shape_attributes_keys = ["x", "y", "width", "height"]

        # Load the annotations as a dictionary
        with open(value) as file:
            data = json.load(file)

        # Top-level keys
        _check_keys(main_keys, data)

        # Per-image and per-region keys
        for img_str, img_dict in data["_via_img_metadata"].items():
            _check_keys(
                image_keys,
                img_dict,
                additional_message=f" for {img_str}",
            )
            for i, region in enumerate(img_dict["regions"]):
                # Same suffix for the region-level and the
                # shape-attributes-level checks
                suffix = f" for region {i} under {img_str}"
                _check_keys(
                    region_keys,
                    region,
                    additional_message=suffix,
                )
                _check_keys(
                    shape_attributes_keys,
                    region["shape_attributes"],
                    additional_message=suffix,
                )


@define
class ValidCOCOJSON(ValidJSON):
    """Class for valid COCO JSON files for untracked data.

    It checks the input COCO JSON file contains the required keys.

    Attributes
    ----------
    path : pathlib.Path
        Path to the COCO JSON file.

    Raises
    ------
    ValueError
        If the COCO JSON file misses any of the required keys.

    """

    # run the parent's validators first
    path: Path = field(validator=attrs.fields(ValidJSON).path.validator)
    schema: dict = field(
        validator=attrs.fields(ValidJSON).schema.validator,  # type: ignore
        default=COCO_SCHEMA,
    )

    # TODO: add a validator to check the schema defines types
    # for the required keys

    # run additional validators
    @path.validator
    def _file_contains_required_keys(self, attribute, value):
        """Ensure that the COCO JSON file contains the required keys."""
        # Keys expected at each level of the COCO file
        main_keys = ["images", "annotations", "categories"]
        image_keys = ["id", "file_name"]  # add "height" and "width"?
        annotation_keys = ["id", "image_id", "bbox", "category_id"]
        category_keys = ["id", "name", "supercategory"]

        # Load the annotations as a dictionary
        with open(value) as file:
            data = json.load(file)

        # Top-level keys
        _check_keys(main_keys, data)

        # Keys in each dictionary under "images", "annotations"
        # and "categories"
        for img_dict in data["images"]:
            _check_keys(
                image_keys,
                img_dict,
                additional_message=f" for image dict {img_dict}",
            )
        for annot_dict in data["annotations"]:
            _check_keys(
                annotation_keys,
                annot_dict,
                additional_message=f" for annotation dict {annot_dict}",
            )
        for cat_dict in data["categories"]:
            _check_keys(
                category_keys,
                cat_dict,
                additional_message=f" for category dict {cat_dict}",
            )


def _check_keys(
list_required_keys: list[str],
data_dict: dict,
additional_message: str = "",
):
"""Check if the required keys are present in the input data_dict."""
missing_keys = set(list_required_keys) - data_dict.keys()
if missing_keys:
raise ValueError(
f"Required key(s) {sorted(missing_keys)} not "
f"found in {list(data_dict.keys())}{additional_message}."
)
Loading
Loading