Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Validate VIA and COCO files for untracked data #20

Draft
wants to merge 36 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
00c309e
add notebook to read COCO annotations as xarray
sfmig Dec 10, 2024
261e03c
Simplify
sfmig Dec 10, 2024
57dc2b9
Add movement as dependency, fix indentation
sfmig Dec 11, 2024
5f16511
Add io module for reading manual annotations
sfmig Dec 11, 2024
0945291
Add validators for manual annotation files
sfmig Dec 11, 2024
fd2e30c
Remove notebook for xarray
sfmig Dec 11, 2024
c69fea9
Fix schema for validation
sfmig Dec 11, 2024
0a1d64e
Add validation and loading for COCO style file
sfmig Dec 11, 2024
57c94ec
Keys check draft
sfmig Dec 11, 2024
1d19eb9
Improve keys check
sfmig Dec 11, 2024
97df63c
Add keys check for COCO untracked json validator
sfmig Dec 11, 2024
029bb8b
Remove some comments
sfmig Dec 11, 2024
23d5259
Delete utils
sfmig Dec 12, 2024
9fbc4f7
Add module for fixtures
sfmig Dec 12, 2024
d3809e0
Change JSON error to FileNotFound
sfmig Dec 12, 2024
5866d10
Add shared fixtures across all tests to conftest
sfmig Dec 12, 2024
8504a59
Add annotations_test_data fixture to its module
sfmig Dec 12, 2024
a22bfc4
Add test for JSON file validator
sfmig Dec 12, 2024
6edbfc4
Fix fresh download of files-registry
sfmig Dec 12, 2024
9976715
Fix tests to work with unzipped subdirectories
sfmig Dec 12, 2024
a4ffb44
Add test for JSON validator
sfmig Dec 16, 2024
b901658
Factor out schemas and include schema validation in ValidJSON
sfmig Dec 16, 2024
05696a7
Make schema optional
sfmig Dec 16, 2024
8223701
Add tests for schemas
sfmig Dec 16, 2024
92d6b74
Reduce error message check for schema validation
sfmig Dec 16, 2024
a6523e1
Add tests for keys check WIP
sfmig Dec 16, 2024
ab65d95
Add test for VIA JSON untracked validator
sfmig Dec 16, 2024
3e24186
Add test for valid_coco_untracked_json
sfmig Dec 16, 2024
e8fcb64
Add test for check_keys
sfmig Dec 16, 2024
06b47c1
Remove untracked from names
sfmig Dec 16, 2024
147eb11
Remove dataloader (now in separate PR)
sfmig Dec 16, 2024
3af9b4e
Add references to JSON schemas
sfmig Dec 16, 2024
6d9caff
Fix duplicates
sfmig Dec 16, 2024
d44be8b
Add comments and references to schemas module
sfmig Dec 17, 2024
316779e
Add comments to the validators
sfmig Dec 17, 2024
e9dbda4
Make ValidCOCOJSON and ValidVIAJSON inherit from ValidJSON
sfmig Dec 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add validation and loading for COCO style file
  • Loading branch information
sfmig committed Dec 16, 2024
commit 0a1d64e6c3464a5c73d8c13adc0ce40465f8ce70
119 changes: 95 additions & 24 deletions ethology/annotations/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@
import pandas as pd
from movement.validators.files import ValidFile

from ethology.annotations.validators import ValidJSON, ValidVIAUntrackedJSON
from ethology.annotations.validators import (
ValidCOCOUntrackedJSON,
ValidJSON,
ValidVIAUntrackedJSON,
)

STANDARD_DF_COLUMNS = [
"annotation_id",
Expand All @@ -16,15 +20,15 @@
"y_min",
"width",
"height",
"superclass",
"class",
"supercategory",
"category",
]


def df_from_via_json_file(file_path: Path):
def df_from_via_json_file(file_path: Path) -> pd.DataFrame:
"""Validate and read untracked VIA JSON file.

The data is formated as an untracked annotations DataFrame.
The data is formatted as an untracked annotations DataFrame.
"""
# General file validation
file = ValidFile(
Expand All @@ -41,13 +45,30 @@ def df_from_via_json_file(file_path: Path):
return _df_from_validated_via_json_file(via_untracked_file.path)


def df_from_coco_json_file(file_path: Path) -> pd.DataFrame:
    """Validate and read COCO JSON file.

    Runs three validation stages (generic file checks, JSON decoding,
    COCO schema) and returns the annotations as a standard DataFrame.
    """
    # Stage 1: the file exists, is readable and has a .json suffix
    general_check = ValidFile(
        file_path, expected_permission="r", expected_suffix=[".json"]
    )

    # Stage 2: the file contains decodable JSON
    json_check = ValidJSON(general_check.path)

    # Stage 3: the JSON content matches the COCO untracked schema
    schema_check = ValidCOCOUntrackedJSON(json_check.path)

    # Convert the fully validated file into the standard dataframe
    return _df_from_validated_coco_json_file(schema_check.path)


def _df_from_validated_via_json_file(file_path):
"""Read VIA JSON file as standard untracked annotations DataFrame."""
# Read validated json as dict
with open(file_path) as file:
data_dict = json.load(file)

# Get relevant fields
# Prepare data
image_metadata_dict = data_dict["_via_img_metadata"]
via_image_id_list = data_dict[
"_via_image_id_list"
Expand All @@ -68,28 +89,26 @@ def _df_from_validated_via_json_file(file_path):
region_shape = region["shape_attributes"]
region_attributes = region["region_attributes"]

row = {
"image_filename": img_dict["filename"],
"x_min": region_shape["x"],
"y_min": region_shape["y"],
"width": region_shape["width"],
"height": region_shape["height"],
"supercategory": list(region_attributes.keys())[
0
], # takes first key as supercategory
"category": region_attributes[
list(region_attributes.keys())[0]
],
}

# append annotations to df
list_rows.append(
{
"image_filename": img_dict["filename"],
"x_min": region_shape["x"],
"y_min": region_shape["y"],
"width": region_shape["width"],
"height": region_shape["height"],
"superclass": list(region_attributes.keys())[
0
], # takes first key as superclass
"class": region_attributes[
list(region_attributes.keys())[0]
],
},
)
list_rows.append(row)

df = pd.DataFrame(
list_rows,
columns=[
col for col in STANDARD_DF_COLUMNS if not col.endswith("_id")
],
# columns=list(row.keys()), # do I need this?
)

# add image_id column
Expand All @@ -104,3 +123,55 @@ def _df_from_validated_via_json_file(file_path):
df = df.reindex(columns=STANDARD_DF_COLUMNS)

return df


def _df_from_validated_coco_json_file(file_path: Path) -> pd.DataFrame:
    """Read COCO JSON file as standard untracked annotations DataFrame.

    Parameters
    ----------
    file_path : pathlib.Path
        Path to a COCO JSON file that has already been validated.

    Returns
    -------
    pd.DataFrame
        One row per annotation, with columns ordered as in
        ``STANDARD_DF_COLUMNS``.

    """
    # Read validated json as dict
    with open(file_path) as file:
        data_dict = json.load(file)

    # Map each image ID to its filename
    map_image_id_to_filename = {
        img_dict["id"]: img_dict["file_name"]
        for img_dict in data_dict["images"]
    }

    # Map each category ID to its (name, supercategory) pair
    map_category_id_to_category_data = {
        cat_dict["id"]: (cat_dict["name"], cat_dict["supercategory"])
        for cat_dict in data_dict["categories"]
    }

    # Build one row per annotation
    list_rows = []
    for annot_dict in data_dict["annotations"]:
        image_id = annot_dict["image_id"]

        # bbox data: COCO stores boxes as [x_min, y_min, width, height]
        x_min, y_min, width, height = annot_dict["bbox"]

        # class data
        category, supercategory = map_category_id_to_category_data[
            annot_dict["category_id"]
        ]

        list_rows.append(
            {
                "annotation_id": annot_dict["id"],
                "image_filename": map_image_id_to_filename[image_id],
                "image_id": image_id,
                "x_min": x_min,
                "y_min": y_min,
                "width": width,
                "height": height,
                "supercategory": supercategory,
                "category": category,
            }
        )

    df = pd.DataFrame(list_rows)

    # Bug fix: DataFrame.reindex returns a new frame rather than
    # modifying in place, so the result must be assigned/returned
    # (previously the reindexed frame was discarded and the column
    # order was never applied).
    return df.reindex(columns=STANDARD_DF_COLUMNS)
118 changes: 108 additions & 10 deletions ethology/annotations/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,9 @@ class ValidVIAUntrackedJSON:
"""Class for validating VIA JSON files for untracked data.

The validator ensures that the file matches the expected schema.
The schema validation only checks the type for each specified
key if it exists. It does not check for the presence of the keys.

https://json-schema.org/understanding-json-schema/reference/object#additional-properties

Attributes
----------
Expand All @@ -59,10 +60,16 @@ class ValidVIAUntrackedJSON:
ValueError
If the JSON file does not match the expected schema.

Notes
-----
https://json-schema.org/understanding-json-schema/

"""

# TODO: add a check for the presence of the keys
# that I use in loading the data

path: Path = field(validator=validators.instance_of(Path))
# expected_schema: dict = field(factory=dict, kw_only=True)

@path.validator
def _file_macthes_VIA_JSON_schema(self, attribute, value):
Expand Down Expand Up @@ -135,7 +142,8 @@ def _file_macthes_VIA_JSON_schema(self, attribute, value):
},
}

# should have been validated with ValidVIAUntrackedJSON
# should have been validated with ValidJSON
# already so this should work fine
with open(value) as file:
data = json.load(file)

Expand All @@ -147,12 +155,102 @@ def _file_macthes_VIA_JSON_schema(self, attribute, value):
"The JSON data does not match "
f"the provided schema: {VIA_JSON_schema}"
) from val_err
# except jsonschema.exceptions.SchemaError as schema_err:
# raise ValueError(
# f"Invalid schema provided: {VIA_JSON_schema}"
# ) from schema_err


# @define
# class ValidCOCOUntrackedJSON:
# pass
@define
class ValidCOCOUntrackedJSON:
    """Class for validating COCO JSON files for untracked data.

    The validator ensures that the file matches the expected schema.
    The schema validation only checks the type for each specified
    key if it exists. It does not check for the presence of the keys.

    Attributes
    ----------
    path : pathlib.Path
        Path to the JSON file.

    Raises
    ------
    ValueError
        If the JSON file does not match the expected schema.

    Notes
    -----
    https://json-schema.org/understanding-json-schema/

    """

    path: Path = field(validator=validators.instance_of(Path))

    # TODO: add a check for the presence of the keys
    # that I use in loading the data

    @path.validator
    def _file_matches_COCO_JSON_schema(self, attribute, value):
        """Ensure that the JSON file matches the expected schema."""
        # Define schema for COCO JSON file for untracked
        # (aka manually labelled) data
        COCO_JSON_schema = {
            "type": "object",
            "properties": {
                "info": {"type": "object"},
                "licenses": {
                    "type": "array",
                },
                "images": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "file_name": {"type": "string"},
                            "id": {"type": "integer"},
                            "width": {"type": "integer"},
                            "height": {"type": "integer"},
                        },
                    },
                },
                "annotations": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "id": {"type": "integer"},  # annotation global ID
                            "image_id": {"type": "integer"},
                            "bbox": {
                                "type": "array",
                                # COCO bbox values are [x, y, width, height]
                                # and may be floats; "number" accepts both
                                # integers and floats
                                "items": {"type": "number"},
                            },
                            "category_id": {"type": "integer"},
                            # area is width*height and may be a float
                            "area": {"type": "number"},
                            "iscrowd": {"type": "integer"},
                        },
                    },
                },
                "categories": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "id": {"type": "integer"},
                            "name": {"type": "string"},
                            "supercategory": {"type": "string"},
                        },
                    },
                },
            },
        }

        # should have been validated with ValidJSON
        # already so this should work fine
        with open(value) as file:
            data = json.load(file)

        # check against schema
        try:
            jsonschema.validate(instance=data, schema=COCO_JSON_schema)
        except jsonschema.exceptions.ValidationError as val_err:
            raise ValueError(
                "The JSON data does not match "
                f"the provided schema: {COCO_JSON_schema}"
            ) from val_err