From 00c309eb08a507e7f02fae72e151c62d90a59590 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Tue, 10 Dec 2024 20:13:21 +0000 Subject: [PATCH 01/36] add notebook to read COCO annotations as xarray --- .../annotations/notebook_annots_as_xarray.py | 231 ++++++++++++++++++ ethology/annotations/utils.py | 72 ++++++ pyproject.toml | 3 + 3 files changed, 306 insertions(+) create mode 100644 ethology/annotations/notebook_annots_as_xarray.py create mode 100644 ethology/annotations/utils.py diff --git a/ethology/annotations/notebook_annots_as_xarray.py b/ethology/annotations/notebook_annots_as_xarray.py new file mode 100644 index 0000000..bdcb59f --- /dev/null +++ b/ethology/annotations/notebook_annots_as_xarray.py @@ -0,0 +1,231 @@ +# %% + +import numpy as np +import xarray as xr +from utils import read_json_file_as_dict + +# %%%%%%%%%%%%%%%%%%% +# input data +via_file_path = ( + "/home/sminano/swc/project_ethology/sample_VIA_annotations/VIA_JSON_1.json" +) +coco_file_path = ( + "/home/sminano/swc/project_ethology/sample_COCO_annotations/sample_annotations_1.json" +) + +# via_data = read_via_json_file_as_dict(via_file_path) +# print(via_data.keys()) # _via_img_metadata, _via_image_id_list + +# %%%%%%%%%%%%%%%%%%%% +# read as dict +coco_data = read_json_file_as_dict(coco_file_path) + +print( + coco_data.keys() +) # dict_keys(['annotations', 'categories', 'images', 'info', 'licenses']) +# %%%%%%%%%%%%%%%%%%%% + + +def compute_homog_data_array_per_image_id(data_str, axis_image_id_in_output): + # pair up data with image id + pair_data = [] + for annot in coco_data["annotations"]: + if isinstance(annot[data_str], list): + pair_data.append(annot[data_str] + [annot["image_id"]]) + else: + pair_data.append([annot[data_str], annot["image_id"]]) + + data_and_image_id_array = np.array(pair_data) + + # split + data_array_per_image_id = np.split( + data_and_image_id_array[:, : data_and_image_id_array.shape[1] - 1], + np.where(np.diff(data_and_image_id_array[:, -1]))[0] + 1, + axis=0, + ) + + # pad + max_bboxes_per_image = max([d.shape[0] for d in data_array_per_image_id]) + data_array_per_image_id_with_nans = np.stack( + [ + np.concat( + ( + d, + np.full( + (max_bboxes_per_image - d.shape[0], d.shape[1]), np.nan + ), + ) + ).squeeze() + for d in data_array_per_image_id + ], + axis=axis_image_id_in_output, # 1, -1 + ) # annotation_image_id, image_id, space + + return data_array_per_image_id_with_nans + + +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Format bboxes data as xarray DataArray + +# # get bboxes coordinates per image +# bbox_and_image_id_array = np.array( +# [annot["bbox"] + [annot["image_id"]] +# for annot in coco_data["annotations"]] +# ) +# bbox_array_per_image_id = np.split( +# bbox_and_image_id_array[:, :4], +# np.where(np.diff(bbox_and_image_id_array[:, -1]))[0] + 1, +# axis=0, +# ) + +# # pad missing annnotation-image-ids with np.nan +# max_bboxes_per_image = max([d.shape[0] for d in bbox_array_per_image_id]) +# bbox_array_per_image_id_with_nans = np.stack( +# [ +# np.concat( +# ( +# d, +# np.full( +# (max_bboxes_per_image - d.shape[0], d.shape[1]), np.nan +# ), +# ) +# ).squeeze() +# for d in bbox_array_per_image_id +# ], +# axis=1 +# ) #annotation_image_id, image_id, space + +# define bboxes data array +bboxes_data = compute_homog_data_array_per_image_id( + "bbox", axis_image_id_in_output=1 +) +bboxes_da = xr.DataArray.from_dict( + { + "dims": [ + "annotation_image_id", + "image_id", + "space", + ], + "data": bboxes_data, + "coords": { + "annotation_image_id": { + "dims": "annotation_image_id", + "data": list(range(bboxes_data.shape[0])), # --------- + }, + "image_id": { + "dims": "image_id", + "data": np.unique( + [annot["image_id"] for annot in coco_data["annotations"]] + ), + }, + "space": { + "dims": "space", + "data": ["x", "y", "width", "height"], + }, + }, + # "attrs": {"title": "air temperature"}, + "name": "bbox", + } +) + +# %%%%%%%%%%%%%%%%%%%% +# Format annotation ID as xarray DataArray + +# # get data +# annot_and_image_id_array = np.array( +# [ +# [annot["id"]] + [annot["image_id"]] +# for annot in coco_data["annotations"] +# ], +# dtype=int, +# ) + +# # split based on image id +# annot_array_per_image_id = np.split( +# annot_and_image_id_array[:, 0].reshape(-1, 1), +# np.where(np.diff(annot_and_image_id_array[:, -1]))[0] + 1, +# axis=0, +# ) + +# # pad missing annnotation-image-ids with np.nan +# # max_bboxes_per_image = max([d.shape[0] for d in annot_array_per_image_id]) +# annot_array_per_image_id_with_nans = np.stack( +# [ +# np.concat( +# ( +# d, +# np.full( +# (max_bboxes_per_image - d.shape[0], d.shape[1]), np.nan +# ), +# ) +# ).squeeze() +# for d in annot_array_per_image_id +# ], +# axis=-1, +# # dtype=int +# ) # annotation_image_id, image_id + + +# define annot ID data array +annot_ID_data = compute_homog_data_array_per_image_id( + "id", axis_image_id_in_output=-1 +) +annotation_id_da = xr.DataArray.from_dict( + { + "dims": [ + "annotation_image_id", + "image_id", + ], + "data": annot_ID_data, + "coords": { + "annotation_image_id": { + "dims": "annotation_image_id", + "data": list(range(annot_ID_data.shape[0])), # --------- + }, + "image_id": { + "dims": "image_id", + "data": np.unique( + [annot["image_id"] for annot in coco_data["annotations"]] + ), + }, + }, + "attrs": {"title": "annotations ID per dataset"}, + "name": "bbox", + } +) + + +# %% +ds = xr.Dataset( + data_vars=dict( + bbox=(["annotation_image_id", "image_id", "space"], bboxes_da.data), + global_id=( + ["annotation_image_id", "image_id"], + annotation_id_da.data, + ), + # category=(["annotation_id", "category_id"], category_da), + ), + coords=dict( + annotation_image_id=bboxes_da.coords["annotation_image_id"], + image_id=bboxes_da.coords["image_id"], + space=bboxes_da.coords["space"], + # category_id=category_da.coords["category_id"], + ), + # attrs=dict(description="Weather related data."), +) + +# %%%%%%%%%%%%%%%%%%%% +# Inspect the dataset + +print(ds) + +# get all annotations in image 4 +ds.bbox.sel(image_id=4) + + +# get the bbox coordinates of the annotation with global ID = 2 +# a.where(a.x + a.y < 4) +ds.bbox.where(ds.global_id == 2, drop=True) + +# get the global ID of the third annotation per image +ds.global_id.sel(annotation_image_id=3) diff --git a/ethology/annotations/utils.py b/ethology/annotations/utils.py new file mode 100644 index 0000000..925489a --- /dev/null +++ b/ethology/annotations/utils.py @@ -0,0 +1,72 @@ +"""Utility functions to work with annotations in JSON format.""" + +import json +from pathlib import Path + + +def read_json_file_as_dict( + file_path: Path, +) -> dict: + """Read JSON file as dict. + + Parameters + ---------- + file_path : str + Path to the JSON file + + Returns + ------- + dict + Dictionary with the JSON data + + """ + try: + with open(file_path) as file: + return json.load(file) + except FileNotFoundError as not_found_error: + msg = f"File not found: {file_path}" + raise ValueError(msg) from not_found_error + except json.JSONDecodeError as decode_error: + msg = f"Error decoding JSON data from file: {file_path}" + raise ValueError(msg) from decode_error + + +def read_via_json_file_as_dict(file_path: Path) -> dict: + """Read VIA JSON file as dict. + + Parameters + ---------- + file_path : str + Path to the VIA JSON file + + Returns + ------- + dict + Dictionary with the JSON data + + """ + # Read data + data = read_json_file_as_dict(file_path) + + # Check the expected keys are defined in the JSON file + expected_keys = [ + "_via_settings", + "_via_img_metadata", + "_via_attributes", + "_via_data_format_version", + "_via_image_id_list", + ] + + for ky in expected_keys: + if ky not in data: + raise ValueError( + f"Expected key '{ky}' not found in file: {file_path}" + ) + + return data + + +# def read_via_json_file_as_xarray(file_path: Path): + + +# via_dict = read_via_json_file_as_dict(file_path) diff --git a/pyproject.toml b/pyproject.toml index 3fb07d7..88443ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,9 @@ classifiers = [ dependencies = [ "movement", ] +dependencies = [ + "xarray", # "xarray[accel,viz]", +] [project.urls] "Homepage" = "https://github.com/neuroinformatics-unit/ethology" From 261e03c91cb9eae22833ce4840fc87450684dcfb Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Tue, 10 Dec 2024 20:25:38 +0000 Subject: [PATCH 02/36] Simplify --- .../annotations/notebook_annots_as_xarray.py | 168 ++++-------------- 1 file changed, 35 insertions(+), 133 deletions(-) diff --git a/ethology/annotations/notebook_annots_as_xarray.py b/ethology/annotations/notebook_annots_as_xarray.py index bdcb59f..08e0a5c 100644 --- a/ethology/annotations/notebook_annots_as_xarray.py +++ b/ethology/annotations/notebook_annots_as_xarray.py @@ -1,4 +1,22 @@ -# %% +"""Explore formatting COCO annotations as an xarray Dataset. + +The dataset is made up from the following data variables: +- bbox: a 3D array with bounding box coordinates and shape + (max_n_bboxes_per_image, n_images, 4). + The four coordinates represent (x, y, h, w) per annotation. +- global_id: a 2D array of shape (max_n_bboxes_per_image, n_images) with + the global ID of each annotation. + +To add: +- category: a 2D array of shape (max_n_bboxes_per_image, n_images) with + the category ID / str of each annotation. +- split bbox into position and shape. +- keep track of image filename? + +""" + +# %%%%%%%%%%%%%%%%%%%% +# imports import numpy as np import xarray as xr @@ -17,15 +35,16 @@ # print(via_data.keys()) # _via_img_metadata, _via_image_id_list # %%%%%%%%%%%%%%%%%%%% -# read as dict +# read input json as dict coco_data = read_json_file_as_dict(coco_file_path) print( coco_data.keys() ) # dict_keys(['annotations', 'categories', 'images', 'info', 'licenses']) -# %%%%%%%%%%%%%%%%%%%% +# %%%%%%%%%%%%%%%%%%%% +# helper fn to format data as homogeneous arrays def compute_homog_data_array_per_image_id(data_str, axis_image_id_in_output): # pair up data with image id pair_data = [] @@ -44,7 +63,7 @@ def compute_homog_data_array_per_image_id(data_str, axis_image_id_in_output): axis=0, ) - # pad + # pad missing annotation-image IDs max_bboxes_per_image = max([d.shape[0] for d in data_array_per_image_id]) data_array_per_image_id_with_nans = np.stack( [ @@ -65,153 +84,35 @@ def compute_homog_data_array_per_image_id(data_str, axis_image_id_in_output): # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# Format bboxes data as xarray DataArray - -# # get bboxes coordinates per image -# bbox_and_image_id_array = np.array( -# [annot["bbox"] + [annot["image_id"]] -# for annot in coco_data["annotations"]] -# ) -# bbox_array_per_image_id = np.split( -# bbox_and_image_id_array[:, :4], -# np.where(np.diff(bbox_and_image_id_array[:, -1]))[0] + 1, -# axis=0, -# ) - -# # pad missing annnotation-image-ids with np.nan -# max_bboxes_per_image = max([d.shape[0] for d in bbox_array_per_image_id]) -# bbox_array_per_image_id_with_nans = np.stack( -# [ -# np.concat( -# ( -# d, -# np.full( -# (max_bboxes_per_image - d.shape[0], d.shape[1]), np.nan -# ), -# ) -# ).squeeze() -# for d in bbox_array_per_image_id -# ], -# axis=1 -# ) #annotation_image_id, image_id, space +# Format data # define bboxes data array bboxes_data = compute_homog_data_array_per_image_id( "bbox", axis_image_id_in_output=1 ) -bboxes_da = xr.DataArray.from_dict( - { - "dims": [ - "annotation_image_id", - "image_id", - "space", - ], - "data": bboxes_data, - "coords": { - "annotation_image_id": { - "dims": "annotation_image_id", - "data": list(range(bboxes_data.shape[0])), # --------- - }, - "image_id": { - "dims": "image_id", - "data": np.unique( - [annot["image_id"] for annot in coco_data["annotations"]] - ), - }, - "space": { - "dims": "space", - "data": ["x", "y", "width", "height"], - }, - }, - # "attrs": {"title": "air temperature"}, - "name": "bbox", - } -) - -# %%%%%%%%%%%%%%%%%%%% -# Format annotation ID as xarray DataArray - -# # get data -# annot_and_image_id_array = np.array( -# [ -# [annot["id"]] + [annot["image_id"]] -# for annot in coco_data["annotations"] -# ], -# dtype=int, -# ) - -# # split based on image id -# annot_array_per_image_id = np.split( -# annot_and_image_id_array[:, 0].reshape(-1, 1), -# np.where(np.diff(annot_and_image_id_array[:, -1]))[0] + 1, -# axis=0, -# ) - -# # pad missing annnotation-image-ids with np.nan -# # max_bboxes_per_image = max([d.shape[0] for d in annot_array_per_image_id]) -# annot_array_per_image_id_with_nans = np.stack( -# [ -# np.concat( -# ( -# d, -# np.full( -# (max_bboxes_per_image - d.shape[0], d.shape[1]), np.nan -# ), -# ) -# ).squeeze() -# for d in annot_array_per_image_id -# ], -# axis=-1, -# # dtype=int -# ) # annotation_image_id, image_id - # define annot ID data array annot_ID_data = compute_homog_data_array_per_image_id( "id", axis_image_id_in_output=-1 ) -annotation_id_da = xr.DataArray.from_dict( - { - "dims": [ - "annotation_image_id", - "image_id", - ], - "data": annot_ID_data, - "coords": { - "annotation_image_id": { - "dims": "annotation_image_id", - "data": list(range(annot_ID_data.shape[0])), # --------- - }, - "image_id": { - "dims": "image_id", - "data": np.unique( - [annot["image_id"] for annot in coco_data["annotations"]] - ), - }, - }, - "attrs": {"title": "annotations ID per dataset"}, - "name": "bbox", - } -) - -# %% +# %%%%%%%%%%%%%%%%%%%% +# Create xarray Dataset ds = xr.Dataset( data_vars=dict( - bbox=(["annotation_image_id", "image_id", "space"], bboxes_da.data), + bbox=(["annotation_image_id", "image_id", "space"], bboxes_data), global_id=( ["annotation_image_id", "image_id"], - annotation_id_da.data, + annot_ID_data, ), - # category=(["annotation_id", "category_id"], category_da), ), coords=dict( - annotation_image_id=bboxes_da.coords["annotation_image_id"], - image_id=bboxes_da.coords["image_id"], - space=bboxes_da.coords["space"], - # category_id=category_da.coords["category_id"], + annotation_image_id=list(range(bboxes_data.shape[0])), + image_id=np.unique( + [annot["image_id"] for annot in coco_data["annotations"]] + ), + space=["x", "y", "width", "height"], ), - # attrs=dict(description="Weather related data."), ) # %%%%%%%%%%%%%%%%%%%% @@ -224,8 +125,9 @@ def compute_homog_data_array_per_image_id(data_str, axis_image_id_in_output): # get the bbox coordinates of the annotation with global ID = 2 -# a.where(a.x + a.y < 4) ds.bbox.where(ds.global_id == 2, drop=True) # get the global ID of the third annotation per image ds.global_id.sel(annotation_image_id=3) + +# %% From 57dc2b9e1d3e3d8b3dac487e4fc00eec5e2e4eb1 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Wed, 11 Dec 2024 13:20:50 +0000 Subject: [PATCH 03/36] Add movement as dependency, fix indentation --- pyproject.toml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 88443ce..0e4914f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,12 +17,17 @@ classifiers = [ "Programming Language :: Python :: 3.12", "Operating System :: OS Independent", "License :: OSI Approved :: BSD License", + "Development Status :: 2 - Pre-Alpha", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Operating System :: OS Independent", + "License :: OSI Approved :: BSD License", ] dependencies = [ - "movement", -] -dependencies = [ - "xarray", # "xarray[accel,viz]", + "movement" ] [project.urls] From 5f16511a451badd55acb942ecf9e9622241e2713 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Wed, 11 Dec 2024 13:21:16 +0000 Subject: [PATCH 04/36] Add io module for reading manual annotations --- ethology/annotations/io.py | 106 +++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 ethology/annotations/io.py diff --git a/ethology/annotations/io.py b/ethology/annotations/io.py new file mode 100644 index 0000000..09339ba --- /dev/null +++ b/ethology/annotations/io.py @@ -0,0 +1,106 @@ +"""Module for reading and writing manually labelled annotations.""" + +import json +from pathlib import Path + +import pandas as pd +from movement.validators.files import ValidFile + +from ethology.annotations.validators import ValidJSON, ValidVIAUntrackedJSON + +STANDARD_DF_COLUMNS = [ + "annotation_id", + "image_filename", + "image_id", + "x_min", + "y_min", + "width", + "height", + "superclass", + "class", +] + + +def df_from_via_json_file(file_path: Path): + """Validate and read untracked VIA JSON file. + + The data is formated as an untracked annotations DataFrame. + """ + # General file validation + file = ValidFile( + file_path, expected_permission="r", expected_suffix=[".json"] + ) + + # JSON file validation + json_file = ValidJSON(file.path) + + # VIA Untracked JSON schema validation + via_untracked_file = ValidVIAUntrackedJSON(json_file.path) + + # Read as standard dataframe + return _df_from_validated_via_json_file(via_untracked_file.path) + + +def _df_from_validated_via_json_file(file_path): + """Read VIA JSON file as standard untracked annotations DataFrame.""" + # Read validated json as dict + with open(file_path) as file: + data_dict = json.load(file) + + # Get relevant fields + image_metadata_dict = data_dict["_via_img_metadata"] + via_image_id_list = data_dict[ + "_via_image_id_list" + ] # ordered list of the keys in image_metadata_dict + + # map filename to keys in image_metadata_dict + map_filename_to_via_img_id = { + img_dict["filename"]: ky + for ky, img_dict in image_metadata_dict.items() + } + + # Build standard dataframe + list_rows = [] + # loop thru images + for _, img_dict in image_metadata_dict.items(): + # loop thru annotations in the image + for region in img_dict["regions"]: + region_shape = region["shape_attributes"] + region_attributes = region["region_attributes"] + + # append annotations to df + list_rows.append( + { + "image_filename": img_dict["filename"], + "x_min": region_shape["x"], + "y_min": region_shape["y"], + "width": region_shape["width"], + "height": region_shape["height"], + "superclass": list(region_attributes.keys())[ + 0 + ], # takes first key as superclass + "class": region_attributes[ + list(region_attributes.keys())[0] + ], + }, + ) + + df = pd.DataFrame( + list_rows, + columns=[ + col for col in STANDARD_DF_COLUMNS if not col.endswith("_id") + ], + ) + + # add image_id column + df["image_id"] = df["image_filename"].apply( + lambda x: via_image_id_list.index(map_filename_to_via_img_id[x]) + ) + + # add annotation_id column based on index + df["annotation_id"] = df.index + + # reorder columns to match standard + df = df.reindex(columns=STANDARD_DF_COLUMNS) + + return df From 0945291a26afad9a0e154945d88f6993f80ba9bf Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Wed, 11 Dec 2024 13:21:41 +0000 Subject: [PATCH 05/36] Add validators for manual annotation files --- ethology/annotations/validators.py | 144 +++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 ethology/annotations/validators.py diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py new file mode 100644 index 0000000..a73f9a6 --- /dev/null +++ b/ethology/annotations/validators.py @@ -0,0 +1,144 @@ +import json +from pathlib import Path + +import jsonschema +import jsonschema.exceptions +from attrs import define, field, validators + +# + + +@define +class ValidJSON: + """Class for validating JSON files. + + Attributes + ---------- + path : pathlib.Path + Path to the JSON file. + + Raises + ------ + ValueError + If the file is not in JSON format or if it does not contain the + expected keys. + + """ + + path: Path = field(validator=validators.instance_of(Path)) + + @path.validator + def _file_is_json(self, attribute, value): + """Ensure that the file is a JSON file.""" + try: + with open(value) as file: + json.load(file) + except FileNotFoundError as not_found_error: + raise ValueError(f"File not found: {value}") from not_found_error + except json.JSONDecodeError as decode_error: + raise ValueError( + f"Error decoding JSON data from file: {value}" + ) from decode_error + + +@define +class ValidVIAUntrackedJSON: + """Class for validating VIA JSON files for untracked data. + + The validator ensures that the file matches the expected schema. + + https://json-schema.org/understanding-json-schema/reference/object#additional-properties + + Attributes + ---------- + path : pathlib.Path + Path to the JSON file. + + Raises + ------ + ValueError + If the JSON file does not match the expected schema. + + """ + + path: Path = field(validator=validators.instance_of(Path)) + # expected_schema: dict = field(factory=dict, kw_only=True) + # https://stackoverflow.com/questions/16222633/how-would-you-design-json-schema-for-an-arbitrary-key + + @path.validator + def _file_macthes_VIA_JSON_schema(self, attribute, value): + """Ensure that the JSON file matches the expected schema.""" + # should the schema be an attribute? + VIA_JSON_schema = { + "type": "object", + "properties": { + "_via_settings": { + "type": "object", + "properties": { + "ui": {"type": "object"}, + "core": {"type": "object"}, + "project": {"type": "object"}, + }, + }, + "_via_img_metadata": { + "type": "object", + "additionalProperties": { # ---- does this work? + "type": "object", + "properties": { + "filename": {"type": "string"}, + "size": {"type": "integer"}, + "regions": { + "type": "list", # does this work? + "properties": { + "shape_attributes": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "x": {"type": "integer"}, + "y": {"type": "integer"}, + "width": {"type": "integer"}, + "height": {"type": "integer"}, + }, + "region_attributes": { + "type": "object" + }, + }, + }, + }, + "file_attributes": {"type": "object"}, + }, + }, + }, + "_via_attributes": { + "type": "dict", + "properties": { + "region": {"type": "dict"}, + "file": {"type": "dict"}, + }, + }, + "_via_data_format_version": {"type": "string"}, + "_via_image_id_list": {"type": "list"}, + }, + } + + # should have been validated with ValidVIAUntrackedJSON + with open(value) as file: + data = json.load(file) + + # check schema + try: + jsonschema.validate(instance=data, schema=VIA_JSON_schema) + except jsonschema.exceptions.ValidationError as val_err: + raise ValueError( + "The JSON data does not match " + f"the provided schema: {VIA_JSON_schema}" + ) from val_err + # except jsonschema.exceptions.SchemaError as schema_err: + # raise ValueError( + # f"Invalid schema provided: {VIA_JSON_schema}" + # ) from schema_err + + +@define +class ValidCOCOUntrackedJSON: + pass From fd2e30c14ad8b4baf3945c32dc7cc438cfc91725 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Wed, 11 Dec 2024 13:36:58 +0000 Subject: [PATCH 06/36] Remove notebook for xarray --- .../annotations/notebook_annots_as_xarray.py | 133 ------------------ 1 file changed, 133 deletions(-) delete mode 100644 ethology/annotations/notebook_annots_as_xarray.py diff --git a/ethology/annotations/notebook_annots_as_xarray.py b/ethology/annotations/notebook_annots_as_xarray.py deleted file mode 100644 index 08e0a5c..0000000 --- a/ethology/annotations/notebook_annots_as_xarray.py +++ /dev/null @@ -1,133 +0,0 @@ -"""Explore formatting COCO annotations as an xarray Dataset. - -The dataset is made up from the following data variables: -- bbox: a 3D array with bounding box coordinates and shape - (max_n_bboxes_per_image, n_images, 4). - The four coordinates represent (x, y, h, w) per annotation. -- global_id: a 2D array of shape (max_n_bboxes_per_image, n_images) with - the global ID of each annotation. - -To add: -- category: a 2D array of shape (max_n_bboxes_per_image, n_images) with - the category ID / str of each annotation. -- split bbox into position and shape. -- keep track of image filename? - -""" - -# %%%%%%%%%%%%%%%%%%%% -# imports - -import numpy as np -import xarray as xr -from utils import read_json_file_as_dict - -# %%%%%%%%%%%%%%%%%%% -# input data -via_file_path = ( - "/home/sminano/swc/project_ethology/sample_VIA_annotations/VIA_JSON_1.json" -) -coco_file_path = ( - "/home/sminano/swc/project_ethology/sample_COCO_annotations/sample_annotations_1.json" -) - -# via_data = read_via_json_file_as_dict(via_file_path) -# print(via_data.keys()) # _via_img_metadata, _via_image_id_list - -# %%%%%%%%%%%%%%%%%%%% -# read input json as dict -coco_data = read_json_file_as_dict(coco_file_path) - -print( - coco_data.keys() -) # dict_keys(['annotations', 'categories', 'images', 'info', 'licenses']) - - -# %%%%%%%%%%%%%%%%%%%% -# helper fn to format data as homogeneous arrays -def compute_homog_data_array_per_image_id(data_str, axis_image_id_in_output): - # pair up data with image id - pair_data = [] - for annot in coco_data["annotations"]: - if isinstance(annot[data_str], list): - pair_data.append(annot[data_str] + [annot["image_id"]]) - else: - pair_data.append([annot[data_str], annot["image_id"]]) - - data_and_image_id_array = np.array(pair_data) - - # split - data_array_per_image_id = np.split( - data_and_image_id_array[:, : data_and_image_id_array.shape[1] - 1], - np.where(np.diff(data_and_image_id_array[:, -1]))[0] + 1, - axis=0, - ) - - # pad missing annotation-image IDs - max_bboxes_per_image = max([d.shape[0] for d in data_array_per_image_id]) - data_array_per_image_id_with_nans = np.stack( - [ - np.concat( - ( - d, - np.full( - (max_bboxes_per_image - d.shape[0], d.shape[1]), np.nan - ), - ) - ).squeeze() - for d in data_array_per_image_id - ], - axis=axis_image_id_in_output, # 1, -1 - ) # annotation_image_id, image_id, space - - return data_array_per_image_id_with_nans - - -# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# Format data - -# define bboxes data array -bboxes_data = compute_homog_data_array_per_image_id( - "bbox", axis_image_id_in_output=1 -) - -# define annot ID data array -annot_ID_data = compute_homog_data_array_per_image_id( - "id", axis_image_id_in_output=-1 -) - -# %%%%%%%%%%%%%%%%%%%% -# Create xarray Dataset -ds = xr.Dataset( - data_vars=dict( - bbox=(["annotation_image_id", "image_id", "space"], bboxes_data), - global_id=( - ["annotation_image_id", "image_id"], - annot_ID_data, - ), - ), - coords=dict( - annotation_image_id=list(range(bboxes_data.shape[0])), - image_id=np.unique( - [annot["image_id"] for annot in coco_data["annotations"]] - ), - space=["x", "y", "width", "height"], - ), -) - -# %%%%%%%%%%%%%%%%%%%% -# Inspect the dataset - -print(ds) - -# get all annotations in image 4 -ds.bbox.sel(image_id=4) - - -# get the bbox coordinates of the annotation with global ID = 2 -ds.bbox.where(ds.global_id == 2, drop=True) - -# get the global ID of the third annotation per image -ds.global_id.sel(annotation_image_id=3) - -# %% From c69fea9f4c789305ad7addc775cc25f2374e89a4 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Wed, 11 Dec 2024 13:37:11 +0000 Subject: [PATCH 07/36] Fix schema for validation --- ethology/annotations/validators.py | 66 ++++++++++++++++++------------ 1 file changed, 40 insertions(+), 26 deletions(-) diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py index a73f9a6..88a5c03 100644 --- a/ethology/annotations/validators.py +++ b/ethology/annotations/validators.py @@ -1,3 +1,5 @@ +"""Validators for annotation files.""" + import json from pathlib import Path @@ -5,8 +7,6 @@ import jsonschema.exceptions from attrs import define, field, validators -# - @define class ValidJSON: @@ -63,15 +63,16 @@ class ValidVIAUntrackedJSON: path: Path = field(validator=validators.instance_of(Path)) # expected_schema: dict = field(factory=dict, kw_only=True) - # https://stackoverflow.com/questions/16222633/how-would-you-design-json-schema-for-an-arbitrary-key @path.validator def _file_macthes_VIA_JSON_schema(self, attribute, value): """Ensure that the JSON file matches the expected schema.""" - # should the schema be an attribute? + # Define schema for VIA JSON file for untracked + # (aka manually labelled) data VIA_JSON_schema = { "type": "object", "properties": { + # settings for browser UI "_via_settings": { "type": "object", "properties": { @@ -80,27 +81,33 @@ def _file_macthes_VIA_JSON_schema(self, attribute, value): "project": {"type": "object"}, }, }, + # annotation data "_via_img_metadata": { "type": "object", - "additionalProperties": { # ---- does this work? + "additionalProperties": { + # "additionalProperties" to allow any key, + # see https://stackoverflow.com/a/69811612/24834957 "type": "object", "properties": { "filename": {"type": "string"}, "size": {"type": "integer"}, "regions": { - "type": "list", # does this work? - "properties": { - "shape_attributes": { - "type": "object", - "properties": { - "name": {"type": "string"}, - "x": {"type": "integer"}, - "y": {"type": "integer"}, - "width": {"type": "integer"}, - "height": {"type": "integer"}, - }, - "region_attributes": { - "type": "object" + "type": "array", # a list of dicts + "items": { + "type": "object", + "properties": { + "shape_attributes": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "x": {"type": "integer"}, + "y": {"type": "integer"}, + "width": {"type": "integer"}, + "height": {"type": "integer"}, + }, + "region_attributes": { + "type": "object" + }, # we just check it's a dict }, }, }, @@ -109,15 +116,22 @@ def _file_macthes_VIA_JSON_schema(self, attribute, value): }, }, }, + # ordered list of image keys + # - the position defines the image ID + "_via_image_id_list": { + "type": "array", + "items": {"type": "string"}, + }, + # region (aka annotation) and file attributes for VIA UI "_via_attributes": { - "type": "dict", + "type": "object", "properties": { - "region": {"type": "dict"}, - "file": {"type": "dict"}, + "region": {"type": "object"}, + "file": {"type": "object"}, }, }, + # version of the VIA data format "_via_data_format_version": {"type": "string"}, - "_via_image_id_list": {"type": "list"}, }, } @@ -125,7 +139,7 @@ def _file_macthes_VIA_JSON_schema(self, attribute, value): with open(value) as file: data = json.load(file) - # check schema + # check against schema try: jsonschema.validate(instance=data, schema=VIA_JSON_schema) except jsonschema.exceptions.ValidationError as val_err: @@ -139,6 +153,6 @@ def _file_macthes_VIA_JSON_schema(self, attribute, value): # ) from schema_err -@define -class ValidCOCOUntrackedJSON: - pass +# @define +# class ValidCOCOUntrackedJSON: +# pass From 0a1d64e6c3464a5c73d8c13adc0ce40465f8ce70 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Wed, 11 Dec 2024 14:27:55 +0000 Subject: [PATCH 08/36] Add validation and loading for COCO style file --- ethology/annotations/io.py | 119 +++++++++++++++++++++++------ ethology/annotations/validators.py | 118 +++++++++++++++++++++++++--- 2 files changed, 203 insertions(+), 34 deletions(-) diff --git a/ethology/annotations/io.py b/ethology/annotations/io.py index 09339ba..700b92f 100644 --- a/ethology/annotations/io.py +++ b/ethology/annotations/io.py @@ -6,7 +6,11 @@ import pandas as pd from movement.validators.files import ValidFile -from ethology.annotations.validators import ValidJSON, ValidVIAUntrackedJSON +from ethology.annotations.validators import ( + ValidCOCOUntrackedJSON, + ValidJSON, + ValidVIAUntrackedJSON, +) STANDARD_DF_COLUMNS = [ "annotation_id", @@ -16,15 +20,15 @@ "y_min", "width", "height", - "superclass", - "class", + "supercategory", + "category", ] -def df_from_via_json_file(file_path: Path): +def df_from_via_json_file(file_path: Path) -> pd.DataFrame: """Validate and read untracked VIA JSON file. - The data is formated as an untracked annotations DataFrame. + The data is formatted as an untracked annotations DataFrame. """ # General file validation file = ValidFile( @@ -41,13 +45,30 @@ def df_from_via_json_file(file_path: Path): return _df_from_validated_via_json_file(via_untracked_file.path) +def df_from_coco_json_file(file_path: Path) -> pd.DataFrame: + """Validate and read COCO JSON file.""" + # General file validation + file = ValidFile( + file_path, expected_permission="r", expected_suffix=[".json"] + ) + + # JSON file validation + json_file = ValidJSON(file.path) + + # COCO Untracked JSON schema validation + coco_untracked_file = ValidCOCOUntrackedJSON(json_file.path) + + # Read as standard dataframe + return _df_from_validated_coco_json_file(coco_untracked_file.path) + + def _df_from_validated_via_json_file(file_path): """Read VIA JSON file as standard untracked annotations DataFrame.""" # Read validated json as dict with open(file_path) as file: data_dict = json.load(file) - # Get relevant fields + # Prepare data image_metadata_dict = data_dict["_via_img_metadata"] via_image_id_list = data_dict[ "_via_image_id_list" @@ -68,28 +89,26 @@ def _df_from_validated_via_json_file(file_path): region_shape = region["shape_attributes"] region_attributes = region["region_attributes"] + row = { + "image_filename": img_dict["filename"], + "x_min": region_shape["x"], + "y_min": region_shape["y"], + "width": region_shape["width"], + "height": region_shape["height"], + "supercategory": list(region_attributes.keys())[ + 0 + ], # takes first key as supercategory + "category": region_attributes[ + list(region_attributes.keys())[0] + ], + } + # append annotations to df - list_rows.append( - { - "image_filename": img_dict["filename"], - "x_min": region_shape["x"], - "y_min": region_shape["y"], - "width": region_shape["width"], - "height": region_shape["height"], - "superclass": list(region_attributes.keys())[ - 0 - ], # takes first key as superclass - "class": region_attributes[ - list(region_attributes.keys())[0] - ], - }, - ) + list_rows.append(row) df = pd.DataFrame( list_rows, - columns=[ - col for col in STANDARD_DF_COLUMNS if not col.endswith("_id") - ], + # columns=list(row.keys()), # do I need this? ) # add image_id column @@ -104,3 +123,55 @@ def _df_from_validated_via_json_file(file_path): df = df.reindex(columns=STANDARD_DF_COLUMNS) return df + + +def _df_from_validated_coco_json_file(file_path: Path) -> pd.DataFrame: + """Read COCO JSON file as standard untracked annotations DataFrame.""" + # Read validated json as dict + with open(file_path) as file: + data_dict = json.load(file) + + # Prepare data + map_image_id_to_filename = { + img_dict["id"]: img_dict["file_name"] + for img_dict in data_dict["images"] + } + + map_category_id_to_category_data = { + cat_dict["id"]: (cat_dict["name"], cat_dict["supercategory"]) + for cat_dict in data_dict["categories"] + } + + # Build standard dataframe + list_rows = [] + for annot_dict in data_dict["annotations"]: + annotation_id = annot_dict["id"] + # image data + image_id = annot_dict["image_id"] + image_filename = map_image_id_to_filename[image_id] + + # bbox data + x_min, y_min, width, height = annot_dict["bbox"] + + # class data + category_id = annot_dict["category_id"] + category, supercategory = map_category_id_to_category_data[category_id] + + row = { + "annotation_id": annotation_id, + "image_filename": image_filename, + "image_id": image_id, + "x_min": x_min, + "y_min": y_min, + "width": width, + "height": height, + "supercategory": supercategory, + "category": category, + } + + list_rows.append(row) + + df = pd.DataFrame(list_rows) + df.reindex(columns=STANDARD_DF_COLUMNS) + + return df diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py index 88a5c03..6a2b998 100644 --- a/ethology/annotations/validators.py +++ b/ethology/annotations/validators.py @@ -46,8 +46,9 @@ class ValidVIAUntrackedJSON: """Class for validating VIA JSON files for untracked data. The validator ensures that the file matches the expected schema. + The schema validation only checks the type for each specified + key if it exists. It does not check for the presence of the keys. - https://json-schema.org/understanding-json-schema/reference/object#additional-properties Attributes ---------- @@ -59,10 +60,16 @@ class ValidVIAUntrackedJSON: ValueError If the JSON file does not match the expected schema. + Notes + ----- + https://json-schema.org/understanding-json-schema/ + """ + # TODO: add a check for the presence of the keys + # that I use in loading the data + path: Path = field(validator=validators.instance_of(Path)) - # expected_schema: dict = field(factory=dict, kw_only=True) @path.validator def _file_macthes_VIA_JSON_schema(self, attribute, value): @@ -135,7 +142,8 @@ def _file_macthes_VIA_JSON_schema(self, attribute, value): }, } - # should have been validated with ValidVIAUntrackedJSON + # should have been validated with ValidJSON + # already so this should work fine with open(value) as file: data = json.load(file) @@ -147,12 +155,102 @@ def _file_macthes_VIA_JSON_schema(self, attribute, value): "The JSON data does not match " f"the provided schema: {VIA_JSON_schema}" ) from val_err - # except jsonschema.exceptions.SchemaError as schema_err: - # raise ValueError( - # f"Invalid schema provided: {VIA_JSON_schema}" - # ) from schema_err -# @define -# class ValidCOCOUntrackedJSON: -# pass +@define +class ValidCOCOUntrackedJSON: + """Class for validating COCO JSON files for untracked data. + + The validator ensures that the file matches the expected schema. + The schema validation only checks the type for each specified + key if it exists. It does not check for the presence of the keys. + + Attributes + ---------- + path : pathlib.Path + Path to the JSON file. + + Raises + ------ + ValueError + If the JSON file does not match the expected schema. + + Notes + ----- + https://json-schema.org/understanding-json-schema/ + + """ + + path: Path = field(validator=validators.instance_of(Path)) + + # TODO: add a check for the presence of the keys + # that I use in loading the data + + @path.validator + def _file_macthes_COCO_JSON_schema(self, attribute, value): + """Ensure that the JSON file matches the expected schema.""" + # Define schema for VIA JSON file for untracked + # (aka manually labelled) data + COCO_JSON_schema = { + "type": "object", + "properties": { + "info": {"type": "object"}, + "licenses": { + "type": "array", + }, + "images": { + "type": "array", + "items": { + "type": "object", + "properties": { + "file_name": {"type": "string"}, + "id": {"type": "integer"}, + "width": {"type": "integer"}, + "height": {"type": "integer"}, + }, + }, + }, + "annotations": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": {"type": "integer"}, # annotation global ID + "image_id": {"type": "integer"}, + "bbox": { + "type": "array", + "items": {"type": "integer"}, + }, + "category_id": {"type": "integer"}, + "area": {"type": "integer"}, + "iscrowd": {"type": "integer"}, + }, + }, + }, + "categories": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": {"type": "integer"}, + "name": {"type": "string"}, + "supercategory": {"type": "string"}, + }, + }, + }, + }, + } + + # should have been validated with ValidJSON + # already so this should work fine + with open(value) as file: + data = json.load(file) + + # check against schema + try: + jsonschema.validate(instance=data, schema=COCO_JSON_schema) + except jsonschema.exceptions.ValidationError as val_err: + raise ValueError( + "The JSON data does not match " + f"the provided schema: {COCO_JSON_schema}" + ) from val_err From 57c94ece89c1bab772ab4a10a9614642c9185a29 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Wed, 11 Dec 2024 19:21:24 +0000 Subject: [PATCH 09/36] Keys check draft --- ethology/annotations/validators.py | 65 ++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py index 6a2b998..a5e1898 100644 --- a/ethology/annotations/validators.py +++ b/ethology/annotations/validators.py @@ -156,6 +156,71 @@ def _file_macthes_VIA_JSON_schema(self, attribute, value): f"the provided schema: {VIA_JSON_schema}" ) from val_err + @path.validator + def _file_contains_required_keys(self, attribute, value): + """Ensure that the JSON file contains the required keys.""" + required_keys_main = [ + "_via_img_metadata", + "_via_image_id_list", + ] + + required_keys_img_metadata_dicts = [ + "filename", + "regions", + ] + + required_keys_region_dicts = [ + "shape_attributes", + "region_attributes", + ] + + required_keys_shape_attributes_dicts = [ + "x", + "y", + "width", + "height", + ] + + with open(value) as file: + data = json.load(file) + + # check keys first level + for key in required_keys_main: + if key not in data: + raise ValueError( + f"Key '{key}' not found in first level " + f"of the JSON input file: {value}" + ) + + # check keys in each of the _via_img_metadata dicts + for key in required_keys_img_metadata_dicts: + for img_str, img_dict in data["_via_img_metadata"]: + if key not in img_dict: + raise ValueError( + f"Key '{key}' not found for {img_str}" + " under _via_img_metadata" + ) + + # check keys under regions + for key in required_keys_region_dicts: + for img_str, img_dict in data["_via_img_metadata"]: + for region in img_dict["regions"]: + if key not in region: + raise ValueError( + f"Key '{key}' not found for region" + f" under {img_str}" + ) + + # check keys under shape_attributes + for key in required_keys_shape_attributes_dicts: + for img_str, img_dict in data["_via_img_metadata"]: + for region in img_dict["regions"]: + if key not in region["shape_attributes"]: + raise ValueError( + f"Key 'shape_attributes > {key}' not found " + f"for region under {img_str}" + ) + @define class ValidCOCOUntrackedJSON: From 1d19eb984bed26f003cc0dcd3c8d743581f232a1 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Wed, 11 Dec 2024 19:49:20 +0000 Subject: [PATCH 10/36] Improve keys check --- ethology/annotations/validators.py | 97 +++++++++++++----------------- 1 file changed, 41 insertions(+), 56 deletions(-) diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py index a5e1898..862a336 100644 --- a/ethology/annotations/validators.py +++ b/ethology/annotations/validators.py @@ -159,68 +159,53 @@ def _file_macthes_VIA_JSON_schema(self, attribute, value): @path.validator def _file_contains_required_keys(self, attribute, value): """Ensure that the JSON file contains the required keys.""" - required_keys_main = [ - "_via_img_metadata", - "_via_image_id_list", - ] - - required_keys_img_metadata_dicts = [ - "filename", - "regions", - ] - - required_keys_region_dicts = [ - "shape_attributes", - "region_attributes", - ] - - required_keys_shape_attributes_dicts = [ - "x", - "y", - "width", - "height", - ] + required_keys = { + "main": ["_via_img_metadata", "_via_image_id_list"], + "image_keys": ["filename", "regions"], + "region_keys": ["shape_attributes", "region_attributes"], + "shape_attributes_keys": ["x", "y", "width", "height"], + } + + def _check_keys( + list_required_keys: list[str], + data_dict: dict, + additional_error_message: str = "", + ): + missing_keys = set(list_required_keys) - data_dict.keys() + if missing_keys: + raise ValueError( + f"Required key(s) {missing_keys} not " + f"found in {list(data_dict.keys())} " + + additional_error_message + + "." + ) + # Read data as dict with open(value) as file: data = json.load(file) - # check keys first level - for key in required_keys_main: - if key not in data: - raise ValueError( - f"Key '{key}' not found in first level " - f"of the JSON input file: {value}" + # Check first level keys + _check_keys(required_keys["main"], data) + + # Check keys in nested dicts + for img_str, img_dict in data["_via_img_metadata"].items(): + # Check keys for each image dictionary + _check_keys( + required_keys["image_keys"], + img_dict, + additional_error_message=f"for {img_str}", + ) + # Check keys for each region + for region in img_dict["regions"]: + _check_keys(required_keys["region_keys"], region) + + # Check keys under shape_attributes + _check_keys( + required_keys["shape_attributes_keys"], + region["shape_attributes"], + additional_error_message=f"for region under {img_str}", ) - # check keys in each of the _via_img_metadata dicts - for key in required_keys_img_metadata_dicts: - for img_str, img_dict in data["_via_img_metadata"]: - if key not in img_dict: - raise ValueError( - f"Key '{key}' not found for {img_str}" - " under _via_img_metadata" - ) - - # check keys under regions - for key in required_keys_region_dicts: - for img_str, img_dict in data["_via_img_metadata"]: - for region in img_dict["regions"]: - if key not in region: - raise ValueError( - f"Key '{key}' not found for region" - f" under {img_str}" - ) - - # check keys under shape_attributes - for key in required_keys_shape_attributes_dicts: - for img_str, img_dict in data["_via_img_metadata"]: - for region in img_dict["regions"]: - if key not in region["shape_attributes"]: - raise ValueError( - f"Key 'shape_attributes > {key}' not found " - f"for region under {img_str}" - ) - @define class ValidCOCOUntrackedJSON: From 97df63c1eedac3296da50fcc9fa886f1f41fd93e Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Wed, 11 Dec 2024 19:57:25 +0000 Subject: [PATCH 11/36] Add keys check for COCO untracked json validator --- ethology/annotations/validators.py | 73 ++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 14 deletions(-) diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py index 862a336..20b1317 100644 --- a/ethology/annotations/validators.py +++ b/ethology/annotations/validators.py @@ -166,20 +166,6 @@ def _file_contains_required_keys(self, attribute, value): "shape_attributes_keys": ["x", "y", "width", "height"], } - def _check_keys( - list_required_keys: list[str], - data_dict: dict, - additional_error_message: str = "", - ): - missing_keys = set(list_required_keys) - data_dict.keys() - if missing_keys: - raise ValueError( - f"Required key(s) {missing_keys} not " - f"found in {list(data_dict.keys())} " - + additional_error_message - + "." - ) - # Read data as dict with open(value) as file: data = json.load(file) @@ -304,3 +290,62 @@ def _file_macthes_COCO_JSON_schema(self, attribute, value): "The JSON data does not match " f"the provided schema: {COCO_JSON_schema}" ) from val_err + + @path.validator + def _file_contains_required_keys(self, attribute, value): + """Ensure that the JSON file contains the required keys.""" + required_keys = { + "main": ["images", "annotations", "categories"], + "image_keys": [ + "id", + "file_name", + ], # add height and width of image? + "annotations_keys": ["id", "image_id", "bbox", "category_id"], + "categories_keys": ["id", "name", "supercategory"], + } + + # Read data as dict + with open(value) as file: + data = json.load(file) + + # Check first level keys + _check_keys(required_keys["main"], data) + + # Check keys in images dicts + for img_dict in data["images"]: + _check_keys( + required_keys["image_keys"], + img_dict, + additional_error_message=f"for image dict {img_dict}", + ) + + # Check keys in annotations dicts + for annot_dict in data["annotations"]: + _check_keys( + required_keys["annotations_keys"], + annot_dict, + additional_error_message=f"for annotation dict {annot_dict}", + ) + + # Check keys in categories dicts + for cat_dict in data["categories"]: + _check_keys( + required_keys["categories_keys"], + cat_dict, + additional_error_message=f"for category dict {cat_dict}", + ) + + +def _check_keys( + list_required_keys: list[str], + data_dict: dict, + additional_error_message: str = "", +): + missing_keys = set(list_required_keys) - data_dict.keys() + if missing_keys: + raise ValueError( + f"Required key(s) {missing_keys} not " + f"found in {list(data_dict.keys())} " + + additional_error_message + + "." + ) From 029bb8b921eed28e4fef25da3e4f6361a4aab1f5 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Wed, 11 Dec 2024 20:02:20 +0000 Subject: [PATCH 12/36] Remove some comments --- ethology/annotations/io.py | 39 +++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/ethology/annotations/io.py b/ethology/annotations/io.py index 700b92f..129f241 100644 --- a/ethology/annotations/io.py +++ b/ethology/annotations/io.py @@ -29,16 +29,23 @@ def df_from_via_json_file(file_path: Path) -> pd.DataFrame: """Validate and read untracked VIA JSON file. The data is formatted as an untracked annotations DataFrame. + + Parameters + ---------- + file_path : Path + Path to the untracked VIA JSON file. + + Returns + ------- + pd.DataFrame + Untracked annotations DataFrame. + """ - # General file validation + # Run validators file = ValidFile( file_path, expected_permission="r", expected_suffix=[".json"] ) - - # JSON file validation json_file = ValidJSON(file.path) - - # VIA Untracked JSON schema validation via_untracked_file = ValidVIAUntrackedJSON(json_file.path) # Read as standard dataframe @@ -46,16 +53,26 @@ def df_from_via_json_file(file_path: Path) -> pd.DataFrame: def df_from_coco_json_file(file_path: Path) -> pd.DataFrame: - """Validate and read COCO JSON file.""" - # General file validation + """Validate and read untracked COCO JSON file. + + The data is formatted as an untracked annotations DataFrame. + + Parameters + ---------- + file_path : Path + Path to the untracked COCO JSON file. + + Returns + ------- + pd.DataFrame + Untracked annotations DataFrame. + + """ + # Run validators file = ValidFile( file_path, expected_permission="r", expected_suffix=[".json"] ) - - # JSON file validation json_file = ValidJSON(file.path) - - # COCO Untracked JSON schema validation coco_untracked_file = ValidCOCOUntrackedJSON(json_file.path) # Read as standard dataframe From 23d5259bd9287807f6f169a805dd951ceba57955 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Thu, 12 Dec 2024 10:17:10 +0000 Subject: [PATCH 13/36] Delete utils --- ethology/annotations/utils.py | 72 ----------------------------------- 1 file changed, 72 deletions(-) delete mode 100644 ethology/annotations/utils.py diff --git a/ethology/annotations/utils.py b/ethology/annotations/utils.py deleted file mode 100644 index 925489a..0000000 --- a/ethology/annotations/utils.py +++ /dev/null @@ -1,72 +0,0 @@ -"""Utility functions to work with annotations in JSON format.""" - -import json -from pathlib import Path - - -def read_json_file_as_dict( - file_path: Path, -) -> dict: - """Read JSON file as dict. - - Parameters - ---------- - file_path : str - Path to the JSON file - - Returns - ------- - dict - Dictionary with the JSON data - - """ - try: - with open(file_path) as file: - return json.load(file) - except FileNotFoundError as not_found_error: - msg = f"File not found: {file_path}" - raise ValueError(msg) from not_found_error - except json.JSONDecodeError as decode_error: - msg = f"Error decoding JSON data from file: {file_path}" - raise ValueError(msg) from decode_error - - -def read_via_json_file_as_dict(file_path: Path) -> dict: - """Read VIA JSON file as dict. - - Parameters - ---------- - file_path : str - Path to the VIA JSON file - - Returns - ------- - dict - Dictionary with the JSON data - - """ - # Read data - data = read_json_file_as_dict(file_path) - - # Check the expected keys are defined in the JSON file - expected_keys = [ - "_via_settings", - "_via_img_metadata", - "_via_attributes", - "_via_data_format_version", - "_via_image_id_list", - ] - - for ky in expected_keys: - if ky not in data: - raise ValueError( - f"Expected key '{ky}' not found in file: {file_path}" - ) - - return data - - -# def read_via_json_file_as_xarray(file_path: Path): - - -# via_dict = read_via_json_file_as_dict(file_path) From 9fbc4f70b4e81fea1e547d26ebd980962dd9c256 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Thu, 12 Dec 2024 10:36:04 +0000 Subject: [PATCH 14/36] Add module for fixtures --- .pre-commit-config.yaml | 1 + tests/conftest.py | 5 +++++ tests/fixtures/__init__.py | 0 tests/fixtures/annotations.py | 1 + 4 files changed, 7 insertions(+) create mode 100644 tests/conftest.py create mode 100644 tests/fixtures/__init__.py create mode 100644 tests/fixtures/annotations.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8c8a2be..99263f3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,6 +19,7 @@ repos: args: [--fix=lf] - id: name-tests-test args: ["--pytest-test-first"] + exclude: ^tests/fixtures - id: requirements-txt-fixer - id: trailing-whitespace - repo: https://github.com/pre-commit/pygrep-hooks diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..9b6026a --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,5 @@ +"""Pytest configuration file.""" + +pytest_plugins = [ + "tests.fixtures.annotations", +] diff --git a/tests/fixtures/__init__.py b/tests/fixtures/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/fixtures/annotations.py b/tests/fixtures/annotations.py new file mode 100644 index 0000000..11894a8 --- /dev/null +++ b/tests/fixtures/annotations.py @@ -0,0 +1 @@ +"""Pytest fixtures for annotation tests.""" From d3809e0dd2465e8fcc7d41261e1e2f93ca9a5b2a Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Thu, 12 Dec 2024 12:59:22 +0000 Subject: [PATCH 15/36] Change JSON error to FileNotFound --- ethology/annotations/validators.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py index 20b1317..633c341 100644 --- a/ethology/annotations/validators.py +++ b/ethology/annotations/validators.py @@ -17,12 +17,6 @@ class ValidJSON: path : pathlib.Path Path to the JSON file. - Raises - ------ - ValueError - If the file is not in JSON format or if it does not contain the - expected keys. - """ path: Path = field(validator=validators.instance_of(Path)) @@ -34,7 +28,9 @@ def _file_is_json(self, attribute, value): with open(value) as file: json.load(file) except FileNotFoundError as not_found_error: - raise ValueError(f"File not found: {value}") from not_found_error + raise FileNotFoundError( + f"File not found: {value}" + ) from not_found_error except json.JSONDecodeError as decode_error: raise ValueError( f"Error decoding JSON data from file: {value}" From 5866d106057abcde43ab9b834e94e1cdd3575bba Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Thu, 12 Dec 2024 12:59:44 +0000 Subject: [PATCH 16/36] Add shared fixtures across all tests to conftest --- tests/conftest.py | 90 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 9b6026a..13e4c9d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,93 @@ -"""Pytest configuration file.""" +"""Pytest configuration file with shared fixtures across all tests.""" + +from pathlib import Path + +import pooch +import pytest + +GIN_TEST_DATA_REPO = ( + "https://gin.g-node.org/neuroinformatics/ethology-test-data" +) pytest_plugins = [ "tests.fixtures.annotations", ] + + +@pytest.fixture(scope="session") +def pooch_registry() -> dict: + """Pooch registry for the test data. + + This fixture is common to the entire test session. The + file registry is downloaded fresh for every test session. + + Returns + ------- + dict + URL and hash of the GIN repository with the test data + + """ + # Cache the test data in the user's home directory + test_data_dir = Path.home() / ".ethology-test-data" + + # Initialise pooch registry + registry = pooch.create( + test_data_dir, + base_url=f"{GIN_TEST_DATA_REPO}/raw/master/test_data", + ) + + # Download only the registry file from GIN + # if known_hash = None, the file is always downloaded. + file_registry = pooch.retrieve( + url=f"{GIN_TEST_DATA_REPO}/raw/master/files-registry.txt", + known_hash=None, + fname="files-registry.txt", + path=test_data_dir, + ) + + # Load registry file onto pooch registry + registry.load_registry(file_registry) + + return registry + + +@pytest.fixture() +def get_paths_test_data(): + """Define a factory fixture to get the paths of the data files + under a specific zip. + + The name of the zip file is intended to match a testing module. For + example, to get the paths to the test files for the annotations + tests module, we would call `get_paths_test_data(pooch_registry, + "test_annotations")` in a test. This assumes in the GIN repository + there is a zip file named `test_annotations.zip` under the `test_data` + directory containing the relevant test files. + """ + + def _get_paths_test_data(pooch_registry, zip_filename: str) -> dict: + """Return the paths of the test files under the specified zip filename. + + The zip filename is expected to match a testing module. + """ + # Fetch the test data for the annotations module + list_files_in_local_storage = pooch_registry.fetch( + f"{zip_filename}.zip", + processor=pooch.Unzip(extract_dir=""), + progressbar=True, + ) + + # Filter out files not under `test_annotations` directory + list_files_annotations = [ + f + for f in list_files_in_local_storage + if (zip_filename in f) and (not f.endswith(".zip")) + ] + + # return paths as dict + input_data_dict = {} + for f in list_files_annotations: + input_data_dict[Path(f).name] = Path(f) + + return input_data_dict + + return _get_paths_test_data From 8504a59db17695acfa06b06cc20a38348445876d Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Thu, 12 Dec 2024 12:59:58 +0000 Subject: [PATCH 17/36] Add annotations_test_data fixture to its module --- tests/fixtures/annotations.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/fixtures/annotations.py b/tests/fixtures/annotations.py index 11894a8..dced3eb 100644 --- a/tests/fixtures/annotations.py +++ b/tests/fixtures/annotations.py @@ -1 +1,8 @@ -"""Pytest fixtures for annotation tests.""" +"""Pytest fixtures shared across annotation tests.""" + +import pytest + + +@pytest.fixture() +def annotations_test_data(pooch_registry, get_paths_test_data): + return get_paths_test_data(pooch_registry, "test_annotations") From a22bfc47330d1babd6ee0d0a9b7d1c1fedb40686 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Thu, 12 Dec 2024 13:00:34 +0000 Subject: [PATCH 18/36] Add test for JSON file validator --- tests/test_unit/test_annotations/__init__.py | 0 .../test_annotations/test_validators.py | 47 +++++++++++++++++++ tests/test_unit/test_placeholder.py | 2 - 3 files changed, 47 insertions(+), 2 deletions(-) create mode 100644 tests/test_unit/test_annotations/__init__.py create mode 100644 tests/test_unit/test_annotations/test_validators.py delete mode 100644 tests/test_unit/test_placeholder.py diff --git a/tests/test_unit/test_annotations/__init__.py b/tests/test_unit/test_annotations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_unit/test_annotations/test_validators.py b/tests/test_unit/test_annotations/test_validators.py new file mode 100644 index 0000000..1341459 --- /dev/null +++ b/tests/test_unit/test_annotations/test_validators.py @@ -0,0 +1,47 @@ +# import json +from contextlib import nullcontext as does_not_raise + +# import pooch +import pytest + +from ethology.annotations.validators import ( + ValidJSON, +) + + +@pytest.mark.parametrize( + "valid_json_file", + [ + "VIA_JSON_sample_1.json", + "VIA_JSON_sample_2.json", + ], +) +def test_valid_json(valid_json_file, annotations_test_data): + """Test the ValidJSON validator on valid data.""" + input_json_file = annotations_test_data[valid_json_file] + with does_not_raise(): + ValidJSON(input_json_file) + + +# @pytest.mark.parametrize( +# "invalid_json_file, expected_exception, log_message", +# [ +# ( +# "invalid_VIA_JSON_sample_1.json", +# FileNotFoundError, +# "File not found: invalid_VIA_JSON_sample_1.json.", +# ), +# ( +# "invalid_VIA_JSON_sample_2.json", +# ValueError, +# "Error decoding JSON data from file: invalid_VIA_JSON_sample_2.", +# ), +# ], +# ) +# def test_valid_json_errors(invalid_json_file, +# expected_exception, log_message): +# """Test the ValidJSON validator on invalid data.""" +# with pytest.raises(expected_exception) as excinfo: +# ValidJSON(invalid_json_file) + +# assert str(excinfo.value) == log_message diff --git a/tests/test_unit/test_placeholder.py b/tests/test_unit/test_placeholder.py deleted file mode 100644 index 3ada1ee..0000000 --- a/tests/test_unit/test_placeholder.py +++ /dev/null @@ -1,2 +0,0 @@ -def test_placeholder(): - assert True From 6edbfc452cf6e7e46ddffcf8d4a73691d04fa9f0 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Thu, 12 Dec 2024 14:05:52 +0000 Subject: [PATCH 19/36] Fix fresh download of files-registry --- tests/conftest.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 13e4c9d..f759045 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -30,6 +30,12 @@ def pooch_registry() -> dict: # Cache the test data in the user's home directory test_data_dir = Path.home() / ".ethology-test-data" + # Remove the file registry if it exists + # otherwise it is not downloaded from scratch every time + file_registry_path = test_data_dir / "files-registry.txt" + if file_registry_path.is_file(): + Path(file_registry_path).unlink() + # Initialise pooch registry registry = pooch.create( test_data_dir, @@ -37,12 +43,11 @@ def pooch_registry() -> dict: ) # Download only the registry file from GIN - # if known_hash = None, the file is always downloaded. file_registry = pooch.retrieve( url=f"{GIN_TEST_DATA_REPO}/raw/master/files-registry.txt", known_hash=None, - fname="files-registry.txt", - path=test_data_dir, + fname=file_registry_path.name, + path=file_registry_path.parent, ) # Load registry file onto pooch registry From 9976715a0416d250ddd53a3424d7ce2fb2a5bbc7 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Thu, 12 Dec 2024 14:24:03 +0000 Subject: [PATCH 20/36] Fix tests to work with unzipped subdirectories --- tests/conftest.py | 38 ++++++++----------- .../test_annotations/test_validators.py | 27 ++++++++----- 2 files changed, 34 insertions(+), 31 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index f759045..37d1041 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -69,30 +69,24 @@ def get_paths_test_data(): directory containing the relevant test files. """ - def _get_paths_test_data(pooch_registry, zip_filename: str) -> dict: + def _get_paths_test_data(pooch_registry, subdir_name: str) -> dict: """Return the paths of the test files under the specified zip filename. - The zip filename is expected to match a testing module. + subdir_name is the name of the subdirectory under `test_data`. """ - # Fetch the test data for the annotations module - list_files_in_local_storage = pooch_registry.fetch( - f"{zip_filename}.zip", - processor=pooch.Unzip(extract_dir=""), - progressbar=True, - ) - - # Filter out files not under `test_annotations` directory - list_files_annotations = [ - f - for f in list_files_in_local_storage - if (zip_filename in f) and (not f.endswith(".zip")) - ] - - # return paths as dict - input_data_dict = {} - for f in list_files_annotations: - input_data_dict[Path(f).name] = Path(f) - - return input_data_dict + test_filename_to_path = {} + for relative_filepath in pooch_registry.registry: + # relative to test_data + if relative_filepath.startswith(f"{subdir_name}/"): + # fetch file from pooch registry + fetched_filepath = pooch_registry.fetch( + relative_filepath, # under test_data + progressbar=True, + ) + + test_filename_to_path[Path(fetched_filepath).name] = Path( + fetched_filepath + ) + return test_filename_to_path return _get_paths_test_data diff --git a/tests/test_unit/test_annotations/test_validators.py b/tests/test_unit/test_annotations/test_validators.py index 1341459..a0612df 100644 --- a/tests/test_unit/test_annotations/test_validators.py +++ b/tests/test_unit/test_annotations/test_validators.py @@ -4,24 +4,33 @@ # import pooch import pytest -from ethology.annotations.validators import ( - ValidJSON, -) +from ethology.annotations.validators import ValidJSON + +# @pytest.fixture() +# def @pytest.mark.parametrize( - "valid_json_file", + "input_json_file, expected_exception, log_message", [ - "VIA_JSON_sample_1.json", - "VIA_JSON_sample_2.json", + ("VIA_JSON_sample_1.json", does_not_raise(), ""), + ("VIA_JSON_sample_2.json", does_not_raise(), ""), ], ) -def test_valid_json(valid_json_file, annotations_test_data): +def test_valid_json( + annotations_test_data, + input_json_file, + expected_exception, + log_message, +): """Test the ValidJSON validator on valid data.""" - input_json_file = annotations_test_data[valid_json_file] - with does_not_raise(): + input_json_file = annotations_test_data[input_json_file] + with expected_exception as excinfo: ValidJSON(input_json_file) + if log_message: + assert str(excinfo.value) == log_message + # @pytest.mark.parametrize( # "invalid_json_file, expected_exception, log_message", From a4ffb4411cd39dfa407f1222fd67053feb49c266 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Mon, 16 Dec 2024 11:29:25 +0000 Subject: [PATCH 21/36] Add test for JSON validator --- ethology/annotations/validators.py | 4 +- .../test_annotations/test_validators.py | 55 ++++++++++++++++--- 2 files changed, 50 insertions(+), 9 deletions(-) diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py index 633c341..ebb0d58 100644 --- a/ethology/annotations/validators.py +++ b/ethology/annotations/validators.py @@ -29,11 +29,11 @@ def _file_is_json(self, attribute, value): json.load(file) except FileNotFoundError as not_found_error: raise FileNotFoundError( - f"File not found: {value}" + f"File not found: {value}." ) from not_found_error except json.JSONDecodeError as decode_error: raise ValueError( - f"Error decoding JSON data from file: {value}" + f"Error decoding JSON data from file: {value}." ) from decode_error diff --git a/tests/test_unit/test_annotations/test_validators.py b/tests/test_unit/test_annotations/test_validators.py index a0612df..b71bdd5 100644 --- a/tests/test_unit/test_annotations/test_validators.py +++ b/tests/test_unit/test_annotations/test_validators.py @@ -6,15 +6,55 @@ from ethology.annotations.validators import ValidJSON -# @pytest.fixture() -# def + +@pytest.fixture() +def via_json_valid_file(annotations_test_data): + return annotations_test_data["VIA_JSON_sample_1.json"] + + +@pytest.fixture() +def coco_json_valid_file(annotations_test_data): + return annotations_test_data["COCO_JSON_sample_1.json"] + + +@pytest.fixture() +def json_with_decode_error(tmp_path): + """Return the path to a JSON file with a decoding error.""" + json_file = tmp_path / "JSON_decode_error.json" + with open(json_file, "w") as f: + f.write("invalid_json") + return json_file + + +@pytest.fixture() +def json_file_not_found(tmp_path): + """Return the path to a JSON file that does not exist.""" + return tmp_path / "JSON_file_not_found.json" @pytest.mark.parametrize( "input_json_file, expected_exception, log_message", [ - ("VIA_JSON_sample_1.json", does_not_raise(), ""), - ("VIA_JSON_sample_2.json", does_not_raise(), ""), + ( + "via_json_valid_file", + does_not_raise(), + "", + ), + ( + "coco_json_valid_file", + does_not_raise(), + "", + ), + ( + "json_with_decode_error", + pytest.raises(ValueError), + "Error decoding JSON data from file: {}.", + ), + ( + "json_file_not_found", + pytest.raises(FileNotFoundError), + "File not found: {}.", + ), ], ) def test_valid_json( @@ -22,14 +62,15 @@ def test_valid_json( input_json_file, expected_exception, log_message, + request, ): - """Test the ValidJSON validator on valid data.""" - input_json_file = annotations_test_data[input_json_file] + """Test the ValidJSON validator.""" + input_json_file = request.getfixturevalue(input_json_file) with expected_exception as excinfo: ValidJSON(input_json_file) if log_message: - assert str(excinfo.value) == log_message + assert str(excinfo.value) == log_message.format(input_json_file) # @pytest.mark.parametrize( From b901658a80ccc4ac859058964ec724c5a5df6146 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Mon, 16 Dec 2024 12:55:30 +0000 Subject: [PATCH 22/36] Factor out schemas and include schema validation in ValidJSON --- ethology/annotations/io.py | 8 +- ethology/annotations/json_schemas.py | 118 ++++++++++++ ethology/annotations/validators.py | 142 +++++--------- .../test_annotations/test_validators.py | 177 +++++++++++++----- 4 files changed, 292 insertions(+), 153 deletions(-) create mode 100644 ethology/annotations/json_schemas.py diff --git a/ethology/annotations/io.py b/ethology/annotations/io.py index 129f241..e62d9f0 100644 --- a/ethology/annotations/io.py +++ b/ethology/annotations/io.py @@ -6,6 +6,10 @@ import pandas as pd from movement.validators.files import ValidFile +from ethology.annotations.json_schemas import ( + COCO_UNTRACKED_SCHEMA, + VIA_UNTRACKED_SCHEMA, +) from ethology.annotations.validators import ( ValidCOCOUntrackedJSON, ValidJSON, @@ -45,7 +49,7 @@ def df_from_via_json_file(file_path: Path) -> pd.DataFrame: file = ValidFile( file_path, expected_permission="r", expected_suffix=[".json"] ) - json_file = ValidJSON(file.path) + json_file = ValidJSON(path=file.path, schema=VIA_UNTRACKED_SCHEMA) via_untracked_file = ValidVIAUntrackedJSON(json_file.path) # Read as standard dataframe @@ -72,7 +76,7 @@ def df_from_coco_json_file(file_path: Path) -> pd.DataFrame: file = ValidFile( file_path, expected_permission="r", expected_suffix=[".json"] ) - json_file = ValidJSON(file.path) + json_file = ValidJSON(path=file.path, schema=COCO_UNTRACKED_SCHEMA) coco_untracked_file = ValidCOCOUntrackedJSON(json_file.path) # Read as standard dataframe diff --git a/ethology/annotations/json_schemas.py b/ethology/annotations/json_schemas.py new file mode 100644 index 0000000..7e9d265 --- /dev/null +++ b/ethology/annotations/json_schemas.py @@ -0,0 +1,118 @@ +"""JSON schemas for VIA and COCO annotations.""" + +VIA_UNTRACKED_SCHEMA = { + "type": "object", + "properties": { + # settings for browser UI + "_via_settings": { + "type": "object", + "properties": { + "ui": {"type": "object"}, + "core": {"type": "object"}, + "project": {"type": "object"}, + }, + }, + # annotation data + "_via_img_metadata": { + "type": "object", + "additionalProperties": { + # "additionalProperties" to allow any key, + # see https://stackoverflow.com/a/69811612/24834957 + "type": "object", + "properties": { + "filename": {"type": "string"}, + "size": {"type": "integer"}, + "regions": { + "type": "array", # a list of dicts + "items": { + "type": "object", + "properties": { + "shape_attributes": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "x": {"type": "integer"}, + "y": {"type": "integer"}, + "width": {"type": "integer"}, + "height": {"type": "integer"}, + }, + "region_attributes": { + "type": "object" + }, # we just check it's a dict + }, + }, + }, + }, + "file_attributes": {"type": "object"}, + }, + }, + }, + # ordered list of image keys + # - the position defines the image ID + "_via_image_id_list": { + "type": "array", + "items": {"type": "string"}, + }, + # region (aka annotation) and file attributes for VIA UI + "_via_attributes": { + "type": "object", + "properties": { + "region": {"type": "object"}, + "file": {"type": "object"}, + }, + }, + # version of the VIA data format + "_via_data_format_version": {"type": "string"}, + }, +} + + +COCO_UNTRACKED_SCHEMA = { + "type": "object", + "properties": { + "info": {"type": "object"}, + "licenses": { + "type": "array", + }, + "images": { + "type": "array", + "items": { + "type": "object", + "properties": { + "file_name": {"type": "string"}, + "id": {"type": "integer"}, + "width": {"type": "integer"}, + "height": {"type": "integer"}, + }, + }, + }, + "annotations": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": {"type": "integer"}, # annotation global ID + "image_id": {"type": "integer"}, + "bbox": { + "type": "array", + "items": {"type": "integer"}, + }, + "category_id": {"type": "integer"}, + "area": {"type": "integer"}, + "iscrowd": {"type": "integer"}, + }, + }, + }, + "categories": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": {"type": "integer"}, + "name": {"type": "string"}, + "supercategory": {"type": "string"}, + }, + }, + }, + }, +} diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py index ebb0d58..b328d97 100644 --- a/ethology/annotations/validators.py +++ b/ethology/annotations/validators.py @@ -17,9 +17,28 @@ class ValidJSON: path : pathlib.Path Path to the JSON file. + schema : dict + JSON schema to validate the file against. + + Raises + ------ + FileNotFoundError + If the file does not exist. + ValueError + If the JSON file cannot be decoded, or + if the type of any of its keys does not match those + specified in the schema. + + + Notes + ----- + https://json-schema.org/understanding-json-schema/ + """ + # Required attributes path: Path = field(validator=validators.instance_of(Path)) + schema: dict = field() @path.validator def _file_is_json(self, attribute, value): @@ -36,122 +55,45 @@ def _file_is_json(self, attribute, value): f"Error decoding JSON data from file: {value}." ) from decode_error + @path.validator + def _file_matches_JSON_schema(self, attribute, value): + """Ensure that the JSON file matches the expected schema. + + The schema validation only checks the type for each specified + key if it exists. It does not check for the presence of the keys. + """ + # read json file + with open(value) as file: + data = json.load(file) + + # check against schema + try: + jsonschema.validate(instance=data, schema=self.schema) + except jsonschema.exceptions.ValidationError as val_err: + raise ValueError( + "The JSON data does not match " + f"the provided schema: {self.schema}." + ) from val_err + @define class ValidVIAUntrackedJSON: """Class for validating VIA JSON files for untracked data. - The validator ensures that the file matches the expected schema. - The schema validation only checks the type for each specified - key if it exists. It does not check for the presence of the keys. + Checks the VIA JSON file for untracked data contains the required keys. + Note that the validation against the schema does not check the existence + of the keys, only the type of their values if they exist. Attributes ---------- path : pathlib.Path Path to the JSON file. - Raises - ------ - ValueError - If the JSON file does not match the expected schema. - - Notes - ----- - https://json-schema.org/understanding-json-schema/ - """ - # TODO: add a check for the presence of the keys - # that I use in loading the data - path: Path = field(validator=validators.instance_of(Path)) - @path.validator - def _file_macthes_VIA_JSON_schema(self, attribute, value): - """Ensure that the JSON file matches the expected schema.""" - # Define schema for VIA JSON file for untracked - # (aka manually labelled) data - VIA_JSON_schema = { - "type": "object", - "properties": { - # settings for browser UI - "_via_settings": { - "type": "object", - "properties": { - "ui": {"type": "object"}, - "core": {"type": "object"}, - "project": {"type": "object"}, - }, - }, - # annotation data - "_via_img_metadata": { - "type": "object", - "additionalProperties": { - # "additionalProperties" to allow any key, - # see https://stackoverflow.com/a/69811612/24834957 - "type": "object", - "properties": { - "filename": {"type": "string"}, - "size": {"type": "integer"}, - "regions": { - "type": "array", # a list of dicts - "items": { - "type": "object", - "properties": { - "shape_attributes": { - "type": "object", - "properties": { - "name": {"type": "string"}, - "x": {"type": "integer"}, - "y": {"type": "integer"}, - "width": {"type": "integer"}, - "height": {"type": "integer"}, - }, - "region_attributes": { - "type": "object" - }, # we just check it's a dict - }, - }, - }, - }, - "file_attributes": {"type": "object"}, - }, - }, - }, - # ordered list of image keys - # - the position defines the image ID - "_via_image_id_list": { - "type": "array", - "items": {"type": "string"}, - }, - # region (aka annotation) and file attributes for VIA UI - "_via_attributes": { - "type": "object", - "properties": { - "region": {"type": "object"}, - "file": {"type": "object"}, - }, - }, - # version of the VIA data format - "_via_data_format_version": {"type": "string"}, - }, - } - - # should have been validated with ValidJSON - # already so this should work fine - with open(value) as file: - data = json.load(file) - - # check against schema - try: - jsonschema.validate(instance=data, schema=VIA_JSON_schema) - except jsonschema.exceptions.ValidationError as val_err: - raise ValueError( - "The JSON data does not match " - f"the provided schema: {VIA_JSON_schema}" - ) from val_err - @path.validator def _file_contains_required_keys(self, attribute, value): """Ensure that the JSON file contains the required keys.""" diff --git a/tests/test_unit/test_annotations/test_validators.py b/tests/test_unit/test_annotations/test_validators.py index b71bdd5..4c9fd34 100644 --- a/tests/test_unit/test_annotations/test_validators.py +++ b/tests/test_unit/test_annotations/test_validators.py @@ -1,97 +1,172 @@ -# import json +import json from contextlib import nullcontext as does_not_raise -# import pooch import pytest +from ethology.annotations.json_schemas import ( + COCO_UNTRACKED_SCHEMA, + VIA_UNTRACKED_SCHEMA, +) from ethology.annotations.validators import ValidJSON @pytest.fixture() -def via_json_valid_file(annotations_test_data): - return annotations_test_data["VIA_JSON_sample_1.json"] +def json_file_with_decode_error(tmp_path): + """Return factory of paths to JSON files with a decoding error.""" + json_file = tmp_path / "JSON_decode_error.json" + with open(json_file, "w") as f: + f.write("just-a-string") + return json_file @pytest.fixture() -def coco_json_valid_file(annotations_test_data): - return annotations_test_data["COCO_JSON_sample_1.json"] +def json_file_with_not_found_error(tmp_path): + """Return the path to a JSON file that does not exist.""" + return tmp_path / "JSON_file_not_found.json" @pytest.fixture() -def json_with_decode_error(tmp_path): - """Return the path to a JSON file with a decoding error.""" - json_file = tmp_path / "JSON_decode_error.json" - with open(json_file, "w") as f: - f.write("invalid_json") - return json_file +def via_json_file_with_schema_error(tmp_path, annotations_test_data): + """Return path to a JSON file that doesn't match the expected schema.""" + # read valid json file + via_json_valid_filepath = annotations_test_data["VIA_JSON_sample_1.json"] + with open(via_json_valid_filepath) as f: + data = json.load(f) + + # change type of specific keys + # - change "_via_image_id_list" from list of strings to list of integers + # TODO: what if I change several? + data["_via_image_id_list"] = list(range(len(data["_via_image_id_list"]))) + + # save the modified data to a new file under tmp_path + out_json = tmp_path / "VIA_JSON_schema_error.json" + with open(out_json, "w") as f: + json.dump(data, f) + return out_json @pytest.fixture() -def json_file_not_found(tmp_path): - """Return the path to a JSON file that does not exist.""" - return tmp_path / "JSON_file_not_found.json" +def coco_json_file_with_schema_error( + tmp_path, + annotations_test_data, +): + """Return path to a JSON file that doesn't match the expected schema.""" + # read valid json file + via_json_valid_filepath = annotations_test_data["COCO_JSON_sample_1.json"] + with open(via_json_valid_filepath) as f: + data = json.load(f) + + # change "annotations" from list of dicts to list of lists + # TODO: what if I change several? + data["annotations"] = [[d] for d in data["annotations"]] + + # save the modified data to a new file under tmp_path + out_json = tmp_path / "VIA_JSON_schema_error.json" + with open(out_json, "w") as f: + json.dump(data, f) + return out_json + + +@pytest.mark.parametrize( + "input_json_file, input_schema", + [ + ("VIA_JSON_sample_1.json", VIA_UNTRACKED_SCHEMA), + ("VIA_JSON_sample_2.json", VIA_UNTRACKED_SCHEMA), + ("COCO_JSON_sample_1.json", COCO_UNTRACKED_SCHEMA), + ("COCO_JSON_sample_2.json", COCO_UNTRACKED_SCHEMA), + ], +) +def test_valid_json( + annotations_test_data, + input_json_file, + input_schema, +): + """Test the ValidJSON validator with valid files.""" + input_json_file = annotations_test_data[input_json_file] + with does_not_raise(): + ValidJSON(path=input_json_file, schema=input_schema) @pytest.mark.parametrize( - "input_json_file, expected_exception, log_message", + "invalid_json_file_str, input_schema, expected_exception, log_message", [ ( - "via_json_valid_file", - does_not_raise(), - "", + "json_file_with_decode_error", + VIA_UNTRACKED_SCHEMA, # should be independent of schema + pytest.raises(ValueError), + "Error decoding JSON data from file: {}.", ), ( - "coco_json_valid_file", - does_not_raise(), - "", + "json_file_with_not_found_error", + VIA_UNTRACKED_SCHEMA, # should be independent of schema + pytest.raises(FileNotFoundError), + "File not found: {}.", ), ( - "json_with_decode_error", + "via_json_file_with_schema_error", + VIA_UNTRACKED_SCHEMA, pytest.raises(ValueError), - "Error decoding JSON data from file: {}.", + "The JSON data does not match the provided schema: {}.", ), ( - "json_file_not_found", - pytest.raises(FileNotFoundError), - "File not found: {}.", + "coco_json_file_with_schema_error", + COCO_UNTRACKED_SCHEMA, + pytest.raises(ValueError), + "The JSON data does not match the provided schema: {}.", ), ], ) -def test_valid_json( - annotations_test_data, - input_json_file, +def test_valid_json_error( + invalid_json_file_str, + input_schema, expected_exception, log_message, request, ): - """Test the ValidJSON validator.""" - input_json_file = request.getfixturevalue(input_json_file) + """Test the ValidJSON validator throws the expected error.""" + invalid_json_file = request.getfixturevalue(invalid_json_file_str) + with expected_exception as excinfo: - ValidJSON(input_json_file) + ValidJSON(path=invalid_json_file, schema=input_schema) - if log_message: - assert str(excinfo.value) == log_message.format(input_json_file) + if "schema" in invalid_json_file_str: + assert str(excinfo.value) == log_message.format(input_schema) + else: + assert str(excinfo.value) == log_message.format(invalid_json_file) # @pytest.mark.parametrize( -# "invalid_json_file, expected_exception, log_message", +# "valid_json_file, input_schema", +# [ +# ("VIA_JSON_sample_1.json", VIA_UNTRACKED_SCHEMA), +# ("COCO_JSON_sample_1.json", COCO_UNTRACKED_SCHEMA), +# ], +# ) +# @pytest.mark.parametrize( +# "invalid_json_factory, expected_exception, log_message", # [ # ( -# "invalid_VIA_JSON_sample_1.json", -# FileNotFoundError, -# "File not found: invalid_VIA_JSON_sample_1.json.", -# ), -# ( -# "invalid_VIA_JSON_sample_2.json", -# ValueError, -# "Error decoding JSON data from file: invalid_VIA_JSON_sample_2.", +# "get_json_file_with_schema_error", +# pytest.raises(ValueError), +# "The JSON data does not match the provided schema: {}.", # ), # ], # ) -# def test_valid_json_errors(invalid_json_file, -# expected_exception, log_message): -# """Test the ValidJSON validator on invalid data.""" -# with pytest.raises(expected_exception) as excinfo: -# ValidJSON(invalid_json_file) - -# assert str(excinfo.value) == log_message +# def test_valid_json_schema_error( +# valid_json_file, +# input_schema, +# invalid_json_factory, +# expected_exception, +# log_message, +# tmp_path, +# request, +# ): +# """Test the ValidJSON validator throws the expected error.""" +# invalid_json_factory = request.getfixturevalue(invalid_json_factory) +# invalid_json_file = invalid_json_factory(valid_json_file) + +# with expected_exception as excinfo: +# ValidJSON(path=invalid_json_file, schema=input_schema) + +# if log_message: +# assert str(excinfo.value) == log_message.format(input_schema) From 05696a7e4d0a6fd451c40584b6a400f2112cf6de Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Mon, 16 Dec 2024 13:14:40 +0000 Subject: [PATCH 23/36] Make schema optional --- ethology/annotations/validators.py | 21 +++++++++++-------- .../test_annotations/test_validators.py | 20 ++++++++++++------ 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py index b328d97..1a0d22f 100644 --- a/ethology/annotations/validators.py +++ b/ethology/annotations/validators.py @@ -38,7 +38,9 @@ class ValidJSON: # Required attributes path: Path = field(validator=validators.instance_of(Path)) - schema: dict = field() + + # Optional attributes + schema: dict | None = field(default=None) @path.validator def _file_is_json(self, attribute, value): @@ -66,14 +68,15 @@ def _file_matches_JSON_schema(self, attribute, value): with open(value) as file: data = json.load(file) - # check against schema - try: - jsonschema.validate(instance=data, schema=self.schema) - except jsonschema.exceptions.ValidationError as val_err: - raise ValueError( - "The JSON data does not match " - f"the provided schema: {self.schema}." - ) from val_err + # check against schema if provided + if self.schema: + try: + jsonschema.validate(instance=data, schema=self.schema) + except jsonschema.exceptions.ValidationError as val_err: + raise ValueError( + "The JSON data does not match " + f"the provided schema: {self.schema}." + ) from val_err @define diff --git a/tests/test_unit/test_annotations/test_validators.py b/tests/test_unit/test_annotations/test_validators.py index 4c9fd34..7fe9820 100644 --- a/tests/test_unit/test_annotations/test_validators.py +++ b/tests/test_unit/test_annotations/test_validators.py @@ -68,20 +68,28 @@ def coco_json_file_with_schema_error( @pytest.mark.parametrize( - "input_json_file, input_schema", + "input_json_file_suffix", + ["1", "2"], +) +@pytest.mark.parametrize( + "input_file_standard, input_schema", [ - ("VIA_JSON_sample_1.json", VIA_UNTRACKED_SCHEMA), - ("VIA_JSON_sample_2.json", VIA_UNTRACKED_SCHEMA), - ("COCO_JSON_sample_1.json", COCO_UNTRACKED_SCHEMA), - ("COCO_JSON_sample_2.json", COCO_UNTRACKED_SCHEMA), + ("VIA", VIA_UNTRACKED_SCHEMA), + ("VIA", None), + ("COCO", COCO_UNTRACKED_SCHEMA), + ("COCO", None), ], ) def test_valid_json( annotations_test_data, - input_json_file, + input_file_standard, + input_json_file_suffix, input_schema, ): """Test the ValidJSON validator with valid files.""" + input_json_file = ( + f"{input_file_standard}_JSON_sample_{input_json_file_suffix}.json" + ) input_json_file = annotations_test_data[input_json_file] with does_not_raise(): ValidJSON(path=input_json_file, schema=input_schema) From 8223701da977425cf803e66e06b093757b097396 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Mon, 16 Dec 2024 14:32:35 +0000 Subject: [PATCH 24/36] Add tests for schemas --- ethology/annotations/json_schemas.py | 1 - ethology/annotations/validators.py | 7 +- .../test_annotations/test_validators.py | 167 +++++++++--------- 3 files changed, 87 insertions(+), 88 deletions(-) diff --git a/ethology/annotations/json_schemas.py b/ethology/annotations/json_schemas.py index 7e9d265..161c924 100644 --- a/ethology/annotations/json_schemas.py +++ b/ethology/annotations/json_schemas.py @@ -66,7 +66,6 @@ }, } - COCO_UNTRACKED_SCHEMA = { "type": "object", "properties": { diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py index 1a0d22f..2ad2b4a 100644 --- a/ethology/annotations/validators.py +++ b/ethology/annotations/validators.py @@ -5,6 +5,7 @@ import jsonschema import jsonschema.exceptions +import jsonschema.validators from attrs import define, field, validators @@ -73,10 +74,8 @@ def _file_matches_JSON_schema(self, attribute, value): try: jsonschema.validate(instance=data, schema=self.schema) except jsonschema.exceptions.ValidationError as val_err: - raise ValueError( - "The JSON data does not match " - f"the provided schema: {self.schema}." - ) from val_err + # forward the error message as it is quite informative + raise val_err @define diff --git a/tests/test_unit/test_annotations/test_validators.py b/tests/test_unit/test_annotations/test_validators.py index 7fe9820..dd66ad9 100644 --- a/tests/test_unit/test_annotations/test_validators.py +++ b/tests/test_unit/test_annotations/test_validators.py @@ -1,6 +1,7 @@ import json from contextlib import nullcontext as does_not_raise +import jsonschema import pytest from ethology.annotations.json_schemas import ( @@ -25,74 +26,83 @@ def json_file_with_not_found_error(tmp_path): return tmp_path / "JSON_file_not_found.json" -@pytest.fixture() -def via_json_file_with_schema_error(tmp_path, annotations_test_data): +def _json_file_with_schema_error(out_parent_path, json_valid_path): """Return path to a JSON file that doesn't match the expected schema.""" # read valid json file - via_json_valid_filepath = annotations_test_data["VIA_JSON_sample_1.json"] - with open(via_json_valid_filepath) as f: + with open(json_valid_path) as f: data = json.load(f) - # change type of specific keys - # - change "_via_image_id_list" from list of strings to list of integers - # TODO: what if I change several? - data["_via_image_id_list"] = list(range(len(data["_via_image_id_list"]))) - - # save the modified data to a new file under tmp_path - out_json = tmp_path / "VIA_JSON_schema_error.json" + # modify so that it doesn't match the corresponding schema + if "VIA" in json_valid_path.name: + # change "width" of a bounding box from int to float + data["_via_img_metadata"][ + "09.08_09.08.2023-01-Left_frame_001764.png15086122" + ]["regions"][0]["shape_attributes"]["width"] = 49.5 + elif "COCO" in json_valid_path.name: + # change "annotations" from list of dicts to list of lists + data["annotations"] = [[d] for d in data["annotations"]] + + # save the modified json to a new file + out_json = out_parent_path / f"{json_valid_path.name}_schema_error.json" with open(out_json, "w") as f: json.dump(data, f) return out_json @pytest.fixture() -def coco_json_file_with_schema_error( +def via_json_file_with_schema_error( tmp_path, annotations_test_data, ): - """Return path to a JSON file that doesn't match the expected schema.""" - # read valid json file - via_json_valid_filepath = annotations_test_data["COCO_JSON_sample_1.json"] - with open(via_json_valid_filepath) as f: - data = json.load(f) + """Return path to a VIA JSON file that doesn't match its schema.""" + return _json_file_with_schema_error( + tmp_path, + annotations_test_data["VIA_JSON_sample_1.json"], + ) - # change "annotations" from list of dicts to list of lists - # TODO: what if I change several? - data["annotations"] = [[d] for d in data["annotations"]] - # save the modified data to a new file under tmp_path - out_json = tmp_path / "VIA_JSON_schema_error.json" - with open(out_json, "w") as f: - json.dump(data, f) - return out_json +@pytest.fixture() +def coco_json_file_with_schema_error( + tmp_path, + annotations_test_data, +): + """Return path to a COCO JSON file that doesn't match its schema.""" + return _json_file_with_schema_error( + tmp_path, + annotations_test_data["COCO_JSON_sample_1.json"], + ) -@pytest.mark.parametrize( - "input_json_file_suffix", - ["1", "2"], -) @pytest.mark.parametrize( "input_file_standard, input_schema", [ - ("VIA", VIA_UNTRACKED_SCHEMA), ("VIA", None), - ("COCO", COCO_UNTRACKED_SCHEMA), + ("VIA", VIA_UNTRACKED_SCHEMA), ("COCO", None), + ("COCO", COCO_UNTRACKED_SCHEMA), ], ) +@pytest.mark.parametrize( + "input_json_file_suffix", + ["JSON_sample_1.json", "JSON_sample_2.json"], +) def test_valid_json( - annotations_test_data, input_file_standard, input_json_file_suffix, input_schema, + annotations_test_data, ): """Test the ValidJSON validator with valid files.""" - input_json_file = ( - f"{input_file_standard}_JSON_sample_{input_json_file_suffix}.json" - ) - input_json_file = annotations_test_data[input_json_file] + # get path to file + filepath = annotations_test_data[ + f"{input_file_standard}_{input_json_file_suffix}" + ] + with does_not_raise(): - ValidJSON(path=input_json_file, schema=input_schema) + ValidJSON( + path=filepath, + schema=input_schema, + ) @pytest.mark.parametrize( @@ -100,27 +110,55 @@ def test_valid_json( [ ( "json_file_with_decode_error", - VIA_UNTRACKED_SCHEMA, # should be independent of schema + None, # should be independent of schema pytest.raises(ValueError), "Error decoding JSON data from file: {}.", ), ( "json_file_with_not_found_error", - VIA_UNTRACKED_SCHEMA, # should be independent of schema + None, # should be independent of schema pytest.raises(FileNotFoundError), "File not found: {}.", ), ( "via_json_file_with_schema_error", VIA_UNTRACKED_SCHEMA, - pytest.raises(ValueError), - "The JSON data does not match the provided schema: {}.", + pytest.raises(jsonschema.exceptions.ValidationError), + "49.5 is not of type 'integer'\n\n" + "Failed validating 'type' in " + "schema['properties']['_via_img_metadata']['additionalProperties']" + "['properties']['regions']['items']['properties']" + "['shape_attributes']['properties']['width']:\n" + " {'type': 'integer'}\n\n" + "On instance['_via_img_metadata']" + "['09.08_09.08.2023-01-Left_frame_001764.png15086122']['regions']" + "[0]['shape_attributes']['width']:\n" + " 49.5", ), ( "coco_json_file_with_schema_error", COCO_UNTRACKED_SCHEMA, - pytest.raises(ValueError), - "The JSON data does not match the provided schema: {}.", + pytest.raises(jsonschema.exceptions.ValidationError), + "[{'area': 432, 'bbox': [1278, 556, 16, 27], 'category_id': 1, " + "'id': 8917, 'image_id': 199, 'iscrowd': 0}] is not of type " + "'object'\n\n" + "Failed validating 'type' in " + "schema['properties']['annotations']['items']:\n" + " {'type': 'object',\n" + " 'properties': {'id': {'type': 'integer'},\n" + " 'image_id': {'type': 'integer'},\n" + " 'bbox': {'type': 'array', 'items': " + "{'type': 'integer'}},\n" + " 'category_id': {'type': 'integer'},\n" + " 'area': {'type': 'integer'},\n" + " 'iscrowd': {'type': 'integer'}}}\n\n" + "On instance['annotations'][4343]:\n" + " [{'area': 432,\n" + " 'bbox': [1278, 556, 16, 27],\n" + " 'category_id': 1,\n" + " 'id': 8917,\n" + " 'image_id': 199,\n" + " 'iscrowd': 0}]", ), ], ) @@ -131,50 +169,13 @@ def test_valid_json_error( log_message, request, ): - """Test the ValidJSON validator throws the expected error.""" + """Test the ValidJSON validator throws the expected errors.""" invalid_json_file = request.getfixturevalue(invalid_json_file_str) with expected_exception as excinfo: ValidJSON(path=invalid_json_file, schema=input_schema) - if "schema" in invalid_json_file_str: - assert str(excinfo.value) == log_message.format(input_schema) + if input_schema: + assert str(excinfo.value) == log_message else: assert str(excinfo.value) == log_message.format(invalid_json_file) - - -# @pytest.mark.parametrize( -# "valid_json_file, input_schema", -# [ -# ("VIA_JSON_sample_1.json", VIA_UNTRACKED_SCHEMA), -# ("COCO_JSON_sample_1.json", COCO_UNTRACKED_SCHEMA), -# ], -# ) -# @pytest.mark.parametrize( -# "invalid_json_factory, expected_exception, log_message", -# [ -# ( -# "get_json_file_with_schema_error", -# pytest.raises(ValueError), -# "The JSON data does not match the provided schema: {}.", -# ), -# ], -# ) -# def test_valid_json_schema_error( -# valid_json_file, -# input_schema, -# invalid_json_factory, -# expected_exception, -# log_message, -# tmp_path, -# request, -# ): -# """Test the ValidJSON validator throws the expected error.""" -# invalid_json_factory = request.getfixturevalue(invalid_json_factory) -# invalid_json_file = invalid_json_factory(valid_json_file) - -# with expected_exception as excinfo: -# ValidJSON(path=invalid_json_file, schema=input_schema) - -# if log_message: -# assert str(excinfo.value) == log_message.format(input_schema) From 92d6b74e59838c1ebc3d382726fa3e21cca660bd Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Mon, 16 Dec 2024 14:34:26 +0000 Subject: [PATCH 25/36] Reduce error message check for schema validation --- .../test_annotations/test_validators.py | 81 +++++++------------ 1 file changed, 27 insertions(+), 54 deletions(-) diff --git a/tests/test_unit/test_annotations/test_validators.py b/tests/test_unit/test_annotations/test_validators.py index dd66ad9..00a8662 100644 --- a/tests/test_unit/test_annotations/test_validators.py +++ b/tests/test_unit/test_annotations/test_validators.py @@ -26,29 +26,6 @@ def json_file_with_not_found_error(tmp_path): return tmp_path / "JSON_file_not_found.json" -def _json_file_with_schema_error(out_parent_path, json_valid_path): - """Return path to a JSON file that doesn't match the expected schema.""" - # read valid json file - with open(json_valid_path) as f: - data = json.load(f) - - # modify so that it doesn't match the corresponding schema - if "VIA" in json_valid_path.name: - # change "width" of a bounding box from int to float - data["_via_img_metadata"][ - "09.08_09.08.2023-01-Left_frame_001764.png15086122" - ]["regions"][0]["shape_attributes"]["width"] = 49.5 - elif "COCO" in json_valid_path.name: - # change "annotations" from list of dicts to list of lists - data["annotations"] = [[d] for d in data["annotations"]] - - # save the modified json to a new file - out_json = out_parent_path / f"{json_valid_path.name}_schema_error.json" - with open(out_json, "w") as f: - json.dump(data, f) - return out_json - - @pytest.fixture() def via_json_file_with_schema_error( tmp_path, @@ -73,6 +50,29 @@ def coco_json_file_with_schema_error( ) +def _json_file_with_schema_error(out_parent_path, json_valid_path): + """Return path to a JSON file that doesn't match the expected schema.""" + # read valid json file + with open(json_valid_path) as f: + data = json.load(f) + + # modify so that it doesn't match the corresponding schema + if "VIA" in json_valid_path.name: + # change "width" of a bounding box from int to float + data["_via_img_metadata"][ + "09.08_09.08.2023-01-Left_frame_001764.png15086122" + ]["regions"][0]["shape_attributes"]["width"] = 49.5 + elif "COCO" in json_valid_path.name: + # change "annotations" from list of dicts to list of lists + data["annotations"] = [[d] for d in data["annotations"]] + + # save the modified json to a new file + out_json = out_parent_path / f"{json_valid_path.name}_schema_error.json" + with open(out_json, "w") as f: + json.dump(data, f) + return out_json + + @pytest.mark.parametrize( "input_file_standard, input_schema", [ @@ -93,7 +93,6 @@ def test_valid_json( annotations_test_data, ): """Test the ValidJSON validator with valid files.""" - # get path to file filepath = annotations_test_data[ f"{input_file_standard}_{input_json_file_suffix}" ] @@ -124,16 +123,7 @@ def test_valid_json( "via_json_file_with_schema_error", VIA_UNTRACKED_SCHEMA, pytest.raises(jsonschema.exceptions.ValidationError), - "49.5 is not of type 'integer'\n\n" - "Failed validating 'type' in " - "schema['properties']['_via_img_metadata']['additionalProperties']" - "['properties']['regions']['items']['properties']" - "['shape_attributes']['properties']['width']:\n" - " {'type': 'integer'}\n\n" - "On instance['_via_img_metadata']" - "['09.08_09.08.2023-01-Left_frame_001764.png15086122']['regions']" - "[0]['shape_attributes']['width']:\n" - " 49.5", + "49.5 is not of type 'integer'\n\n", ), ( "coco_json_file_with_schema_error", @@ -141,24 +131,7 @@ def test_valid_json( pytest.raises(jsonschema.exceptions.ValidationError), "[{'area': 432, 'bbox': [1278, 556, 16, 27], 'category_id': 1, " "'id': 8917, 'image_id': 199, 'iscrowd': 0}] is not of type " - "'object'\n\n" - "Failed validating 'type' in " - "schema['properties']['annotations']['items']:\n" - " {'type': 'object',\n" - " 'properties': {'id': {'type': 'integer'},\n" - " 'image_id': {'type': 'integer'},\n" - " 'bbox': {'type': 'array', 'items': " - "{'type': 'integer'}},\n" - " 'category_id': {'type': 'integer'},\n" - " 'area': {'type': 'integer'},\n" - " 'iscrowd': {'type': 'integer'}}}\n\n" - "On instance['annotations'][4343]:\n" - " [{'area': 432,\n" - " 'bbox': [1278, 556, 16, 27],\n" - " 'category_id': 1,\n" - " 'id': 8917,\n" - " 'image_id': 199,\n" - " 'iscrowd': 0}]", + "'object'\n\n", ), ], ) @@ -176,6 +149,6 @@ def test_valid_json_error( ValidJSON(path=invalid_json_file, schema=input_schema) if input_schema: - assert str(excinfo.value) == log_message + assert log_message in str(excinfo.value) else: - assert str(excinfo.value) == log_message.format(invalid_json_file) + assert log_message.format(invalid_json_file) == str(excinfo.value) From a6523e1650226734aa3b9b94c5ce9ec3cf9a488f Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Mon, 16 Dec 2024 16:27:24 +0000 Subject: [PATCH 26/36] Add tests for keys check WIP --- ethology/annotations/validators.py | 87 +----------- .../test_annotations/test_validators.py | 125 +++++++++++++++++- 2 files changed, 129 insertions(+), 83 deletions(-) diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py index 2ad2b4a..e297978 100644 --- a/ethology/annotations/validators.py +++ b/ethology/annotations/validators.py @@ -84,9 +84,6 @@ class ValidVIAUntrackedJSON: Checks the VIA JSON file for untracked data contains the required keys. - Note that the validation against the schema does not check the existence - of the keys, only the type of their values if they exist. - Attributes ---------- path : pathlib.Path @@ -119,7 +116,7 @@ def _file_contains_required_keys(self, attribute, value): _check_keys( required_keys["image_keys"], img_dict, - additional_error_message=f"for {img_str}", + additional_error_message=f" for {img_str}", ) # Check keys for each region for region in img_dict["regions"]: @@ -129,7 +126,7 @@ def _file_contains_required_keys(self, attribute, value): _check_keys( required_keys["shape_attributes_keys"], region["shape_attributes"], - additional_error_message=f"for region under {img_str}", + additional_error_message=f" for region under {img_str}", ) @@ -159,78 +156,6 @@ class ValidCOCOUntrackedJSON: path: Path = field(validator=validators.instance_of(Path)) - # TODO: add a check for the presence of the keys - # that I use in loading the data - - @path.validator - def _file_macthes_COCO_JSON_schema(self, attribute, value): - """Ensure that the JSON file matches the expected schema.""" - # Define schema for VIA JSON file for untracked - # (aka manually labelled) data - COCO_JSON_schema = { - "type": "object", - "properties": { - "info": {"type": "object"}, - "licenses": { - "type": "array", - }, - "images": { - "type": "array", - "items": { - "type": "object", - "properties": { - "file_name": {"type": "string"}, - "id": {"type": "integer"}, - "width": {"type": "integer"}, - "height": {"type": "integer"}, - }, - }, - }, - "annotations": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": {"type": "integer"}, # annotation global ID - "image_id": {"type": "integer"}, - "bbox": { - "type": "array", - "items": {"type": "integer"}, - }, - "category_id": {"type": "integer"}, - "area": {"type": "integer"}, - "iscrowd": {"type": "integer"}, - }, - }, - }, - "categories": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": {"type": "integer"}, - "name": {"type": "string"}, - "supercategory": {"type": "string"}, - }, - }, - }, - }, - } - - # should have been validated with ValidJSON - # already so this should work fine - with open(value) as file: - data = json.load(file) - - # check against schema - try: - jsonschema.validate(instance=data, schema=COCO_JSON_schema) - except jsonschema.exceptions.ValidationError as val_err: - raise ValueError( - "The JSON data does not match " - f"the provided schema: {COCO_JSON_schema}" - ) from val_err - @path.validator def _file_contains_required_keys(self, attribute, value): """Ensure that the JSON file contains the required keys.""" @@ -256,7 +181,7 @@ def _file_contains_required_keys(self, attribute, value): _check_keys( required_keys["image_keys"], img_dict, - additional_error_message=f"for image dict {img_dict}", + additional_error_message=f" for image dict {img_dict}", ) # Check keys in annotations dicts @@ -264,7 +189,7 @@ def _file_contains_required_keys(self, attribute, value): _check_keys( required_keys["annotations_keys"], annot_dict, - additional_error_message=f"for annotation dict {annot_dict}", + additional_error_message=f" for annotation dict {annot_dict}", ) # Check keys in categories dicts @@ -272,7 +197,7 @@ def _file_contains_required_keys(self, attribute, value): _check_keys( required_keys["categories_keys"], cat_dict, - additional_error_message=f"for category dict {cat_dict}", + additional_error_message=f" for category dict {cat_dict}", ) @@ -285,7 +210,7 @@ def _check_keys( if missing_keys: raise ValueError( f"Required key(s) {missing_keys} not " - f"found in {list(data_dict.keys())} " + f"found in {list(data_dict.keys())}" + additional_error_message + "." ) diff --git a/tests/test_unit/test_annotations/test_validators.py b/tests/test_unit/test_annotations/test_validators.py index 00a8662..0f6b7d3 100644 --- a/tests/test_unit/test_annotations/test_validators.py +++ b/tests/test_unit/test_annotations/test_validators.py @@ -8,7 +8,10 @@ COCO_UNTRACKED_SCHEMA, VIA_UNTRACKED_SCHEMA, ) -from ethology.annotations.validators import ValidJSON +from ethology.annotations.validators import ( + ValidJSON, + ValidVIAUntrackedJSON, +) @pytest.fixture() @@ -73,6 +76,47 @@ def _json_file_with_schema_error(out_parent_path, json_valid_path): return out_json +@pytest.fixture() +def via_json_file_with_missing_keys(tmp_path, annotations_test_data): + def _via_json_file_with_missing_keys( + valid_json_filename, required_keys_to_pop + ): + """Return path to a JSON file that is missing required keys.""" + # read valid json file + valid_json_path = annotations_test_data[valid_json_filename] + with open(valid_json_path) as f: + data = json.load(f) + + # remove any keys in the first level + for key in required_keys_to_pop.get("main", []): + data.pop(key) + + # remove keys in nested dicts + for _, img_dict in data["_via_img_metadata"].items(): + # remove keys for each image dictionary + for key in required_keys_to_pop.get("image_keys", []): + img_dict.pop(key) + + for region in img_dict["regions"]: + # remove keys for each region + for key in required_keys_to_pop.get("region_keys", []): + region.pop(key) + + # remove keys under shape_attributes + for key in required_keys_to_pop.get( + "shape_attributes_keys", [] + ): + region["shape_attributes"].pop(key) + + # save the modified json to a new file + out_json = tmp_path / f"{valid_json_path.name}_missing_keys.json" + with open(out_json, "w") as f: + json.dump(data, f) + return out_json + + return _via_json_file_with_missing_keys + + @pytest.mark.parametrize( "input_file_standard, input_schema", [ @@ -135,7 +179,7 @@ def test_valid_json( ), ], ) -def test_valid_json_error( +def test_valid_json_errors( invalid_json_file_str, input_schema, expected_exception, @@ -152,3 +196,80 @@ def test_valid_json_error( assert log_message in str(excinfo.value) else: assert log_message.format(invalid_json_file) == str(excinfo.value) + + +@pytest.mark.parametrize( + "input_json_file", + [ + "VIA_JSON_sample_1.json", + "VIA_JSON_sample_2.json", + ], +) +def test_valid_via_untracked_json(annotations_test_data, input_json_file): + filepath = annotations_test_data[input_json_file] + with does_not_raise(): + ValidVIAUntrackedJSON( + path=filepath, + ) + + +@pytest.mark.parametrize( + "valid_json_file", + [ + "VIA_JSON_sample_1.json", + "VIA_JSON_sample_2.json", + ], +) +@pytest.mark.parametrize( + "missing_keys, expected_exception, log_message", + [ + ( + {"main": ["_via_image_id_list"]}, + pytest.raises(ValueError), + "Required key(s) {'_via_image_id_list'} not found " + "in ['_via_settings', '_via_img_metadata', '_via_attributes', " + "'_via_data_format_version'].", + ), + ( + {"image_keys": ["filename"]}, + pytest.raises(ValueError), + "Required key(s) {'filename'} not found " + "in ['size', 'regions', 'file_attributes'] " + "for 09.08_09.08.2023-01-Left_frame_001764.png15086122.", + ), + ( + {"region_keys": ["shape_attributes"]}, + pytest.raises(ValueError), + "The JSON data does not contain the required keys: annotations.", + ), + ( + {"shape_attributes_keys": ["x"]}, + pytest.raises(ValueError), + "The JSON data does not contain the required keys: annotations.", + ), + ], +) +def test_valid_via_untracked_json_missing_keys( + valid_json_file, + missing_keys, + via_json_file_with_missing_keys, + expected_exception, + log_message, +): + # create invalid json file with missing keys + invalid_json_file = via_json_file_with_missing_keys( + valid_json_file, missing_keys + ) + + # run validatio + with expected_exception as excinfo: + ValidVIAUntrackedJSON( + path=invalid_json_file, + ) + + assert str(excinfo.value) == log_message + + +# def test_valid_via_untracked_json ---> checks required keys +# def test_valid_coco_untracked_json ---> checks required keys +# def test_check_keys? From ab65d95802acb4ef8b9badb372054688c753d510 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Mon, 16 Dec 2024 17:15:28 +0000 Subject: [PATCH 27/36] Add test for VIA JSON untracked validator --- ethology/annotations/validators.py | 24 +++---- .../test_annotations/test_validators.py | 66 +++++++++++-------- 2 files changed, 53 insertions(+), 37 deletions(-) diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py index e297978..74b1462 100644 --- a/ethology/annotations/validators.py +++ b/ethology/annotations/validators.py @@ -116,17 +116,21 @@ def _file_contains_required_keys(self, attribute, value): _check_keys( required_keys["image_keys"], img_dict, - additional_error_message=f" for {img_str}", + additional_message=f" for {img_str}", ) # Check keys for each region - for region in img_dict["regions"]: - _check_keys(required_keys["region_keys"], region) + for i, region in enumerate(img_dict["regions"]): + _check_keys( + required_keys["region_keys"], + region, + additional_message=f" for region {i} under {img_str}", + ) # Check keys under shape_attributes _check_keys( required_keys["shape_attributes_keys"], region["shape_attributes"], - additional_error_message=f" for region under {img_str}", + additional_message=f" for region {i} under {img_str}", ) @@ -181,7 +185,7 @@ def _file_contains_required_keys(self, attribute, value): _check_keys( required_keys["image_keys"], img_dict, - additional_error_message=f" for image dict {img_dict}", + additional_message=f" for image dict {img_dict}", ) # Check keys in annotations dicts @@ -189,7 +193,7 @@ def _file_contains_required_keys(self, attribute, value): _check_keys( required_keys["annotations_keys"], annot_dict, - additional_error_message=f" for annotation dict {annot_dict}", + additional_message=f" for annotation dict {annot_dict}", ) # Check keys in categories dicts @@ -197,20 +201,18 @@ def _file_contains_required_keys(self, attribute, value): _check_keys( required_keys["categories_keys"], cat_dict, - additional_error_message=f" for category dict {cat_dict}", + additional_message=f" for category dict {cat_dict}", ) def _check_keys( list_required_keys: list[str], data_dict: dict, - additional_error_message: str = "", + additional_message: str = "", ): missing_keys = set(list_required_keys) - data_dict.keys() if missing_keys: raise ValueError( f"Required key(s) {missing_keys} not " - f"found in {list(data_dict.keys())}" - + additional_error_message - + "." + f"found in {list(data_dict.keys())}" + additional_message + "." ) diff --git a/tests/test_unit/test_annotations/test_validators.py b/tests/test_unit/test_annotations/test_validators.py index 0f6b7d3..4c96511 100644 --- a/tests/test_unit/test_annotations/test_validators.py +++ b/tests/test_unit/test_annotations/test_validators.py @@ -8,10 +8,7 @@ COCO_UNTRACKED_SCHEMA, VIA_UNTRACKED_SCHEMA, ) -from ethology.annotations.validators import ( - ValidJSON, - ValidVIAUntrackedJSON, -) +from ethology.annotations.validators import ValidJSON, ValidVIAUntrackedJSON @pytest.fixture() @@ -81,7 +78,12 @@ def via_json_file_with_missing_keys(tmp_path, annotations_test_data): def _via_json_file_with_missing_keys( valid_json_filename, required_keys_to_pop ): - """Return path to a JSON file that is missing required keys.""" + """Return path to a JSON file that is missing required keys. + + If a key to pop refers to a nested dictionary, it is removed from + the first element. + + """ # read valid json file valid_json_path = annotations_test_data[valid_json_filename] with open(valid_json_path) as f: @@ -91,28 +93,35 @@ def _via_json_file_with_missing_keys( for key in required_keys_to_pop.get("main", []): data.pop(key) - # remove keys in nested dicts - for _, img_dict in data["_via_img_metadata"].items(): - # remove keys for each image dictionary + # remove keys in nested dictionaries + edited_image_dicts = {} + if "_via_img_metadata" in data: + # remove image keys for first image dictionary + img_str, img_dict = list(data["_via_img_metadata"].items())[ + 0 + ] # list(data["_via_img_metadata"].values())[0] for key in required_keys_to_pop.get("image_keys", []): img_dict.pop(key) + edited_image_dicts["image_keys"] = img_str - for region in img_dict["regions"]: - # remove keys for each region - for key in required_keys_to_pop.get("region_keys", []): - region.pop(key) + # remove region keys for first region under second image dictionary + img_str, img_dict = list(data["_via_img_metadata"].items())[1] + for key in required_keys_to_pop.get("region_keys", []): + img_dict["regions"][0].pop(key) + edited_image_dicts["region_keys"] = img_str - # remove keys under shape_attributes - for key in required_keys_to_pop.get( - "shape_attributes_keys", [] - ): - region["shape_attributes"].pop(key) + # remove shape_attributes keys for first region under third image + # dictionary + img_str, img_dict = list(data["_via_img_metadata"].items())[2] + for key in required_keys_to_pop.get("shape_attributes_keys", []): + img_dict["regions"][0]["shape_attributes"].pop(key) + edited_image_dicts["shape_attributes_keys"] = img_str # save the modified json to a new file out_json = tmp_path / f"{valid_json_path.name}_missing_keys.json" with open(out_json, "w") as f: json.dump(data, f) - return out_json + return out_json, edited_image_dicts return _via_json_file_with_missing_keys @@ -226,26 +235,28 @@ def test_valid_via_untracked_json(annotations_test_data, input_json_file): ( {"main": ["_via_image_id_list"]}, pytest.raises(ValueError), - "Required key(s) {'_via_image_id_list'} not found " + "Required key(s) {{'_via_image_id_list'}} not found " "in ['_via_settings', '_via_img_metadata', '_via_attributes', " "'_via_data_format_version'].", ), ( {"image_keys": ["filename"]}, pytest.raises(ValueError), - "Required key(s) {'filename'} not found " + "Required key(s) {{'filename'}} not found " "in ['size', 'regions', 'file_attributes'] " - "for 09.08_09.08.2023-01-Left_frame_001764.png15086122.", + "for {}.", ), ( {"region_keys": ["shape_attributes"]}, pytest.raises(ValueError), - "The JSON data does not contain the required keys: annotations.", + "Required key(s) {{'shape_attributes'}} not found in " + "['region_attributes'] for region 0 under {}.", ), ( {"shape_attributes_keys": ["x"]}, pytest.raises(ValueError), - "The JSON data does not contain the required keys: annotations.", + "Required key(s) {{'x'}} not found in " + "['name', 'y', 'width', 'height'] for region 0 under {}.", ), ], ) @@ -257,17 +268,20 @@ def test_valid_via_untracked_json_missing_keys( log_message, ): # create invalid json file with missing keys - invalid_json_file = via_json_file_with_missing_keys( + invalid_json_file, edited_image_dicts = via_json_file_with_missing_keys( valid_json_file, missing_keys ) - # run validatio + # get key of affected image in _via_img_metadata + img_key_str = edited_image_dicts.get(list(missing_keys.keys())[0], None) + + # run validation with expected_exception as excinfo: ValidVIAUntrackedJSON( path=invalid_json_file, ) - assert str(excinfo.value) == log_message + assert str(excinfo.value) == log_message.format(img_key_str) # def test_valid_via_untracked_json ---> checks required keys From 3e241865d0cd6e7a72442824a04317c7302568e2 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Mon, 16 Dec 2024 17:58:02 +0000 Subject: [PATCH 28/36] Add test for valid_coco_untracked_json --- ethology/annotations/validators.py | 7 +- .../test_annotations/test_validators.py | 162 +++++++++++++++--- 2 files changed, 142 insertions(+), 27 deletions(-) diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py index 74b1462..f433577 100644 --- a/ethology/annotations/validators.py +++ b/ethology/annotations/validators.py @@ -165,10 +165,7 @@ def _file_contains_required_keys(self, attribute, value): """Ensure that the JSON file contains the required keys.""" required_keys = { "main": ["images", "annotations", "categories"], - "image_keys": [ - "id", - "file_name", - ], # add height and width of image? + "image_keys": ["id", "file_name"], # "height", "width"? "annotations_keys": ["id", "image_id", "bbox", "category_id"], "categories_keys": ["id", "name", "supercategory"], } @@ -213,6 +210,6 @@ def _check_keys( missing_keys = set(list_required_keys) - data_dict.keys() if missing_keys: raise ValueError( - f"Required key(s) {missing_keys} not " + f"Required key(s) {sorted(missing_keys)} not " f"found in {list(data_dict.keys())}" + additional_message + "." ) diff --git a/tests/test_unit/test_annotations/test_validators.py b/tests/test_unit/test_annotations/test_validators.py index 4c96511..a3fb678 100644 --- a/tests/test_unit/test_annotations/test_validators.py +++ b/tests/test_unit/test_annotations/test_validators.py @@ -8,7 +8,11 @@ COCO_UNTRACKED_SCHEMA, VIA_UNTRACKED_SCHEMA, ) -from ethology.annotations.validators import ValidJSON, ValidVIAUntrackedJSON +from ethology.annotations.validators import ( + ValidCOCOUntrackedJSON, + ValidJSON, + ValidVIAUntrackedJSON, +) @pytest.fixture() @@ -75,15 +79,12 @@ def _json_file_with_schema_error(out_parent_path, json_valid_path): @pytest.fixture() def via_json_file_with_missing_keys(tmp_path, annotations_test_data): + """Return factory of paths to VIA JSON files with missing required keys.""" + def _via_json_file_with_missing_keys( valid_json_filename, required_keys_to_pop ): - """Return path to a JSON file that is missing required keys. - - If a key to pop refers to a nested dictionary, it is removed from - the first element. - - """ + """Return path to a JSON file that is missing required keys.""" # read valid json file valid_json_path = annotations_test_data[valid_json_filename] with open(valid_json_path) as f: @@ -96,26 +97,26 @@ def _via_json_file_with_missing_keys( # remove keys in nested dictionaries edited_image_dicts = {} if "_via_img_metadata" in data: + list_img_metadata_tuples = list(data["_via_img_metadata"].items()) + # remove image keys for first image dictionary - img_str, img_dict = list(data["_via_img_metadata"].items())[ - 0 - ] # list(data["_via_img_metadata"].values())[0] + img_str, img_dict = list_img_metadata_tuples[0] + edited_image_dicts["image_keys"] = img_str for key in required_keys_to_pop.get("image_keys", []): img_dict.pop(key) - edited_image_dicts["image_keys"] = img_str # remove region keys for first region under second image dictionary - img_str, img_dict = list(data["_via_img_metadata"].items())[1] + img_str, img_dict = list_img_metadata_tuples[1] + edited_image_dicts["region_keys"] = img_str for key in required_keys_to_pop.get("region_keys", []): img_dict["regions"][0].pop(key) - edited_image_dicts["region_keys"] = img_str # remove shape_attributes keys for first region under third image # dictionary - img_str, img_dict = list(data["_via_img_metadata"].items())[2] + img_str, img_dict = list_img_metadata_tuples[2] + edited_image_dicts["shape_attributes_keys"] = img_str for key in required_keys_to_pop.get("shape_attributes_keys", []): img_dict["regions"][0]["shape_attributes"].pop(key) - edited_image_dicts["shape_attributes_keys"] = img_str # save the modified json to a new file out_json = tmp_path / f"{valid_json_path.name}_missing_keys.json" @@ -126,6 +127,54 @@ def _via_json_file_with_missing_keys( return _via_json_file_with_missing_keys +@pytest.fixture() +def coco_json_file_with_missing_keys(tmp_path, annotations_test_data): + """Return factory of paths to COCO JSON files with missing required + keys. + """ + + def _coco_json_file_with_missing_keys( + valid_json_filename, required_keys_to_pop + ): + """Return path to a JSON file that is missing required keys.""" + # read valid json file + valid_json_path = annotations_test_data[valid_json_filename] + with open(valid_json_path) as f: + data = json.load(f) + + # remove any keys in the first level + for key in required_keys_to_pop.get("main", []): + data.pop(key) + + edited_image_dicts = {} + + # remove required keys in first images dictionary + if "images" in data: + edited_image_dicts["image_keys"] = data["images"][0] + for key in required_keys_to_pop.get("image_keys", []): + data["images"][0].pop(key) + + # remove required keys in first annotations dictionary + if "annotations" in data: + edited_image_dicts["annotations_keys"] = data["annotations"][0] + for key in required_keys_to_pop.get("annotations_keys", []): + data["annotations"][0].pop(key) + + # remove required keys in first categories dictionary + if "categories" in data: + edited_image_dicts["categories_keys"] = data["categories"][0] + for key in required_keys_to_pop.get("categories_keys", []): + data["categories"][0].pop(key) + + # save the modified json to a new file + out_json = tmp_path / f"{valid_json_path.name}_missing_keys.json" + with open(out_json, "w") as f: + json.dump(data, f) + return out_json, edited_image_dicts + + return _coco_json_file_with_missing_keys + + @pytest.mark.parametrize( "input_file_standard, input_schema", [ @@ -235,27 +284,34 @@ def test_valid_via_untracked_json(annotations_test_data, input_json_file): ( {"main": ["_via_image_id_list"]}, pytest.raises(ValueError), - "Required key(s) {{'_via_image_id_list'}} not found " + "Required key(s) ['_via_image_id_list'] not found " "in ['_via_settings', '_via_img_metadata', '_via_attributes', " "'_via_data_format_version'].", ), + ( + {"main": ["_via_image_id_list", "_via_img_metadata"]}, + pytest.raises(ValueError), + "Required key(s) ['_via_image_id_list', '_via_img_metadata'] " + "not found in ['_via_settings', '_via_attributes', " + "'_via_data_format_version'].", + ), ( {"image_keys": ["filename"]}, pytest.raises(ValueError), - "Required key(s) {{'filename'}} not found " + "Required key(s) ['filename'] not found " "in ['size', 'regions', 'file_attributes'] " "for {}.", ), ( {"region_keys": ["shape_attributes"]}, pytest.raises(ValueError), - "Required key(s) {{'shape_attributes'}} not found in " + "Required key(s) ['shape_attributes'] not found in " "['region_attributes'] for region 0 under {}.", ), ( {"shape_attributes_keys": ["x"]}, pytest.raises(ValueError), - "Required key(s) {{'x'}} not found in " + "Required key(s) ['x'] not found in " "['name', 'y', 'width', 'height'] for region 0 under {}.", ), ], @@ -284,6 +340,68 @@ def test_valid_via_untracked_json_missing_keys( assert str(excinfo.value) == log_message.format(img_key_str) -# def test_valid_via_untracked_json ---> checks required keys -# def test_valid_coco_untracked_json ---> checks required keys -# def test_check_keys? +@pytest.mark.parametrize( + "valid_json_file", + [ + "COCO_JSON_sample_1.json", + "COCO_JSON_sample_2.json", + ], +) +@pytest.mark.parametrize( + "missing_keys, expected_exception, log_message", + [ + ( + {"main": ["categories"]}, + pytest.raises(ValueError), + "Required key(s) ['categories'] not found " + "in ['annotations', 'images', 'info', 'licenses'].", + ), + ( + {"main": ["categories", "images"]}, + pytest.raises(ValueError), + "Required key(s) ['categories', 'images'] not found " + "in ['annotations', 'info', 'licenses'].", + ), + ( + {"image_keys": ["file_name"]}, + pytest.raises(ValueError), + "Required key(s) ['file_name'] not found in " + "['height', 'id', 'width'] for image dict {}.", + ), + ( + {"annotations_keys": ["category_id"]}, + pytest.raises(ValueError), + "Required key(s) ['category_id'] not found in " + "['area', 'bbox', 'id', 'image_id', 'iscrowd'] for " + "annotation dict {}.", + ), + ( + {"categories_keys": ["id"]}, + pytest.raises(ValueError), + "Required key(s) ['id'] not found in " + "['name', 'supercategory'] for category dict {}.", + ), + ], +) +def test_valid_coco_untracked_json( + valid_json_file, + missing_keys, + coco_json_file_with_missing_keys, + expected_exception, + log_message, +): + # create invalid json file with missing keys + invalid_json_file, edited_image_dicts = coco_json_file_with_missing_keys( + valid_json_file, missing_keys + ) + + # get key of affected image in _via_img_metadata + img_dict = edited_image_dicts.get(list(missing_keys.keys())[0], None) + + # run validation + with expected_exception as excinfo: + ValidCOCOUntrackedJSON( + path=invalid_json_file, + ) + + assert str(excinfo.value) == log_message.format(img_dict) From e8fcb64ba29db7c657f04e6fbbeb7b49cd2b6d6e Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Mon, 16 Dec 2024 18:15:27 +0000 Subject: [PATCH 29/36] Add test for check_keys --- ethology/annotations/validators.py | 2 +- .../test_annotations/test_validators.py | 54 +++++++++++++++++-- 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py index f433577..13deb1b 100644 --- a/ethology/annotations/validators.py +++ b/ethology/annotations/validators.py @@ -211,5 +211,5 @@ def _check_keys( if missing_keys: raise ValueError( f"Required key(s) {sorted(missing_keys)} not " - f"found in {list(data_dict.keys())}" + additional_message + "." + f"found in {list(data_dict.keys())}{additional_message}." ) diff --git a/tests/test_unit/test_annotations/test_validators.py b/tests/test_unit/test_annotations/test_validators.py index a3fb678..d290de4 100644 --- a/tests/test_unit/test_annotations/test_validators.py +++ b/tests/test_unit/test_annotations/test_validators.py @@ -12,6 +12,7 @@ ValidCOCOUntrackedJSON, ValidJSON, ValidVIAUntrackedJSON, + _check_keys, ) @@ -61,13 +62,12 @@ def _json_file_with_schema_error(out_parent_path, json_valid_path): data = json.load(f) # modify so that it doesn't match the corresponding schema + # if VIA, change "width" of a bounding box from int to float + # if COCO, change "annotations" from list of dicts to list of lists if "VIA" in json_valid_path.name: - # change "width" of a bounding box from int to float - data["_via_img_metadata"][ - "09.08_09.08.2023-01-Left_frame_001764.png15086122" - ]["regions"][0]["shape_attributes"]["width"] = 49.5 + _, img_dict = list(data["_via_img_metadata"].items())[0] + img_dict["regions"][0]["shape_attributes"]["width"] = 49.5 elif "COCO" in json_valid_path.name: - # change "annotations" from list of dicts to list of lists data["annotations"] = [[d] for d in data["annotations"]] # save the modified json to a new file @@ -405,3 +405,47 @@ def test_valid_coco_untracked_json( ) assert str(excinfo.value) == log_message.format(img_dict) + + +@pytest.mark.parametrize( + "list_required_keys, data_dict, additional_message, expected_exception", + [ + ( + ["images", "annotations", "categories"], + {"images": "", "annotations": "", "categories": ""}, + "", + does_not_raise(), + ), + ( + ["images", "annotations", "categories"], + {"annotations": "", "categories": ""}, + "", + pytest.raises(ValueError), + ), # one missing key + ( + ["images", "annotations", "categories"], + {"annotations": ""}, + "", + pytest.raises(ValueError), + ), # two missing keys + ( + ["images", "annotations", "categories"], + {"annotations": "", "categories": ""}, + "TEST", + pytest.raises(ValueError), + ), # one missing key + ], +) +def test_check_keys( + list_required_keys, data_dict, additional_message, expected_exception +): + """Test the _check_keys helper function.""" + with expected_exception as excinfo: + _check_keys(list_required_keys, data_dict, additional_message) + + if excinfo: + missing_keys = set(list_required_keys) - data_dict.keys() + assert str(excinfo.value) == ( + f"Required key(s) {sorted(missing_keys)} not " + f"found in {list(data_dict.keys())}{additional_message}." + ) From 06b47c122d93c2a93a2b5b61e36ea44b893d77cc Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Mon, 16 Dec 2024 18:30:22 +0000 Subject: [PATCH 30/36] Remove untracked from names --- ethology/annotations/io.py | 17 +++--- ethology/annotations/json_schemas.py | 4 +- ethology/annotations/validators.py | 4 +- tests/conftest.py | 10 ++-- .../test_annotations/test_validators.py | 56 +++++++++---------- 5 files changed, 44 insertions(+), 47 deletions(-) diff --git a/ethology/annotations/io.py b/ethology/annotations/io.py index e62d9f0..1cf137e 100644 --- a/ethology/annotations/io.py +++ b/ethology/annotations/io.py @@ -6,14 +6,11 @@ import pandas as pd from movement.validators.files import ValidFile -from ethology.annotations.json_schemas import ( - COCO_UNTRACKED_SCHEMA, - VIA_UNTRACKED_SCHEMA, -) +from ethology.annotations.json_schemas import COCO_SCHEMA, VIA_SCHEMA from ethology.annotations.validators import ( - ValidCOCOUntrackedJSON, + ValidCOCOJSON, ValidJSON, - ValidVIAUntrackedJSON, + ValidVIAJSON, ) STANDARD_DF_COLUMNS = [ @@ -49,8 +46,8 @@ def df_from_via_json_file(file_path: Path) -> pd.DataFrame: file = ValidFile( file_path, expected_permission="r", expected_suffix=[".json"] ) - json_file = ValidJSON(path=file.path, schema=VIA_UNTRACKED_SCHEMA) - via_untracked_file = ValidVIAUntrackedJSON(json_file.path) + json_file = ValidJSON(path=file.path, schema=VIA_SCHEMA) + via_untracked_file = ValidVIAJSON(json_file.path) # Read as standard dataframe return _df_from_validated_via_json_file(via_untracked_file.path) @@ -76,8 +73,8 @@ def df_from_coco_json_file(file_path: Path) -> pd.DataFrame: file = ValidFile( file_path, expected_permission="r", expected_suffix=[".json"] ) - json_file = ValidJSON(path=file.path, schema=COCO_UNTRACKED_SCHEMA) - coco_untracked_file = ValidCOCOUntrackedJSON(json_file.path) + json_file = ValidJSON(path=file.path, schema=COCO_SCHEMA) + coco_untracked_file = ValidCOCOJSON(json_file.path) # Read as standard dataframe return _df_from_validated_coco_json_file(coco_untracked_file.path) diff --git a/ethology/annotations/json_schemas.py b/ethology/annotations/json_schemas.py index 161c924..79a6773 100644 --- a/ethology/annotations/json_schemas.py +++ b/ethology/annotations/json_schemas.py @@ -1,6 +1,6 @@ """JSON schemas for VIA and COCO annotations.""" -VIA_UNTRACKED_SCHEMA = { +VIA_SCHEMA = { "type": "object", "properties": { # settings for browser UI @@ -66,7 +66,7 @@ }, } -COCO_UNTRACKED_SCHEMA = { +COCO_SCHEMA = { "type": "object", "properties": { "info": {"type": "object"}, diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py index 13deb1b..f03bb55 100644 --- a/ethology/annotations/validators.py +++ b/ethology/annotations/validators.py @@ -79,7 +79,7 @@ def _file_matches_JSON_schema(self, attribute, value): @define -class ValidVIAUntrackedJSON: +class ValidVIAJSON: """Class for validating VIA JSON files for untracked data. Checks the VIA JSON file for untracked data contains the required keys. @@ -135,7 +135,7 @@ def _file_contains_required_keys(self, attribute, value): @define -class ValidCOCOUntrackedJSON: +class ValidCOCOJSON: """Class for validating COCO JSON files for untracked data. The validator ensures that the file matches the expected schema. diff --git a/tests/conftest.py b/tests/conftest.py index 37d1041..28d0ec7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -59,14 +59,14 @@ def pooch_registry() -> dict: @pytest.fixture() def get_paths_test_data(): """Define a factory fixture to get the paths of the data files - under a specific zip. + under a specific subdirectory in the GIN repository. - The name of the zip file is intended to match a testing module. For + The name of the subdirectories is intended to match a testing module. For example, to get the paths to the test files for the annotations - tests module, we would call `get_paths_test_data(pooch_registry, + module, we would call `get_paths_test_data(pooch_registry, "test_annotations")` in a test. This assumes in the GIN repository - there is a zip file named `test_annotations.zip` under the `test_data` - directory containing the relevant test files. + there is a subdirectory named `test_annotations` under the `test_data` + directory with the relevant test files. """ def _get_paths_test_data(pooch_registry, subdir_name: str) -> dict: diff --git a/tests/test_unit/test_annotations/test_validators.py b/tests/test_unit/test_annotations/test_validators.py index d290de4..604c7c4 100644 --- a/tests/test_unit/test_annotations/test_validators.py +++ b/tests/test_unit/test_annotations/test_validators.py @@ -4,14 +4,11 @@ import jsonschema import pytest -from ethology.annotations.json_schemas import ( - COCO_UNTRACKED_SCHEMA, - VIA_UNTRACKED_SCHEMA, -) +from ethology.annotations.json_schemas import COCO_SCHEMA, VIA_SCHEMA from ethology.annotations.validators import ( - ValidCOCOUntrackedJSON, + ValidCOCOJSON, ValidJSON, - ValidVIAUntrackedJSON, + ValidVIAJSON, _check_keys, ) @@ -179,9 +176,9 @@ def _coco_json_file_with_missing_keys( "input_file_standard, input_schema", [ ("VIA", None), - ("VIA", VIA_UNTRACKED_SCHEMA), + ("VIA", VIA_SCHEMA), ("COCO", None), - ("COCO", COCO_UNTRACKED_SCHEMA), + ("COCO", COCO_SCHEMA), ], ) @pytest.mark.parametrize( @@ -194,7 +191,7 @@ def test_valid_json( input_schema, annotations_test_data, ): - """Test the ValidJSON validator with valid files.""" + """Test the ValidJSON validator with valid inputs.""" filepath = annotations_test_data[ f"{input_file_standard}_{input_json_file_suffix}" ] @@ -223,13 +220,13 @@ def test_valid_json( ), ( "via_json_file_with_schema_error", - VIA_UNTRACKED_SCHEMA, + VIA_SCHEMA, pytest.raises(jsonschema.exceptions.ValidationError), "49.5 is not of type 'integer'\n\n", ), ( "coco_json_file_with_schema_error", - COCO_UNTRACKED_SCHEMA, + COCO_SCHEMA, pytest.raises(jsonschema.exceptions.ValidationError), "[{'area': 432, 'bbox': [1278, 556, 16, 27], 'category_id': 1, " "'id': 8917, 'image_id': 199, 'iscrowd': 0}] is not of type " @@ -263,16 +260,17 @@ def test_valid_json_errors( "VIA_JSON_sample_2.json", ], ) -def test_valid_via_untracked_json(annotations_test_data, input_json_file): +def test_valid_via_json(annotations_test_data, input_json_file): + """Test the ValidVIAJSON validator with valid inputs.""" filepath = annotations_test_data[input_json_file] with does_not_raise(): - ValidVIAUntrackedJSON( + ValidVIAJSON( path=filepath, ) @pytest.mark.parametrize( - "valid_json_file", + "valid_via_json_file", [ "VIA_JSON_sample_1.json", "VIA_JSON_sample_2.json", @@ -316,24 +314,25 @@ def test_valid_via_untracked_json(annotations_test_data, input_json_file): ), ], ) -def test_valid_via_untracked_json_missing_keys( - valid_json_file, +def test_valid_via_json_missing_keys( + valid_via_json_file, missing_keys, via_json_file_with_missing_keys, expected_exception, log_message, ): - # create invalid json file with missing keys + """Test the ValidVIAJSON when input has missing keys.""" + # create invalid VIA json file with missing keys invalid_json_file, edited_image_dicts = via_json_file_with_missing_keys( - valid_json_file, missing_keys + valid_via_json_file, missing_keys ) - # get key of affected image in _via_img_metadata + # get key of affected images in _via_img_metadata img_key_str = edited_image_dicts.get(list(missing_keys.keys())[0], None) # run validation with expected_exception as excinfo: - ValidVIAUntrackedJSON( + ValidVIAJSON( path=invalid_json_file, ) @@ -341,7 +340,7 @@ def test_valid_via_untracked_json_missing_keys( @pytest.mark.parametrize( - "valid_json_file", + "valid_coco_json_file", [ "COCO_JSON_sample_1.json", "COCO_JSON_sample_2.json", @@ -383,16 +382,17 @@ def test_valid_via_untracked_json_missing_keys( ), ], ) -def test_valid_coco_untracked_json( - valid_json_file, +def test_valid_coco_json_missing_keys( + valid_coco_json_file, missing_keys, coco_json_file_with_missing_keys, expected_exception, log_message, ): + """Test the ValidCOCOJSON when input has missing keys.""" # create invalid json file with missing keys invalid_json_file, edited_image_dicts = coco_json_file_with_missing_keys( - valid_json_file, missing_keys + valid_coco_json_file, missing_keys ) # get key of affected image in _via_img_metadata @@ -400,7 +400,7 @@ def test_valid_coco_untracked_json( # run validation with expected_exception as excinfo: - ValidCOCOUntrackedJSON( + ValidCOCOJSON( path=invalid_json_file, ) @@ -415,7 +415,7 @@ def test_valid_coco_untracked_json( {"images": "", "annotations": "", "categories": ""}, "", does_not_raise(), - ), + ), # zero missing keys ( ["images", "annotations", "categories"], {"annotations": "", "categories": ""}, @@ -431,9 +431,9 @@ def test_valid_coco_untracked_json( ( ["images", "annotations", "categories"], {"annotations": "", "categories": ""}, - "TEST", + "FOO", pytest.raises(ValueError), - ), # one missing key + ), # one missing key with additional message ], ) def test_check_keys( From 147eb11be7fa1cb05c524e3525241e79ce854ec2 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Mon, 16 Dec 2024 18:45:08 +0000 Subject: [PATCH 31/36] Remove dataloader (now in separate PR) --- ethology/annotations/io.py | 195 ------------------------------------- 1 file changed, 195 deletions(-) delete mode 100644 ethology/annotations/io.py diff --git a/ethology/annotations/io.py b/ethology/annotations/io.py deleted file mode 100644 index 1cf137e..0000000 --- a/ethology/annotations/io.py +++ /dev/null @@ -1,195 +0,0 @@ -"""Module for reading and writing manually labelled annotations.""" - -import json -from pathlib import Path - -import pandas as pd -from movement.validators.files import ValidFile - -from ethology.annotations.json_schemas import COCO_SCHEMA, VIA_SCHEMA -from ethology.annotations.validators import ( - ValidCOCOJSON, - ValidJSON, - ValidVIAJSON, -) - -STANDARD_DF_COLUMNS = [ - "annotation_id", - "image_filename", - "image_id", - "x_min", - "y_min", - "width", - "height", - "supercategory", - "category", -] - - -def df_from_via_json_file(file_path: Path) -> pd.DataFrame: - """Validate and read untracked VIA JSON file. - - The data is formatted as an untracked annotations DataFrame. - - Parameters - ---------- - file_path : Path - Path to the untracked VIA JSON file. - - Returns - ------- - pd.DataFrame - Untracked annotations DataFrame. - - """ - # Run validators - file = ValidFile( - file_path, expected_permission="r", expected_suffix=[".json"] - ) - json_file = ValidJSON(path=file.path, schema=VIA_SCHEMA) - via_untracked_file = ValidVIAJSON(json_file.path) - - # Read as standard dataframe - return _df_from_validated_via_json_file(via_untracked_file.path) - - -def df_from_coco_json_file(file_path: Path) -> pd.DataFrame: - """Validate and read untracked COCO JSON file. - - The data is formatted as an untracked annotations DataFrame. - - Parameters - ---------- - file_path : Path - Path to the untracked COCO JSON file. - - Returns - ------- - pd.DataFrame - Untracked annotations DataFrame. - - """ - # Run validators - file = ValidFile( - file_path, expected_permission="r", expected_suffix=[".json"] - ) - json_file = ValidJSON(path=file.path, schema=COCO_SCHEMA) - coco_untracked_file = ValidCOCOJSON(json_file.path) - - # Read as standard dataframe - return _df_from_validated_coco_json_file(coco_untracked_file.path) - - -def _df_from_validated_via_json_file(file_path): - """Read VIA JSON file as standard untracked annotations DataFrame.""" - # Read validated json as dict - with open(file_path) as file: - data_dict = json.load(file) - - # Prepare data - image_metadata_dict = data_dict["_via_img_metadata"] - via_image_id_list = data_dict[ - "_via_image_id_list" - ] # ordered list of the keys in image_metadata_dict - - # map filename to keys in image_metadata_dict - map_filename_to_via_img_id = { - img_dict["filename"]: ky - for ky, img_dict in image_metadata_dict.items() - } - - # Build standard dataframe - list_rows = [] - # loop thru images - for _, img_dict in image_metadata_dict.items(): - # loop thru annotations in the image - for region in img_dict["regions"]: - region_shape = region["shape_attributes"] - region_attributes = region["region_attributes"] - - row = { - "image_filename": img_dict["filename"], - "x_min": region_shape["x"], - "y_min": region_shape["y"], - "width": region_shape["width"], - "height": region_shape["height"], - "supercategory": list(region_attributes.keys())[ - 0 - ], # takes first key as supercategory - "category": region_attributes[ - list(region_attributes.keys())[0] - ], - } - - # append annotations to df - list_rows.append(row) - - df = pd.DataFrame( - list_rows, - # columns=list(row.keys()), # do I need this? - ) - - # add image_id column - df["image_id"] = df["image_filename"].apply( - lambda x: via_image_id_list.index(map_filename_to_via_img_id[x]) - ) - - # add annotation_id column based on index - df["annotation_id"] = df.index - - # reorder columns to match standard - df = df.reindex(columns=STANDARD_DF_COLUMNS) - - return df - - -def _df_from_validated_coco_json_file(file_path: Path) -> pd.DataFrame: - """Read COCO JSON file as standard untracked annotations DataFrame.""" - # Read validated json as dict - with open(file_path) as file: - data_dict = json.load(file) - - # Prepare data - map_image_id_to_filename = { - img_dict["id"]: img_dict["file_name"] - for img_dict in data_dict["images"] - } - - map_category_id_to_category_data = { - cat_dict["id"]: (cat_dict["name"], cat_dict["supercategory"]) - for cat_dict in data_dict["categories"] - } - - # Build standard dataframe - list_rows = [] - for annot_dict in data_dict["annotations"]: - annotation_id = annot_dict["id"] - # image data - image_id = annot_dict["image_id"] - image_filename = map_image_id_to_filename[image_id] - - # bbox data - x_min, y_min, width, height = annot_dict["bbox"] - - # class data - category_id = annot_dict["category_id"] - category, supercategory = map_category_id_to_category_data[category_id] - - row = { - "annotation_id": annotation_id, - "image_filename": image_filename, - "image_id": image_id, - "x_min": x_min, - "y_min": y_min, - "width": width, - "height": height, - "supercategory": supercategory, - "category": category, - } - - list_rows.append(row) - - df = pd.DataFrame(list_rows) - df.reindex(columns=STANDARD_DF_COLUMNS) - - return df From 3af9b4e159865a641a0cfea56b7fded48ef4ec7c Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Mon, 16 Dec 2024 18:55:59 +0000 Subject: [PATCH 32/36] Add references to JSON schemas --- ethology/annotations/json_schemas.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ethology/annotations/json_schemas.py b/ethology/annotations/json_schemas.py index 79a6773..99d9387 100644 --- a/ethology/annotations/json_schemas.py +++ b/ethology/annotations/json_schemas.py @@ -1,4 +1,11 @@ -"""JSON schemas for VIA and COCO annotations.""" +"""JSON schemas for VIA and COCO annotations. + +References +---------- +- https://github.com/python-jsonschema/jsonschema +- https://json-schema.org/understanding-json-schema/ + +""" VIA_SCHEMA = { "type": "object", From 6d9cafff9a8a44f6a4cab06ad2975f269bb89713 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Mon, 16 Dec 2024 19:22:38 +0000 Subject: [PATCH 33/36] Fix duplicates --- pyproject.toml | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0e4914f..3fb07d7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,17 +17,9 @@ classifiers = [ "Programming Language :: Python :: 3.12", "Operating System :: OS Independent", "License :: OSI Approved :: BSD License", - "Development Status :: 2 - Pre-Alpha", - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Operating System :: OS Independent", - "License :: OSI Approved :: BSD License", ] dependencies = [ - "movement" + "movement", ] [project.urls] From d44be8bbc74a83afccf8c9f482debc2dd1d31404 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Tue, 17 Dec 2024 17:54:03 +0000 Subject: [PATCH 34/36] Add comments and references to schemas module --- ethology/annotations/json_schemas.py | 51 ++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/ethology/annotations/json_schemas.py b/ethology/annotations/json_schemas.py index 99d9387..4cbdd5a 100644 --- a/ethology/annotations/json_schemas.py +++ b/ethology/annotations/json_schemas.py @@ -1,16 +1,28 @@ -"""JSON schemas for VIA and COCO annotations. +"""JSON schemas for manual annotations files. + +We use JSON schemas to check the structure of a supported +annotation file via validators. + +Note that the schema validation only checks the type of a key +if that key is present. It does not check for the presence of +the keys. References ---------- - https://github.com/python-jsonschema/jsonschema - https://json-schema.org/understanding-json-schema/ +- https://cocodataset.org/#format-data +- https://gitlab.com/vgg/via/-/blob/master/via-2.x.y/CodeDoc.md?ref_type=heads#description-of-via-project-json-file """ +# The VIA schema corresponds to the +# format exported by VGG Image Annotator 2.x.y +# for manual labels VIA_SCHEMA = { "type": "object", "properties": { - # settings for browser UI + # settings for the browser-based UI of VIA "_via_settings": { "type": "object", "properties": { @@ -19,18 +31,20 @@ "project": {"type": "object"}, }, }, - # annotation data + # annotations data per image "_via_img_metadata": { "type": "object", "additionalProperties": { - # "additionalProperties" to allow any key, - # see https://stackoverflow.com/a/69811612/24834957 + # Each image under _via_img_metadata is indexed + # using a unique key: FILENAME-FILESIZE. + # We use "additionalProperties" to allow for any + # key name, see https://stackoverflow.com/a/69811612/24834957 "type": "object", "properties": { "filename": {"type": "string"}, "size": {"type": "integer"}, "regions": { - "type": "array", # a list of dicts + "type": "array", # 'regions' is a list of dicts "items": { "type": "object", "properties": { @@ -43,9 +57,7 @@ "width": {"type": "integer"}, "height": {"type": "integer"}, }, - "region_attributes": { - "type": "object" - }, # we just check it's a dict + "region_attributes": {"type": "object"}, }, }, }, @@ -54,13 +66,15 @@ }, }, }, - # ordered list of image keys - # - the position defines the image ID + # _via_image_id_list contains an + # ordered list of image keys using a unique key: FILENAME-FILESIZE, + # the position in the list defines the image ID "_via_image_id_list": { "type": "array", "items": {"type": "string"}, }, - # region (aka annotation) and file attributes for VIA UI + # region attributes and file attributes, to + # display in VIA's UI and to classify the data "_via_attributes": { "type": "object", "properties": { @@ -68,11 +82,14 @@ "file": {"type": "object"}, }, }, - # version of the VIA data format + # version of the VIA tool used "_via_data_format_version": {"type": "string"}, }, } +# The COCO schema follows the COCO dataset +# format for object detection +# See https://cocodataset.org/#format-data COCO_SCHEMA = { "type": "object", "properties": { @@ -97,15 +114,19 @@ "items": { "type": "object", "properties": { - "id": {"type": "integer"}, # annotation global ID + "id": {"type": "integer"}, "image_id": {"type": "integer"}, "bbox": { "type": "array", "items": {"type": "integer"}, }, + # (box coordinates are measured from the + # top left image corner and are 0-indexed) "category_id": {"type": "integer"}, - "area": {"type": "integer"}, + "area": {"type": "number"}, + # float according to the official schema "iscrowd": {"type": "integer"}, + # 0 or 1 according to the official schema }, }, }, From 316779e2e3b7474d7736c1bdba7cf265eec84715 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Tue, 17 Dec 2024 18:01:23 +0000 Subject: [PATCH 35/36] Add comments to the validators --- ethology/annotations/validators.py | 49 ++++++++++++++++-------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py index f03bb55..888defc 100644 --- a/ethology/annotations/validators.py +++ b/ethology/annotations/validators.py @@ -11,14 +11,17 @@ @define class ValidJSON: - """Class for validating JSON files. + """Class for valid JSON files. + + It checks the JSON file exists, can be decoded, and optionally + validates the file against a JSON schema. Attributes ---------- path : pathlib.Path Path to the JSON file. - schema : dict + schema : dict, optional JSON schema to validate the file against. Raises @@ -26,9 +29,10 @@ class ValidJSON: FileNotFoundError If the file does not exist. ValueError - If the JSON file cannot be decoded, or - if the type of any of its keys does not match those - specified in the schema. + If the JSON file cannot be decoded. + jsonschema.exceptions.ValidationError + If the type of any of the keys in the JSON file + does not match the type specified in the schema. Notes @@ -63,7 +67,8 @@ def _file_matches_JSON_schema(self, attribute, value): """Ensure that the JSON file matches the expected schema. The schema validation only checks the type for each specified - key if it exists. It does not check for the presence of the keys. + key if the key exists. It does not check for the presence of + the keys. """ # read json file with open(value) as file: @@ -80,14 +85,19 @@ def _file_matches_JSON_schema(self, attribute, value): @define class ValidVIAJSON: - """Class for validating VIA JSON files for untracked data. + """Class for valid VIA JSON files for untracked data. - Checks the VIA JSON file for untracked data contains the required keys. + It checks the input VIA JSON file contains the required keys. Attributes ---------- path : pathlib.Path - Path to the JSON file. + Path to the VIA JSON file. + + Raises + ------ + ValueError + If the VIA JSON file misses any of the required keys. """ @@ -95,7 +105,7 @@ class ValidVIAJSON: @path.validator def _file_contains_required_keys(self, attribute, value): - """Ensure that the JSON file contains the required keys.""" + """Ensure that the VIA JSON file contains the required keys.""" required_keys = { "main": ["_via_img_metadata", "_via_image_id_list"], "image_keys": ["filename", "regions"], @@ -136,25 +146,19 @@ def _file_contains_required_keys(self, attribute, value): @define class ValidCOCOJSON: - """Class for validating COCO JSON files for untracked data. + """Class valid COCO JSON files for untracked data. - The validator ensures that the file matches the expected schema. - The schema validation only checks the type for each specified - key if it exists. It does not check for the presence of the keys. + It checks the input COCO JSON file contains the required keys. Attributes ---------- path : pathlib.Path - Path to the JSON file. + Path to the COCO JSON file. Raises ------ ValueError - If the JSON file does not match the expected schema. - - Notes - ----- - https://json-schema.org/understanding-json-schema/ + If the COCO JSON file misses any of the required keys. """ @@ -162,10 +166,10 @@ class ValidCOCOJSON: @path.validator def _file_contains_required_keys(self, attribute, value): - """Ensure that the JSON file contains the required keys.""" + """Ensure that the COCO JSON file contains the required keys.""" required_keys = { "main": ["images", "annotations", "categories"], - "image_keys": ["id", "file_name"], # "height", "width"? + "image_keys": ["id", "file_name"], # add "height" and "width"? "annotations_keys": ["id", "image_id", "bbox", "category_id"], "categories_keys": ["id", "name", "supercategory"], } @@ -207,6 +211,7 @@ def _check_keys( data_dict: dict, additional_message: str = "", ): + """Check if the required keys are present in the input data_dict.""" missing_keys = set(list_required_keys) - data_dict.keys() if missing_keys: raise ValueError( From e9dbda40ed2e791c14db23f1fac0c50edb37e2d4 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Tue, 17 Dec 2024 18:57:42 +0000 Subject: [PATCH 36/36] Make ValidCOCOJSON and ValidVIAJSON inherit from ValidJSON --- ethology/annotations/validators.py | 32 ++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/ethology/annotations/validators.py b/ethology/annotations/validators.py index 888defc..4ff1179 100644 --- a/ethology/annotations/validators.py +++ b/ethology/annotations/validators.py @@ -3,11 +3,14 @@ import json from pathlib import Path +import attrs import jsonschema import jsonschema.exceptions import jsonschema.validators from attrs import define, field, validators +from ethology.annotations.json_schemas import COCO_SCHEMA, VIA_SCHEMA + @define class ValidJSON: @@ -84,7 +87,7 @@ def _file_matches_JSON_schema(self, attribute, value): @define -class ValidVIAJSON: +class ValidVIAJSON(ValidJSON): """Class for valid VIA JSON files for untracked data. It checks the input VIA JSON file contains the required keys. @@ -94,6 +97,9 @@ class ValidVIAJSON: path : pathlib.Path Path to the VIA JSON file. + schema : dict, optional + JSON schema to validate the file against. Default is VIA_SCHEMA. + Raises ------ ValueError @@ -101,8 +107,17 @@ class ValidVIAJSON: """ - path: Path = field(validator=validators.instance_of(Path)) + # run the parent's validators first + path: Path = field(validator=attrs.fields(ValidJSON).path.validator) + schema: dict = field( + validator=attrs.fields(ValidJSON).schema.validator, # type: ignore + default=VIA_SCHEMA, + ) + # TODO: add a validator to check the schema defines types + # for the required keys + + # run additional validators @path.validator def _file_contains_required_keys(self, attribute, value): """Ensure that the VIA JSON file contains the required keys.""" @@ -145,7 +160,7 @@ def _file_contains_required_keys(self, attribute, value): @define -class ValidCOCOJSON: +class ValidCOCOJSON(ValidJSON): """Class valid COCO JSON files for untracked data. It checks the input COCO JSON file contains the required keys. @@ -162,8 +177,17 @@ class ValidCOCOJSON: """ - path: Path = field(validator=validators.instance_of(Path)) + # run the parent's validators first + path: Path = field(validator=attrs.fields(ValidJSON).path.validator) + schema: dict = field( + validator=attrs.fields(ValidJSON).schema.validator, # type: ignore + default=COCO_SCHEMA, + ) + + # TODO: add a validator to check the schema defines types + # for the required keys + # run additional validators @path.validator def _file_contains_required_keys(self, attribute, value): """Ensure that the COCO JSON file contains the required keys."""