Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Validate VIA and COCO files for untracked data #20

Draft
wants to merge 36 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
00c309e
add notebook to read COCO annotations as xarray
sfmig Dec 10, 2024
261e03c
Simplify
sfmig Dec 10, 2024
57dc2b9
Add movement as dependency, fix indentation
sfmig Dec 11, 2024
5f16511
Add io module for reading manual annotations
sfmig Dec 11, 2024
0945291
Add validators for manual annotation files
sfmig Dec 11, 2024
fd2e30c
Remove notebook for xarray
sfmig Dec 11, 2024
c69fea9
Fix schema for validation
sfmig Dec 11, 2024
0a1d64e
Add validation and loading for COCO style file
sfmig Dec 11, 2024
57c94ec
Keys check draft
sfmig Dec 11, 2024
1d19eb9
Improve keys check
sfmig Dec 11, 2024
97df63c
Add keys check for COCO untracked json validator
sfmig Dec 11, 2024
029bb8b
Remove some comments
sfmig Dec 11, 2024
23d5259
Delete utils
sfmig Dec 12, 2024
9fbc4f7
Add module for fixtures
sfmig Dec 12, 2024
d3809e0
Change JSON error to FileNotFound
sfmig Dec 12, 2024
5866d10
Add shared fixtures across all tests to conftest
sfmig Dec 12, 2024
8504a59
Add annotations_test_data fixture to its module
sfmig Dec 12, 2024
a22bfc4
Add test for JSON file validator
sfmig Dec 12, 2024
6edbfc4
Fix fresh download of files-registry
sfmig Dec 12, 2024
9976715
Fix tests to work with unzipped subdirectories
sfmig Dec 12, 2024
a4ffb44
Add test for JSON validator
sfmig Dec 16, 2024
b901658
Factor out schemas and include schema validation in ValidJSON
sfmig Dec 16, 2024
05696a7
Make schema optional
sfmig Dec 16, 2024
8223701
Add tests for schemas
sfmig Dec 16, 2024
92d6b74
Reduce error message check for schema validation
sfmig Dec 16, 2024
a6523e1
Add tests for keys check WIP
sfmig Dec 16, 2024
ab65d95
Add test for VIA JSON untracked validator
sfmig Dec 16, 2024
3e24186
Add test for valid_coco_untracked_json
sfmig Dec 16, 2024
e8fcb64
Add test for check_keys
sfmig Dec 16, 2024
06b47c1
Remove untracked from names
sfmig Dec 16, 2024
147eb11
Remove dataloader (now in separate PR)
sfmig Dec 16, 2024
3af9b4e
Add references to JSON schemas
sfmig Dec 16, 2024
6d9caff
Fix duplicates
sfmig Dec 16, 2024
d44be8b
Add comments and references to schemas module
sfmig Dec 17, 2024
316779e
Add comments to the validators
sfmig Dec 17, 2024
e9dbda4
Make ValidCOCOJSON and ValidVIAJSON inherit from ValidJSON
sfmig Dec 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ repos:
args: [--fix=lf]
- id: name-tests-test
args: ["--pytest-test-first"]
exclude: ^tests/fixtures
- id: requirements-txt-fixer
- id: trailing-whitespace
- repo: https://github.com/pre-commit/pygrep-hooks
Expand Down
145 changes: 145 additions & 0 deletions ethology/annotations/json_schemas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
"""JSON schemas for manual annotations files.

We use JSON schemas to check the structure of a supported
annotation file via validators.

Note that the schema validation only checks the type of a key
if that key is present. It does not check for the presence of
the keys.

References
----------
- https://github.com/python-jsonschema/jsonschema
- https://json-schema.org/understanding-json-schema/
- https://cocodataset.org/#format-data
- https://gitlab.com/vgg/via/-/blob/master/via-2.x.y/CodeDoc.md?ref_type=heads#description-of-via-project-json-file

"""

# The VIA schema corresponds to the
# format exported by VGG Image Annotator 2.x.y
# for manual labels
VIA_SCHEMA = {
    "type": "object",
    "properties": {
        # settings for the browser-based UI of VIA
        "_via_settings": {
            "type": "object",
            "properties": {
                "ui": {"type": "object"},
                "core": {"type": "object"},
                "project": {"type": "object"},
            },
        },
        # annotations data per image
        "_via_img_metadata": {
            "type": "object",
            "additionalProperties": {
                # Each image under _via_img_metadata is indexed
                # using a unique key: FILENAME-FILESIZE.
                # We use "additionalProperties" to allow for any
                # key name, see https://stackoverflow.com/a/69811612/24834957
                "type": "object",
                "properties": {
                    "filename": {"type": "string"},
                    "size": {"type": "integer"},
                    "regions": {
                        "type": "array",  # 'regions' is a list of dicts
                        "items": {
                            "type": "object",
                            "properties": {
                                "shape_attributes": {
                                    "type": "object",
                                    "properties": {
                                        "name": {"type": "string"},
                                        "x": {"type": "integer"},
                                        "y": {"type": "integer"},
                                        "width": {"type": "integer"},
                                        "height": {"type": "integer"},
                                    },
                                },
                                # "region_attributes" is a property of each
                                # region, NOT of "shape_attributes" — placing
                                # it inside the shape_attributes subschema
                                # would make jsonschema silently ignore it
                                # as an unknown keyword.
                                "region_attributes": {"type": "object"},
                            },
                        },
                    },
                    "file_attributes": {"type": "object"},
                },
            },
        },
        # _via_image_id_list contains an
        # ordered list of image keys using a unique key: FILENAME-FILESIZE,
        # the position in the list defines the image ID
        "_via_image_id_list": {
            "type": "array",
            "items": {"type": "string"},
        },
        # region attributes and file attributes, to
        # display in VIA's UI and to classify the data
        "_via_attributes": {
            "type": "object",
            "properties": {
                "region": {"type": "object"},
                "file": {"type": "object"},
            },
        },
        # version of the VIA tool used
        "_via_data_format_version": {"type": "string"},
    },
}

# The COCO schema follows the COCO dataset
# format for object detection
# See https://cocodataset.org/#format-data
COCO_SCHEMA = {
    "type": "object",
    "properties": {
        "info": {"type": "object"},
        "licenses": {
            "type": "array",
        },
        "images": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "file_name": {"type": "string"},
                    "id": {"type": "integer"},
                    "width": {"type": "integer"},
                    "height": {"type": "integer"},
                },
            },
        },
        "annotations": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "id": {"type": "integer"},
                    "image_id": {"type": "integer"},
                    "bbox": {
                        "type": "array",
                        # bbox coordinates are floats in the official COCO
                        # format; "number" also accepts integers, whereas
                        # "integer" would reject valid float-valued boxes
                        "items": {"type": "number"},
                    },
                    # (box coordinates are measured from the
                    # top left image corner and are 0-indexed)
                    "category_id": {"type": "integer"},
                    "area": {"type": "number"},
                    # float according to the official schema
                    "iscrowd": {"type": "integer"},
                    # 0 or 1 according to the official schema
                },
            },
        },
        "categories": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "id": {"type": "integer"},
                    "name": {"type": "string"},
                    "supercategory": {"type": "string"},
                },
            },
        },
    },
}
244 changes: 244 additions & 0 deletions ethology/annotations/validators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
"""Validators for annotation files."""

import json
from pathlib import Path

import attrs
import jsonschema
import jsonschema.exceptions
import jsonschema.validators
from attrs import define, field, validators

from ethology.annotations.json_schemas import COCO_SCHEMA, VIA_SCHEMA


@define
class ValidJSON:
    """Class for valid JSON files.

    It checks the JSON file exists, can be decoded, and optionally
    validates the file against a JSON schema.

    Attributes
    ----------
    path : pathlib.Path
        Path to the JSON file.

    schema : dict, optional
        JSON schema to validate the file against.

    Raises
    ------
    FileNotFoundError
        If the file does not exist.
    ValueError
        If the JSON file cannot be decoded.
    jsonschema.exceptions.ValidationError
        If the type of any of the keys in the JSON file
        does not match the type specified in the schema.


    Notes
    -----
    https://json-schema.org/understanding-json-schema/

    """

    # Required attributes
    path: Path = field(validator=validators.instance_of(Path))

    # Optional attributes
    schema: dict | None = field(default=None)

    @path.validator
    def _file_is_json(self, attribute, value):
        """Ensure that the file is a JSON file."""
        try:
            with open(value) as file:
                json.load(file)
        except FileNotFoundError as not_found_error:
            raise FileNotFoundError(
                f"File not found: {value}."
            ) from not_found_error
        except json.JSONDecodeError as decode_error:
            raise ValueError(
                f"Error decoding JSON data from file: {value}."
            ) from decode_error

    @path.validator
    def _file_matches_JSON_schema(self, attribute, value):
        """Ensure that the JSON file matches the expected schema.

        The schema validation only checks the type for each specified
        key if the key exists. It does not check for the presence of
        the keys.
        """
        # Skip the file read entirely when there is no schema to
        # check against (avoids decoding the file a second time
        # for nothing).
        if not self.schema:
            return

        # read json file
        with open(value) as file:
            data = json.load(file)

        # jsonschema.validate raises a ValidationError whose message
        # is quite informative, so we let it propagate as is (no need
        # to catch and re-raise it).
        jsonschema.validate(instance=data, schema=self.schema)


@define
class ValidVIAJSON(ValidJSON):
    """Class for valid VIA JSON files for untracked data.

    It checks the input VIA JSON file contains the required keys.

    Attributes
    ----------
    path : pathlib.Path
        Path to the VIA JSON file.

    schema : dict, optional
        JSON schema to validate the file against. Default is VIA_SCHEMA.

    Raises
    ------
    ValueError
        If the VIA JSON file misses any of the required keys.

    """

    # run the parent's validators first
    path: Path = field(validator=attrs.fields(ValidJSON).path.validator)
    schema: dict = field(
        validator=attrs.fields(ValidJSON).schema.validator,  # type: ignore
        default=VIA_SCHEMA,
    )

    # TODO: add a validator to check the schema defines types
    # for the required keys

    # run additional validators
    @path.validator
    def _file_contains_required_keys(self, attribute, value):
        """Ensure that the VIA JSON file contains the required keys."""
        # Keys expected at each nesting level of the VIA file
        main_keys = ["_via_img_metadata", "_via_image_id_list"]
        image_keys = ["filename", "regions"]
        region_keys = ["shape_attributes", "region_attributes"]
        shape_attributes_keys = ["x", "y", "width", "height"]

        # Load the annotations as a dictionary
        with open(value) as file:
            data = json.load(file)

        # Top-level keys
        _check_keys(main_keys, data)

        # Per-image and per-region keys
        for img_str, img_dict in data["_via_img_metadata"].items():
            _check_keys(
                image_keys,
                img_dict,
                additional_message=f" for {img_str}",
            )
            for i, region in enumerate(img_dict["regions"]):
                # Same suffix for the region-level and the
                # shape-attributes-level checks
                suffix = f" for region {i} under {img_str}"
                _check_keys(
                    region_keys,
                    region,
                    additional_message=suffix,
                )
                _check_keys(
                    shape_attributes_keys,
                    region["shape_attributes"],
                    additional_message=suffix,
                )


@define
class ValidCOCOJSON(ValidJSON):
    """Class for valid COCO JSON files for untracked data.

    It checks the input COCO JSON file contains the required keys.

    Attributes
    ----------
    path : pathlib.Path
        Path to the COCO JSON file.

    Raises
    ------
    ValueError
        If the COCO JSON file misses any of the required keys.

    """

    # run the parent's validators first
    path: Path = field(validator=attrs.fields(ValidJSON).path.validator)
    schema: dict = field(
        validator=attrs.fields(ValidJSON).schema.validator,  # type: ignore
        default=COCO_SCHEMA,
    )

    # TODO: add a validator to check the schema defines types
    # for the required keys

    # run additional validators
    @path.validator
    def _file_contains_required_keys(self, attribute, value):
        """Ensure that the COCO JSON file contains the required keys."""
        # Keys expected at each level of the COCO file
        main_keys = ["images", "annotations", "categories"]
        image_keys = ["id", "file_name"]  # add "height" and "width"?
        annotation_keys = ["id", "image_id", "bbox", "category_id"]
        category_keys = ["id", "name", "supercategory"]

        # Load the annotations as a dictionary
        with open(value) as file:
            data = json.load(file)

        # Top-level keys
        _check_keys(main_keys, data)

        # Keys in each dictionary under "images", "annotations"
        # and "categories"
        for img_dict in data["images"]:
            _check_keys(
                image_keys,
                img_dict,
                additional_message=f" for image dict {img_dict}",
            )
        for annot_dict in data["annotations"]:
            _check_keys(
                annotation_keys,
                annot_dict,
                additional_message=f" for annotation dict {annot_dict}",
            )
        for cat_dict in data["categories"]:
            _check_keys(
                category_keys,
                cat_dict,
                additional_message=f" for category dict {cat_dict}",
            )


def _check_keys(
list_required_keys: list[str],
data_dict: dict,
additional_message: str = "",
):
"""Check if the required keys are present in the input data_dict."""
missing_keys = set(list_required_keys) - data_dict.keys()
if missing_keys:
raise ValueError(
f"Required key(s) {sorted(missing_keys)} not "
f"found in {list(data_dict.keys())}{additional_message}."
)
Loading
Loading