Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Validate VIA and COCO files for untracked data #20

Draft
wants to merge 36 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
00c309e
add notebook to read COCO annotations as xarray
sfmig Dec 10, 2024
261e03c
Simplify
sfmig Dec 10, 2024
57dc2b9
Add movement as dependency, fix indentation
sfmig Dec 11, 2024
5f16511
Add io module for reading manual annotations
sfmig Dec 11, 2024
0945291
Add validators for manual annotation files
sfmig Dec 11, 2024
fd2e30c
Remove notebook for xarray
sfmig Dec 11, 2024
c69fea9
Fix schema for validation
sfmig Dec 11, 2024
0a1d64e
Add validation and loading for COCO style file
sfmig Dec 11, 2024
57c94ec
Keys check draft
sfmig Dec 11, 2024
1d19eb9
Improve keys check
sfmig Dec 11, 2024
97df63c
Add keys check for COCO untracked json validator
sfmig Dec 11, 2024
029bb8b
Remove some comments
sfmig Dec 11, 2024
23d5259
Delete utils
sfmig Dec 12, 2024
9fbc4f7
Add module for fixtures
sfmig Dec 12, 2024
d3809e0
Change JSON error to FileNotFound
sfmig Dec 12, 2024
5866d10
Add shared fixtures across all tests to conftest
sfmig Dec 12, 2024
8504a59
Add annotations_test_data fixture to its module
sfmig Dec 12, 2024
a22bfc4
Add test for JSON file validator
sfmig Dec 12, 2024
6edbfc4
Fix fresh download of files-registry
sfmig Dec 12, 2024
9976715
Fix tests to work with unzipped subdirectories
sfmig Dec 12, 2024
a4ffb44
Add test for JSON validator
sfmig Dec 16, 2024
b901658
Factor out schemas and include schema validation in ValidJSON
sfmig Dec 16, 2024
05696a7
Make schema optional
sfmig Dec 16, 2024
8223701
Add tests for schemas
sfmig Dec 16, 2024
92d6b74
Reduce error message check for schema validation
sfmig Dec 16, 2024
a6523e1
Add tests for keys check WIP
sfmig Dec 16, 2024
ab65d95
Add test for VIA JSON untracked validator
sfmig Dec 16, 2024
3e24186
Add test for valid_coco_untracked_json
sfmig Dec 16, 2024
e8fcb64
Add test for check_keys
sfmig Dec 16, 2024
06b47c1
Remove untracked from names
sfmig Dec 16, 2024
147eb11
Remove dataloader (now in separate PR)
sfmig Dec 16, 2024
3af9b4e
Add references to JSON schemas
sfmig Dec 16, 2024
6d9caff
Fix duplicates
sfmig Dec 16, 2024
d44be8b
Add comments and references to schemas module
sfmig Dec 17, 2024
316779e
Add comments to the validators
sfmig Dec 17, 2024
e9dbda4
Make ValidCOCOJSON and ValidVIAJSON inherit from ValidJSON
sfmig Dec 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add validation and loading for COCO style file
  • Loading branch information
sfmig committed Dec 16, 2024
commit 0a1d64e6c3464a5c73d8c13adc0ce40465f8ce70
119 changes: 95 additions & 24 deletions ethology/annotations/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@
import pandas as pd
from movement.validators.files import ValidFile

from ethology.annotations.validators import ValidJSON, ValidVIAUntrackedJSON
from ethology.annotations.validators import (
ValidCOCOUntrackedJSON,
ValidJSON,
ValidVIAUntrackedJSON,
)

STANDARD_DF_COLUMNS = [
"annotation_id",
Expand All @@ -16,15 +20,15 @@
"y_min",
"width",
"height",
"superclass",
"class",
"supercategory",
"category",
]


def df_from_via_json_file(file_path: Path):
def df_from_via_json_file(file_path: Path) -> pd.DataFrame:
"""Validate and read untracked VIA JSON file.

The data is formated as an untracked annotations DataFrame.
The data is formatted as an untracked annotations DataFrame.
"""
# General file validation
file = ValidFile(
Expand All @@ -41,13 +45,30 @@ def df_from_via_json_file(file_path: Path):
return _df_from_validated_via_json_file(via_untracked_file.path)


def df_from_coco_json_file(file_path: Path) -> pd.DataFrame:
    """Validate and read COCO JSON file.

    Runs three validation stages (generic file checks, JSON decoding,
    COCO schema) and returns the annotations as a standard DataFrame.
    """
    # Stage 1: the file exists, is readable and has a .json suffix
    general_check = ValidFile(
        file_path, expected_permission="r", expected_suffix=[".json"]
    )

    # Stage 2: the file contains decodable JSON
    json_check = ValidJSON(general_check.path)

    # Stage 3: the JSON content matches the COCO untracked schema
    schema_check = ValidCOCOUntrackedJSON(json_check.path)

    # Convert the fully validated file into the standard dataframe
    return _df_from_validated_coco_json_file(schema_check.path)


def _df_from_validated_via_json_file(file_path):
"""Read VIA JSON file as standard untracked annotations DataFrame."""
# Read validated json as dict
with open(file_path) as file:
data_dict = json.load(file)

# Get relevant fields
# Prepare data
image_metadata_dict = data_dict["_via_img_metadata"]
via_image_id_list = data_dict[
"_via_image_id_list"
Expand All @@ -68,28 +89,26 @@ def _df_from_validated_via_json_file(file_path):
region_shape = region["shape_attributes"]
region_attributes = region["region_attributes"]

row = {
"image_filename": img_dict["filename"],
"x_min": region_shape["x"],
"y_min": region_shape["y"],
"width": region_shape["width"],
"height": region_shape["height"],
"supercategory": list(region_attributes.keys())[
0
], # takes first key as supercategory
"category": region_attributes[
list(region_attributes.keys())[0]
],
}

# append annotations to df
list_rows.append(
{
"image_filename": img_dict["filename"],
"x_min": region_shape["x"],
"y_min": region_shape["y"],
"width": region_shape["width"],
"height": region_shape["height"],
"superclass": list(region_attributes.keys())[
0
], # takes first key as superclass
"class": region_attributes[
list(region_attributes.keys())[0]
],
},
)
list_rows.append(row)

df = pd.DataFrame(
list_rows,
columns=[
col for col in STANDARD_DF_COLUMNS if not col.endswith("_id")
],
# columns=list(row.keys()), # do I need this?
)

# add image_id column
Expand All @@ -104,3 +123,55 @@ def _df_from_validated_via_json_file(file_path):
df = df.reindex(columns=STANDARD_DF_COLUMNS)

return df


def _df_from_validated_coco_json_file(file_path: Path) -> pd.DataFrame:
    """Read COCO JSON file as standard untracked annotations DataFrame.

    Parameters
    ----------
    file_path : pathlib.Path
        Path to a COCO JSON file that has already been validated.

    Returns
    -------
    pd.DataFrame
        One row per annotation, with columns ordered as in
        ``STANDARD_DF_COLUMNS``.

    """
    # Read validated json as dict
    with open(file_path) as file:
        data_dict = json.load(file)

    # Map each image ID to its filename
    map_image_id_to_filename = {
        img_dict["id"]: img_dict["file_name"]
        for img_dict in data_dict["images"]
    }

    # Map each category ID to its (name, supercategory) pair
    map_category_id_to_category_data = {
        cat_dict["id"]: (cat_dict["name"], cat_dict["supercategory"])
        for cat_dict in data_dict["categories"]
    }

    # Build one row per annotation
    list_rows = []
    for annot_dict in data_dict["annotations"]:
        image_id = annot_dict["image_id"]

        # bbox data: COCO stores boxes as [x_min, y_min, width, height]
        x_min, y_min, width, height = annot_dict["bbox"]

        # class data
        category, supercategory = map_category_id_to_category_data[
            annot_dict["category_id"]
        ]

        list_rows.append(
            {
                "annotation_id": annot_dict["id"],
                "image_filename": map_image_id_to_filename[image_id],
                "image_id": image_id,
                "x_min": x_min,
                "y_min": y_min,
                "width": width,
                "height": height,
                "supercategory": supercategory,
                "category": category,
            }
        )

    df = pd.DataFrame(list_rows)

    # Bug fix: DataFrame.reindex returns a new frame rather than
    # modifying in place, so the result must be assigned/returned
    # (previously the reindexed frame was discarded and the column
    # order was never applied).
    return df.reindex(columns=STANDARD_DF_COLUMNS)
118 changes: 108 additions & 10 deletions ethology/annotations/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,9 @@ class ValidVIAUntrackedJSON:
"""Class for validating VIA JSON files for untracked data.

The validator ensures that the file matches the expected schema.
The schema validation only checks the type for each specified
key if it exists. It does not check for the presence of the keys.

https://json-schema.org/understanding-json-schema/reference/object#additional-properties

Attributes
----------
Expand All @@ -59,10 +60,16 @@ class ValidVIAUntrackedJSON:
ValueError
If the JSON file does not match the expected schema.

Notes
-----
https://json-schema.org/understanding-json-schema/

"""

# TODO: add a check for the presence of the keys
# that I use in loading the data

path: Path = field(validator=validators.instance_of(Path))
# expected_schema: dict = field(factory=dict, kw_only=True)

@path.validator
def _file_macthes_VIA_JSON_schema(self, attribute, value):
Expand Down Expand Up @@ -135,7 +142,8 @@ def _file_macthes_VIA_JSON_schema(self, attribute, value):
},
}

# should have been validated with ValidVIAUntrackedJSON
# should have been validated with ValidJSON
# already so this should work fine
with open(value) as file:
data = json.load(file)

Expand All @@ -147,12 +155,102 @@ def _file_macthes_VIA_JSON_schema(self, attribute, value):
"The JSON data does not match "
f"the provided schema: {VIA_JSON_schema}"
) from val_err
# except jsonschema.exceptions.SchemaError as schema_err:
# raise ValueError(
# f"Invalid schema provided: {VIA_JSON_schema}"
# ) from schema_err


# @define
# class ValidCOCOUntrackedJSON:
# pass
@define
class ValidCOCOUntrackedJSON:
    """Class for validating COCO JSON files for untracked data.

    The validator ensures that the file matches the expected schema.
    The schema validation only checks the type for each specified
    key if it exists. It does not check for the presence of the keys.

    Attributes
    ----------
    path : pathlib.Path
        Path to the JSON file.

    Raises
    ------
    ValueError
        If the JSON file does not match the expected schema.

    Notes
    -----
    https://json-schema.org/understanding-json-schema/

    """

    path: Path = field(validator=validators.instance_of(Path))

    # TODO: add a check for the presence of the keys
    # that I use in loading the data

    @path.validator
    def _file_matches_COCO_JSON_schema(self, attribute, value):
        """Ensure that the JSON file matches the expected schema."""
        # Define schema for COCO JSON file for untracked
        # (aka manually labelled) data
        COCO_JSON_schema = {
            "type": "object",
            "properties": {
                "info": {"type": "object"},
                "licenses": {
                    "type": "array",
                },
                "images": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "file_name": {"type": "string"},
                            "id": {"type": "integer"},
                            "width": {"type": "integer"},
                            "height": {"type": "integer"},
                        },
                    },
                },
                "annotations": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "id": {"type": "integer"},  # annotation global ID
                            "image_id": {"type": "integer"},
                            "bbox": {
                                "type": "array",
                                # COCO bbox values are [x, y, width, height]
                                # and may be floats; "number" accepts both
                                # integers and floats
                                "items": {"type": "number"},
                            },
                            "category_id": {"type": "integer"},
                            # area is width*height and may be a float
                            "area": {"type": "number"},
                            "iscrowd": {"type": "integer"},
                        },
                    },
                },
                "categories": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "id": {"type": "integer"},
                            "name": {"type": "string"},
                            "supercategory": {"type": "string"},
                        },
                    },
                },
            },
        }

        # should have been validated with ValidJSON
        # already so this should work fine
        with open(value) as file:
            data = json.load(file)

        # check against schema
        try:
            jsonschema.validate(instance=data, schema=COCO_JSON_schema)
        except jsonschema.exceptions.ValidationError as val_err:
            raise ValueError(
                "The JSON data does not match "
                f"the provided schema: {COCO_JSON_schema}"
            ) from val_err