feat: add sanity check processing #11

Draft · wants to merge 9 commits into main
75 changes: 75 additions & 0 deletions dataset/sanity_checks/main.py
@@ -0,0 +1,75 @@
import logging
import os

from picsellia import Client
from picsellia.types.enums import AnnotationFileType
from pycocotools.coco import COCO

from utils import (
    add_nbr_channel_byte_tags,
    add_tags_to_duplicate_images,
    get_area_outlier_filenames,
    get_duplicate_filenames,
    get_duplicate_images,
    log_results,
)

os.environ["PICSELLIA_SDK_CUSTOM_LOGGING"] = "True"
os.environ["PICSELLIA_SDK_DOWNLOAD_BAR_MODE"] = "2"
logging.getLogger("picsellia").setLevel(logging.INFO)

api_token = os.environ["api_token"]
organization_id = os.environ["organization_id"]
job_id = os.environ["job_id"]
host = os.environ.get("host", "https://app.picsellia.com")

client = Client(api_token=api_token, organization_id=organization_id, host=host)
job = client.get_job_by_id(job_id)

# job.sync() returns the processing context, including the dataset version to
# check and any user-supplied parameters.
context = job.sync()["dataset_version_processing_job"]
input_dataset_version_id = context["input_dataset_version_id"]
parameters = context["parameters"]
# Number of standard deviations beyond which an annotation area counts as an
# outlier; the "area_outlier_threshold" parameter key is a suggested name,
# with 4 as the fallback default.
area_outlier_threshold = parameters.get("area_outlier_threshold", 4)
duplicate_image_filenames = set()
dataset_path = "data"
cwd = os.getcwd()

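# Download the dataset locally and export its annotations as a COCO file so
# pycocotools can index them for the area-outlier check below.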
dataset_version = client.get_dataset_version_by_id(input_dataset_version_id)
dataset_version.download(dataset_path)
annotation_file_path = dataset_version.export_annotation_file(
    annotation_file_type=AnnotationFileType.COCO,
    target_path=cwd,
    force_replace=True,
)

coco = COCO(annotation_file_path)

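# Check 1: perceptual-hash duplicates. Visually identical images get tagged
# "dup_image".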
image_duplicates = get_duplicate_images(dataset_path=dataset_path)
if image_duplicates:
    duplicate_image_filenames = add_tags_to_duplicate_images(
        dataset_version=dataset_version, duplicates=image_duplicates
    )

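# Check 2: assets sharing the same filename get tagged "dup_filename".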
filename_duplicates = get_duplicate_filenames(dataset_version=dataset_version)
if filename_duplicates:
    dup_assets = dataset_version.find_all_assets(filenames=filename_duplicates)
    dup_filename_tag = dataset_version.get_or_create_asset_tag(name="dup_filename")
    dup_assets.add_tags(dup_filename_tag)

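# Checks 3 and 4: tag every asset with its channel count and pixel depth, and
# flag annotations whose area is a statistical outlier.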
channel_counts, byte_counts = add_nbr_channel_byte_tags(
    dataset_version=dataset_version, dataset_path=dataset_path
)
area_outlier_filenames = get_area_outlier_filenames(
    coco=coco, area_outlier_threshold=area_outlier_threshold
)

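# Summarize everything the checks found in the job logs.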
log_results(
    duplicate_image_filenames,
    filename_duplicates,
    byte_counts,
    channel_counts,
    area_outlier_filenames,
)
4 changes: 4 additions & 0 deletions dataset/sanity_checks/requirements.txt
@@ -0,0 +1,4 @@
picsellia==6.10.3
pycocotools==2.0.6
imagededup==0.3.2
opencv-python==4.8.1.78
scipy==1.10.1  # imported directly in utils.py (zscore); the exact pin is a suggestion
195 changes: 195 additions & 0 deletions dataset/sanity_checks/utils.py
@@ -0,0 +1,195 @@
import logging
import os

import cv2
import numpy as np
from imagededup.methods import PHash
from picsellia import Asset, Tag
from picsellia.sdk.dataset_version import DatasetVersion
from pycocotools.coco import COCO
from scipy.stats import zscore

logging.getLogger("picsellia").setLevel(logging.INFO)


def get_duplicate_images(dataset_path: str) -> dict:
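    """Find visually duplicate images using a perceptual hash (PHash).

    Returns imagededup's mapping of filename -> list of duplicate filenames
    (an empty list when an image has no duplicates).
    """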
    phasher = PHash()
    encodings = phasher.encode_images(image_dir=dataset_path)
    duplicates = phasher.find_duplicates(encoding_map=encodings)
    return duplicates


def add_tags_to_duplicate_images(
    dataset_version: DatasetVersion, duplicates: dict
) -> set:
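    """Tag every asset involved in a visual-duplicate pair with "dup_image".

    Returns the set of filenames belonging to at least one duplicate pair.
    """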

    dup_image_tag = dataset_version.get_or_create_asset_tag(name="dup_image")

    duplicate_files = set()
    for filename, duplicate_filenames in duplicates.items():
        for duplicate_filename in duplicate_filenames:
            if duplicate_filename:  # duplicate found
                duplicate_files.add(filename)
                duplicate_files.add(duplicate_filename)

                find_asset_and_add_tag(
                    dataset_version=dataset_version,
                    filename=filename,
                    tag=dup_image_tag,
                )
                find_asset_and_add_tag(
                    dataset_version=dataset_version,
                    filename=duplicate_filename,
                    tag=dup_image_tag,
                )

    return duplicate_files


def find_asset_and_add_tag(dataset_version: DatasetVersion, filename: str, tag: Tag):
    filename_asset = find_asset_by_filename(filename=filename, dataset=dataset_version)
    if filename_asset is not None:  # guard: the lookup can fail and return None
        filename_asset.add_tags(tag)


def find_asset_by_filename(filename: str, dataset: DatasetVersion) -> Asset | None:
    try:
        return dataset.find_asset(filename=filename)
    except Exception as e:
        logging.warning(f"Could not find asset for '{filename}': {e}")
        return None


def get_duplicate_filenames(dataset_version: DatasetVersion) -> list[str]:
    input_assets = dataset_version.list_assets()
    image_filenames = [asset.filename for asset in input_assets]
    return find_filename_duplicates(image_filenames)


def find_filename_duplicates(strings: list[str]) -> list[str]:
    seen = set()
    duplicates = set()

    for string in strings:
        if string in seen:
            duplicates.add(string)
        else:
            seen.add(string)

    return list(duplicates)


def find_filename_by_id(image_id: int, coco: COCO) -> str | None:
    img_info = coco.loadImgs(int(image_id))
    if img_info:
        return img_info[0]["file_name"]
    return None


def add_nbr_channel_byte_tags(
    dataset_version: DatasetVersion, dataset_path: str
) -> tuple[dict, dict]:
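    """Tag each asset with "<channels>_<dtype>" (e.g. "3_uint8").

    Returns (channel_counts, byte_counts): the number of images per channel
    count and per pixel dtype.
    """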
    byte_counts = {}
    channel_counts = {}
    for asset in dataset_version.list_assets():
        filename = os.path.join(dataset_path, asset.filename)
        # IMREAD_ANYDEPTH | IMREAD_ANYCOLOR keeps the original bit depth and
        # channel layout instead of forcing 8-bit BGR.
        image = cv2.imread(filename, cv2.IMREAD_ANYDEPTH | cv2.IMREAD_ANYCOLOR)
        if image is None:
            logging.warning(f"Could not read '{filename}', skipping")
            continue
        pixel_dtype = image.dtype
        nbr_channels = get_nbr_channels(image)
        byte_counts[pixel_dtype] = byte_counts.get(pixel_dtype, 0) + 1
        channel_counts[nbr_channels] = channel_counts.get(nbr_channels, 0) + 1

        # get_or_create_asset_tag handles both new and existing tags, so there
        # is no need to list all tags on every iteration.
        tag_name = f"{nbr_channels}_{pixel_dtype}"
        image_property_tag = dataset_version.get_or_create_asset_tag(name=tag_name)
        asset.add_tags(image_property_tag)

    return channel_counts, byte_counts


def get_nbr_channels(image: np.ndarray) -> int:
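    """Return 1 for 2-D (grayscale) images, shape[2] for 3-D images, else 0."""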
    ndim = image.ndim
    nbr_channels = 0
    if ndim == 2:
        nbr_channels = 1
    elif ndim == 3:
        nbr_channels = image.shape[2]
    else:
        logging.info("Image has an unexpected shape")
    return nbr_channels


def get_area_outlier_filenames(coco: COCO, area_outlier_threshold: int) -> list[str]:
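    """Return the filenames of images whose annotation areas are z-score outliers.

    Each area is standardized as z = (area - mean) / std; annotations with
    |z| > area_outlier_threshold are flagged. Note that with very few
    annotations no |z| can exceed a high threshold, so this check is most
    meaningful on reasonably large datasets.
    """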
    areas, image_ids = get_all_areas_filenames(coco=coco)
    z_scores = zscore(areas)
    outlier_indices = np.where(np.abs(z_scores) > area_outlier_threshold)[0]

    outlier_areas = areas[outlier_indices]
    outlier_image_ids = list(set(image_ids[outlier_indices]))
    ood_filenames = [
        find_filename_by_id(image_id=image_id, coco=coco)
        for image_id in outlier_image_ids
    ]
    logging.info(f"\nMean area: {np.mean(areas)}")
    logging.info(f"Outlier areas: {outlier_areas}")

    return ood_filenames


def get_all_areas_filenames(coco: COCO) -> tuple[np.ndarray, np.ndarray]:
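    """Return parallel arrays of every annotation's area and its image id."""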
    area_values = []
    image_ids = []

    # loadAnns accepts the full list of annotation ids at once.
    for annotation in coco.loadAnns(coco.getAnnIds()):
        area_values.append(annotation["area"])
        image_ids.append(annotation["image_id"])

    return np.array(area_values), np.array(image_ids)


def log_results(
    duplicate_image_filenames: set,
    filename_duplicates: list[str],
    byte_counts: dict,
    channel_counts: dict,
    area_outlier_filenames: list[str],
):
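    """Write a human-readable summary of every sanity-check finding to the log."""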
    if duplicate_image_filenames:
        logging.info("\nDuplicate images:")
        for filename in duplicate_image_filenames:
            logging.info(f"  - {filename}")

    if filename_duplicates:
        logging.info("\nDuplicate filenames:")
        for filename in filename_duplicates:
            logging.info(f"  - {filename}")

    logging.info("\nNumber of images per byte count:")
    for nbr_bytes, count in byte_counts.items():
        logging.info(f"  - {nbr_bytes}: {count} images")

    logging.info("\nNumber of images per channel count:")
    for nbr_channels, count in channel_counts.items():
        logging.info(f"  - {nbr_channels}: {count} images")

    len_outlier_files = len(area_outlier_filenames)
    logging.info(f"\nYou have {len_outlier_files} image(s) with outlier areas")
    if len_outlier_files > 0:
        logging.info("\nFilenames with outlier areas:")
        for filename in area_outlier_filenames:
            logging.info(f"  - {filename}")
15 changes: 15 additions & 0 deletions docker/Dockerfile.sanity
@@ -0,0 +1,15 @@
FROM picsellia/cuda:11.7.1-cudnn8-ubuntu20.04

COPY ./dataset/sanity_checks/requirements.txt .
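# The REBUILD_* build args are (presumably) cache-busters: passing a new value,
# e.g. --build-arg REBUILD_ALL=$(date +%s), forces the layers below to rebuild.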
ARG REBUILD_ALL
RUN pip3 install -r ./requirements.txt --no-cache-dir
ARG REBUILD_PICSELLIA

WORKDIR /experiment

COPY ./dataset/sanity_checks ./

RUN mkdir -p /workspace && chown -R 42420:42420 /workspace

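# "run" is the job launcher presumably provided by the picsellia/cuda base image.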
ENTRYPOINT ["run", "main.py"]