Update the MODALITY_TRANSFORMS dict to include the new video modality keys and their mappings to transforms
swiss-ai · Aug 1, 2024 · cda245d · cda245d
1 parent b6f7747
commit cda245d
Showing 1 changed file with 14 additions and 0 deletions.
diff --git a/fourm/data/modality_info.py b/fourm/data/modality_info.py
@@ -29,6 +29,11 @@
     ColorPaletteTransform,
     SAMInstanceTokTransform,
     SAMInstanceTransform,
+    VideoDescriptionTransform,
+    VideoDetectionTransform,
+    VideoRGBTransform,
+    VideoTokTransform,
+    VideoTranscriptTransform,
 )
 from fourm.models.decoder_embeddings import ImageTokenDecoderEmbedding, SequenceDecoderEmbedding
 from fourm.models.encoder_embeddings import (
@@ -468,6 +473,15 @@
     "tok_imagebind_global": TokTransform(),
     # Other
     "mask_valid": MaskTransform(mask_pool_size=1),
+    # Video
+    "video_rgb": VideoRGBTransform(imagenet_default_mean_and_std=True),  # TODO: check parameters
+    "video_tok_rgb": VideoTokTransform(),  # tok_ indicates it's a token representation
+    "video_tok_clip": VideoTokTransform(),  # TODO: check parameters
+    "video_description": VideoDescriptionTransform(aligned_captions=True),  # TODO: check parameters
+    "video_transcript": VideoTranscriptTransform(aligned_captions=True),  # TODO: check parameters
+    "video_det": VideoDetectionTransform(
+        det_threshold=0.6, det_max_instances=None, bbox_order="dist_to_orig", coord_bins=1000, min_visibility=0.0
+    ),  # TODO: check parameters
 }
 
 MODALITY_TRANSFORMS_DIVAE = {