From 055aae4d43540ea7db3261d4de0bf9d13cb49a16 Mon Sep 17 00:00:00 2001 From: HuiyingLi Date: Sat, 6 Jul 2024 01:31:01 -0700 Subject: [PATCH] Refactor TiledSiglipImageProcessor creation into utils Signed-off-by: HuiyingLi --- nemo/collections/multimodal/parts/utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/nemo/collections/multimodal/parts/utils.py b/nemo/collections/multimodal/parts/utils.py index e85e4cc82f0d..86b5a9a45fe5 100644 --- a/nemo/collections/multimodal/parts/utils.py +++ b/nemo/collections/multimodal/parts/utils.py @@ -514,6 +514,8 @@ def expand2square(pil_img, background_color): def create_image_processor(mm_cfg): + from nemo.collections.multimodal.models.multimodal_llm.neva.neva_model import TiledSiglipImageProcessor + if mm_cfg.vision_encoder.get("from_hf", False): if "clip" in mm_cfg.vision_encoder.from_pretrained: image_processor = CLIPImageProcessor.from_pretrained( @@ -523,6 +525,11 @@ def create_image_processor(mm_cfg): image_processor = SiglipImageProcessor.from_pretrained( mm_cfg.vision_encoder.from_pretrained, torch_dtype=torch.bfloat16 ) + image_processor = TiledSiglipImageProcessor(image_processor, + grid_width = mm_cfg.vision_encoder.get("grid_width", 1), + grid_height = mm_cfg.vision_encoder.get("grid_height", 1), + max_upscale = mm_cfg.vision_encoder.get("max_upscale", 2.0), + ) else: raise (ValueError("Currently only support CLIPImageProcessor and SiglipImageProcessor from Huggingface"))