diff --git a/nemo/collections/multimodal/parts/utils.py b/nemo/collections/multimodal/parts/utils.py
index e85e4cc82f0d..86b5a9a45fe5 100644
--- a/nemo/collections/multimodal/parts/utils.py
+++ b/nemo/collections/multimodal/parts/utils.py
@@ -514,6 +514,8 @@ def expand2square(pil_img, background_color):
 
 
 def create_image_processor(mm_cfg):
+    from nemo.collections.multimodal.models.multimodal_llm.neva.neva_model import TiledSiglipImageProcessor
+
     if mm_cfg.vision_encoder.get("from_hf", False):
         if "clip" in mm_cfg.vision_encoder.from_pretrained:
             image_processor = CLIPImageProcessor.from_pretrained(
@@ -523,6 +525,11 @@ def create_image_processor(mm_cfg):
             image_processor = SiglipImageProcessor.from_pretrained(
                 mm_cfg.vision_encoder.from_pretrained, torch_dtype=torch.bfloat16
             )
+            image_processor = TiledSiglipImageProcessor(image_processor,
+                grid_width=mm_cfg.vision_encoder.get("grid_width", 1),
+                grid_height=mm_cfg.vision_encoder.get("grid_height", 1),
+                max_upscale=mm_cfg.vision_encoder.get("max_upscale", 2.0),
+            )
         else:
             raise (ValueError("Currently only support CLIPImageProcessor and SiglipImageProcessor from Huggingface"))