diff --git a/nemo/collections/multimodal/parts/utils.py b/nemo/collections/multimodal/parts/utils.py
index e85e4cc82f0d..86b5a9a45fe5 100644
--- a/nemo/collections/multimodal/parts/utils.py
+++ b/nemo/collections/multimodal/parts/utils.py
@@ -514,6 +514,8 @@ def expand2square(pil_img, background_color):
 
 
 def create_image_processor(mm_cfg):
+    from nemo.collections.multimodal.models.multimodal_llm.neva.neva_model import TiledSiglipImageProcessor
+
     if mm_cfg.vision_encoder.get("from_hf", False):
         if "clip" in mm_cfg.vision_encoder.from_pretrained:
             image_processor = CLIPImageProcessor.from_pretrained(
@@ -523,6 +525,11 @@ def create_image_processor(mm_cfg):
             image_processor = SiglipImageProcessor.from_pretrained(
                 mm_cfg.vision_encoder.from_pretrained, torch_dtype=torch.bfloat16
             )
+            image_processor = TiledSiglipImageProcessor(image_processor,
+                grid_width=mm_cfg.vision_encoder.get("grid_width", 1),
+                grid_height=mm_cfg.vision_encoder.get("grid_height", 1),
+                max_upscale=mm_cfg.vision_encoder.get("max_upscale", 2.0),
+            )
         else:
             raise (ValueError("Currently only support CLIPImageProcessor and SiglipImageProcessor from Huggingface"))