diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
deleted file mode 100644
index 77b66f36ee50..000000000000
--- a/.devcontainer/devcontainer.json
+++ /dev/null
@@ -1,21 +0,0 @@
-{
-  "name": "NeMo Dev",
-  "image": "nvcr.io/nvidia/nemo:dev",
-  "runArgs": [
-    "--gpus=all",
-    "-v", "/home/ykarnati:/home/ykarnati",
-    "-v", "/home/ykarnati/projects/nemo/rebase_pr/NeMo:/opt/NeMo"
-  ],
-  "customizations": {
-    "vscode": {
-      "extensions": [
-        "ms-python.python",
-        "ms-toolsai.jupyter",
-        "ms-vscode.cpptools"
-      ]
-    }
-  },
-  "settings": {
-    "terminal.integrated.defaultProfile.linux": "bash"
-  },
-}
\ No newline at end of file
diff --git a/nemo/collections/vlm/llava_next/model/base.py b/nemo/collections/vlm/llava_next/model/base.py
index ff576003d762..d4f66dc758fb 100644
--- a/nemo/collections/vlm/llava_next/model/base.py
+++ b/nemo/collections/vlm/llava_next/model/base.py
@@ -44,7 +44,8 @@ def llava_next_data_step(dataloader_iter) -> Dict[str, torch.Tensor]:
     from megatron.core import parallel_state

     # Based on: https://github.com/NVIDIA/Megatron-LM/blob/main/pretrain_gpt.py#L87
-    # https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py#L828-L842
+    # https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/nlp/models/language_modeling/
+    # megatron_gpt_model.py#L828-L842
     batch = next(dataloader_iter)
     _batch: dict
     if isinstance(batch, tuple) and len(batch) == 3:
@@ -227,11 +228,12 @@ def forward(
        """Forward function of the LLaVA Next model.

        Args:
-            images (torch.Tensor): input image of shape [num_tiles, img_h, img_w]. num_tiles means the number of image tiles in this batch.
+            images (torch.Tensor): input image of shape [num_tiles, img_h, img_w].
+                num_tiles means the number of image tiles in this batch.
            input_ids (torch.Tensor): input text ids [batch, text_seq_len].
            position_ids (torch.Tensor): input text position ids [batch, text_seq_len].
            image_sizes (torch.Tensor): Raw image sizes before tiling (N,2).
-            attention_mask (torch.Tensor): Attention mask for the language model [batch, 1, combined_seq_len, combined_seq_len].
+            attention_mask (torch.Tensor): Attention mask for the language model [batch, text_seq_len].
            labels (torch.Tensor): Optional target text labels [batch, combined_seq_len].
            loss_mask (torch.Tensor): Text loss mask [batch, text_seq_len].
            inference_params (InferenceParams): Inference-time parameters including KV cache.
@@ -239,7 +241,7 @@ def forward(
            image_token_index (int): ID for input images.

        Returns:
-            output (torch.Tensor): Loss of shape [b, s] if labels are provided, otherwise logits of shape [b, s, vocab_size].
+            output (torch.Tensor): Loss ([b, s]) if labels are provided; logits ([b, s, vocab_size]) otherwise.
            loss_mask (torch.Tensor): Loss mask expanded to combined sequence length. Shape [b, s].
        """
@@ -248,7 +250,8 @@ def forward(
        use_inference_kv_cache = (
        )
        has_images = media.shape[0] > 0
-        # If running inference, we can skip media token computation if they were computed already earlier for this sample.
+        # If running inference, we can skip media token computation
+        # if they were computed already earlier for this sample.
        if use_inference_kv_cache:
            media_embeddings = None
        elif self.add_encoder and not has_images:
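Note on the `llava_next_data_step` hunk above: the wrapped comment documents the batch-unpacking convention the function follows, visible in the surrounding context lines. A minimal sketch of that convention, assuming the (batch, batch_idx, dataloader_idx) tuple shape that the `isinstance` check implies (the function name here is illustrative, not the module's API):

```python
from typing import Dict

import torch


def unpack_batch_sketch(dataloader_iter) -> Dict[str, torch.Tensor]:
    """Illustrative only: mirrors the unpacking visible in the hunk above."""
    batch = next(dataloader_iter)
    # Lightning-style iterators can yield (batch, batch_idx, dataloader_idx);
    # plain iterators yield the batch dict directly.
    _batch: dict
    if isinstance(batch, tuple) and len(batch) == 3:
        _batch = batch[0]  # assumption: the dict is the tuple's first element
    else:
        _batch = batch
    return _batch
```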
diff --git a/nemo/collections/vlm/llava_next/model/llava_next.py b/nemo/collections/vlm/llava_next/model/llava_next.py
index 4db2716360ee..8b0d79da9bf0 100644
--- a/nemo/collections/vlm/llava_next/model/llava_next.py
+++ b/nemo/collections/vlm/llava_next/model/llava_next.py
@@ -60,7 +60,7 @@ class LlavaNextModel(NevaModel):

    Attributes:
        config (LlavaNextConfig): Configuration object for the model.
-        optim (Optional[OptimizerModule]): Optimizer module for training the model. Defaults to a Megatron optimizer.
+        optim (Optional[OptimizerModule]): Optimizer module. Defaults to a Megatron optimizer.
        tokenizer (Optional[TokenizerSpec]): Tokenizer specification for processing text inputs.
        model_transform (Optional[Callable[[torch.nn.Module], torch.nn.Module]]):
            Optional transformation applied to the model after initialization.
@@ -78,7 +78,7 @@ def __init__(

        Args:
            config (LlavaNextConfig): Configuration object for the model.
-            optim (Optional[OptimizerModule]): Optional optimizer module. If not provided, a default Megatron optimizer is used.
+            optim (Optional[OptimizerModule]): Optimizer module. Defaults to a Megatron optimizer.
            tokenizer (Optional[TokenizerSpec]): Optional tokenizer specification for processing text inputs.
            model_transform (Optional[Callable[[torch.nn.Module], torch.nn.Module]]): Optional transformation
                function applied to the model after initialization.
@@ -121,7 +121,7 @@ def forward(
            position_ids (torch.Tensor): Position IDs of shape [batch, text_seq_len].
            image_sizes (torch.Tensor): Raw image sizes before tiling, of shape [batch, 2].
            loss_mask (Optional[torch.Tensor]): Text loss mask of shape [batch, text_seq_len].
-            attention_mask (Optional[torch.Tensor]): Attention mask (before merging image embeddings) of shape [batch, text_seq_len].
+            attention_mask (Optional[torch.Tensor]): Attention mask of shape [batch, text_seq_len].
            media (Optional[torch.Tensor]): Input media tensor.
            labels (Optional[torch.Tensor]): Target labels of shape [batch, combined_seq_len].
            inference_params (InferenceParams): Inference-time parameters.
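Note on the `forward` docstring hunks above: the shortened descriptions keep the documented shape contracts. A toy sketch of tensors matching those shapes (sizes and values are made up for illustration; this does not invoke the model):

```python
import torch

batch, text_seq_len = 2, 32
input_ids = torch.randint(0, 32000, (batch, text_seq_len))    # [batch, text_seq_len]
position_ids = torch.arange(text_seq_len).repeat(batch, 1)    # [batch, text_seq_len]
image_sizes = torch.tensor([[336, 504], [672, 336]])          # raw (h, w) per image, [batch, 2]
loss_mask = torch.ones(batch, text_seq_len)                   # [batch, text_seq_len]
attention_mask = None                                         # optional, [batch, text_seq_len]
```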
diff --git a/nemo/collections/vlm/llava_next/model/utils.py b/nemo/collections/vlm/llava_next/model/utils.py
index 4a2b5e4b3b3f..2996bc277983 100644
--- a/nemo/collections/vlm/llava_next/model/utils.py
+++ b/nemo/collections/vlm/llava_next/model/utils.py
@@ -158,8 +158,8 @@ def merge_input_ids_with_image_features(
    total_num_special_image_tokens = torch.sum(special_image_token_mask)
    if total_num_special_image_tokens != num_images:
        raise ValueError(
-            f"Number of image tokens in input_ids ({total_num_special_image_tokens})
-            different from num_images ({num_images})."
+            f"Number of image tokens in input_ids ({total_num_special_image_tokens}) "
+            f"different from num_images ({num_images})."
        )
    # Compute the maximum embed dimension
    # max_image_feature_lens is max_feature_lens per batch
@@ -274,8 +274,8 @@ def unpad_image(tensor, original_size):
    if not isinstance(original_size, (list, tuple)):
        if not isinstance(original_size, (torch.Tensor, np.ndarray)):
            raise TypeError(
-                f"image_size invalid type: {type(original_size)} not valid,
-                should be either list, tuple, np.ndarray or tensor"
+                f"image_size invalid type: {type(original_size)} not valid, "
+                "should be either list, tuple, np.ndarray or tensor"
            )
        original_size = original_size.tolist()
    original_height, original_width = original_size
@@ -355,8 +355,8 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
    if not isinstance(image_size, (list, tuple)):
        if not isinstance(image_size, (torch.Tensor, np.ndarray)):
            raise TypeError(
-                f"image_size invalid type: {type(image_size)} not valid,
-                should be either list, tuple, np.ndarray or tensor"
+                f"image_size invalid type: {type(image_size)} not valid, "
+                "should be either list, tuple, np.ndarray or tensor"
            )
        image_size = image_size.tolist()
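Note on the three exception hunks above: the removed lines split a single string literal across physical lines, which is a syntax error; the fix relies on Python's implicit concatenation of adjacent string literals. A minimal self-contained sketch of the corrected pattern (the helper name is made up):

```python
import numpy as np
import torch


def _validate_image_size(image_size):
    """Illustrative only: shows the adjacent-literal pattern used in the fix."""
    if not isinstance(image_size, (list, tuple, torch.Tensor, np.ndarray)):
        # Adjacent literals concatenate into one message. Do not put a comma
        # between them: TypeError would then receive two separate arguments.
        raise TypeError(
            f"image_size invalid type: {type(image_size)} not valid, "
            "should be either list, tuple, np.ndarray or tensor"
        )
```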