Fix neva tutorial
Signed-off-by: yaoyu-33 <[email protected]>
yaoyu-33 committed Jul 10, 2024
1 parent 2b2e62d commit b0e05a1
Showing 3 changed files with 132 additions and 145 deletions.
@@ -71,10 +71,10 @@ model:
   freeze: False
   model_type: llama_2 # Only support nvgpt or llama_2
   vision_encoder:
-    from_pretrained: "openai/clip-vit-large-patch14" # path or name
+    from_pretrained: "openai/clip-vit-large-patch14-336" # path or name
     from_hf: True
     patch_dim: 14
-    crop_size: [224, 224]
+    crop_size: [336, 336]
     hidden_size: 1024 # could be found from model but tricky in code
     vision_select_layer: -2 # default to the last layer
     class_token_length: 1
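
The two config edits go together: the clip-vit-large-patch14-336 checkpoint is trained on 336x336 crops, so crop_size has to follow from_pretrained, and with patch_dim: 14 the higher resolution also lengthens the visual token sequence. A quick sanity check of that arithmetic, as a standalone Python sketch (plain ViT patch math under the values above, not code from this commit):

# ViT patch arithmetic for the two CLIP encoders; patch_dim as in the config.
patch_dim = 14
for crop in (224, 336):
    per_side = crop // patch_dim           # 16 at 224 px, 24 at 336 px
    n_tokens = per_side ** 2               # 256 vs 576 patch tokens
    print(f"{crop}px crop -> {per_side}x{per_side} = {n_tokens} patch tokens")
# class_token_length: 1 adds one CLS token on top of the patch tokens.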
scripts/checkpoint_converters/convert_llava_hf_to_nemo.py (2 additions, 2 deletions)
@@ -292,7 +292,7 @@ def convert(args):
     batch_dict = hf_tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')
     batch_dict_cuda = {k: v.cuda() for k, v in batch_dict.items()}
     hf_model = hf_model.cuda().eval()
-    model = model.eval()
+    model = model.cuda().eval()
 
     hf_outputs = hf_model(**batch_dict_cuda, output_hidden_states=True)
     ids = batch_dict_cuda['input_ids']
@@ -307,7 +307,7 @@ def convert(args):
     attn_mask, _, pos_ids = attn_mask_and_pos_ids
 
     outputs = model(
-        tokens=tokens, text_position_ids=pos_ids.cuda(), attention_mask=attn_mask.cuda(), labels=None
+        tokens=tokens.cuda(), text_position_ids=pos_ids.cuda(), attention_mask=attn_mask.cuda(), labels=None
     )
 
     hf_next_token = hf_outputs.logits[0, -1].argmax()
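
Both code changes fix the same class of bug: the Hugging Face model and batch_dict were already moved to CUDA, but the NeMo model and the tokens tensor were left on the CPU, and PyTorch raises a device-mismatch RuntimeError as soon as CPU and GPU tensors meet in a single op. A minimal sketch of the failure and the fix (standalone PyTorch with a toy nn.Linear standing in for the converted model; assumes a CUDA device is available):

import torch
import torch.nn as nn

model = nn.Linear(8, 8)          # toy stand-in for the converted NeMo model
tokens = torch.randn(1, 8)       # created on the CPU, like the script's tokens

model = model.cuda().eval()      # first half of the fix: weights to the GPU
try:
    model(tokens)                # input still on the CPU -> device mismatch
except RuntimeError as err:
    print(f"mismatch: {err}")

with torch.no_grad():
    out = model(tokens.cuda())   # second half: move the input as well
print(out.device)                # cuda:0

The same .cuda() pattern was already applied to pos_ids and attn_mask in the original call; the commit extends it to the model weights and the tokens tensor.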