From 2f21d8db6a8f981290182b7993f83de4b0e901fa Mon Sep 17 00:00:00 2001
From: Huy Vu2
Date: Wed, 31 Jan 2024 14:46:21 -0800
Subject: [PATCH] revert to original eval code files 4

---
 .../conf/megatron_retro_inference.yaml        | 55 ++++++++-----------
 1 file changed, 22 insertions(+), 33 deletions(-)

diff --git a/examples/nlp/language_modeling/conf/megatron_retro_inference.yaml b/examples/nlp/language_modeling/conf/megatron_retro_inference.yaml
index 7fe07ea2b6b9..1b99a65f46ad 100644
--- a/examples/nlp/language_modeling/conf/megatron_retro_inference.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_retro_inference.yaml
@@ -9,13 +9,7 @@ inference:
   repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty.
   min_tokens_to_generate: 0 # The minimum length of the sequence to be generated.
   compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False
-  end_strings: ["<|endoftext|>"] # generation will stop when one of these tokens is generated
-  # RETRO-specific arguments
-  retro_inference:
-    retro_gpt_retrieved_length: 128
-    retro_num_neighbors: 2
-    ft_neighbours: 0
-    reuse_top: True
+
 
 trainer:
   devices: 1
@@ -23,33 +17,28 @@ trainer:
   accelerator: gpu
   logger: False # logger provided by exp_manager
   precision: 16 # 16, 32, or bf16
-  use_distributed_sampler: False
-
 
+inference_batch_size: 2
 tensor_model_parallel_size: -1
 pipeline_model_parallel_size: -1
 pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others)
-megatron_amp_O2: False # Enable O2-level automatic mixed precision to save memory
-
-
-retro_model_file: null # Retro nemo file path
-checkpoint_dir: null # checkpoint file dir. This is used to load the PTL checkpoint generated during the Retro training
-checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading
-hparams_file: null # model configuration file, only used for PTL checkpoint loading
-
-
-prompts: # prompts for Retro inference
-  - "prompt1"
-  - "prompt2"
-
-# RETRO inference
-neighbors: null
-
-server: False # whether launch the API server
-port: 5555 # the port number for the inference server
-web_server: False # whether launch the web inference server
-share: False # whether create a public URL
-username: test # user name for web client
-password: test2 # password for web client
-web_port: 9889 # the port number of the web server
-
+retro_model_file: null # RETRO nemo file path
+
+use_predict_method: False # whether to use the predict method
+
+prompts: # prompts for RETRO model inference
+  - "hello,"
+  - "good morning,"
+  - "good afternoon,"
+  - "good evening,"
+
+########### Faiss service parameters ########
+retrieval_service:
+  strategy: RetroModelTextGenerationStrategy # choose customized inference strategy
+  neighbors: 4
+  frequent_query: False # for the current token generation, frequently update the retrieval context. If false, update it every 64 tokens
+  pad_tokens: True # pad the tokens at the beginning to make it minimum of 64 tokens for retrieving at least once
+  store_retrieved: False # whether store the retrieved documents, so it can be checked
+  combo_service:
+    service_ip: '0.0.0.0'
+    service_port: 17181
\ No newline at end of file
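
Usage sketch (not part of the patch): NeMo's example scripts consume this YAML through Hydra/OmegaConf, so the reverted config can be loaded and overridden programmatically to verify the restored keys. This assumes a NeMo checkout with omegaconf installed and is run from the repo root; the .nemo path and the override values below are illustrative placeholders.

    from omegaconf import OmegaConf

    # Load the reverted config exactly as it sits on disk after applying the patch.
    cfg = OmegaConf.load(
        "examples/nlp/language_modeling/conf/megatron_retro_inference.yaml"
    )

    # Command-line-style dotlist overrides, mirroring Hydra's `key=value` syntax.
    overrides = OmegaConf.from_dotlist([
        "retro_model_file=/path/to/retro_model.nemo",  # placeholder path
        "retrieval_service.neighbors=2",               # fetch fewer neighbors per query
        "inference_batch_size=4",
    ])
    cfg = OmegaConf.merge(cfg, overrides)

    print(cfg.retrieval_service.strategy)                    # RetroModelTextGenerationStrategy
    print(cfg.retrieval_service.combo_service.service_port)  # 17181

Merging a dotlist on top of the loaded file approximates what Hydra does with command-line overrides, which makes this a quick way to sanity-check that the revert restored keys such as inference_batch_size, use_predict_method, and the retrieval_service block.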