diff --git a/script/app-mlperf-inference-mlcommons-python/customize.py b/script/app-mlperf-inference-mlcommons-python/customize.py
index c00ba02883..b56896d6ce 100644
--- a/script/app-mlperf-inference-mlcommons-python/customize.py
+++ b/script/app-mlperf-inference-mlcommons-python/customize.py
@@ -297,6 +297,7 @@ def get_run_cmd_reference(os_info, env, scenario_extra_options, mode_extra_optio
         env['RUN_DIR'] = os.path.join(env['CM_MLPERF_INFERENCE_SOURCE'], "language", "llama2-70b")
         backend = env['CM_MLPERF_BACKEND']
         device = env['CM_MLPERF_DEVICE'] if env['CM_MLPERF_DEVICE'] != "gpu" else "cuda"
+
         cmd = env['CM_PYTHON_BIN_WITH_PATH'] + " main.py " \
             " --scenario " + env['CM_MLPERF_LOADGEN_SCENARIO'] + \
             " --dataset-path " + env['CM_DATASET_PREPROCESSED_PATH'] + \
@@ -305,12 +306,17 @@ def get_run_cmd_reference(os_info, env, scenario_extra_options, mode_extra_optio
             scenario_extra_options + mode_extra_options + \
             " --output-log-dir " + env['CM_MLPERF_OUTPUT_DIR'] + \
             ' --dtype ' + env['CM_MLPERF_MODEL_PRECISION']
+
         if env.get('CM_MLPERF_INFERENCE_API_SERVER', '') != '':
             env['CM_VLLM_SERVER_MODEL_NAME'] = env.get("CM_VLLM_SERVER_MODEL_NAME") or "NousResearch/Meta-Llama-3-8B-Instruct"
             #env['CM_MLPERF_INFERENCE_API_SERVER'] = "http://localhost:8000"
             cmd += f" --api-server {env['CM_MLPERF_INFERENCE_API_SERVER']} --model-path {env['CM_VLLM_SERVER_MODEL_NAME']} --api-model-name {env['CM_VLLM_SERVER_MODEL_NAME']} --vllm "
         else:
             cmd += f" --model-path {env['MODEL_DIR']}"
+
+        if env.get('CM_MLPERF_INFERENCE_NUM_WORKERS', '') != '':
+            cmd += f" --num-workers {env['CM_MLPERF_INFERENCE_NUM_WORKERS']}"
+
         cmd = cmd.replace("--count", "--total-sample-count")
 
     elif "mixtral-8x7b" in env['CM_MODEL']:
diff --git a/script/run-mlperf-inference-app/_cm.yaml b/script/run-mlperf-inference-app/_cm.yaml
index 7f9427bc16..706fd1dd2d 100644
--- a/script/run-mlperf-inference-app/_cm.yaml
+++ b/script/run-mlperf-inference-app/_cm.yaml
@@ -100,6 +100,7 @@ input_mapping:
   nvidia_llama2_dataset_file_path: CM_NVIDIA_LLAMA_DATASET_FILE_PATH
   tp_size: CM_NVIDIA_TP_SIZE
   vllm_model_name: CM_VLLM_SERVER_MODEL_NAME
+  num_workers: CM_MLPERF_INFERENCE_NUM_WORKERS
 
 new_state_keys:
 - app_mlperf_inference_*
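
Usage sketch (not part of the patch): with the new `num_workers` input mapping, the worker count should be settable from the CM command line and forwarded to the llama2-70b reference harness as `--num-workers`. The tags and the model/device flags below are illustrative assumptions, not taken from this diff:

    cm run script --tags=run-mlperf,inference \
        --model=llama2-70b-99 \
        --device=cuda \
        --num_workers=2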