From a88081bf768fcc1c662e4f588bd01ca9ddcc6aad Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Fri, 26 Apr 2024 16:16:58 +0900 Subject: [PATCH] [CI] Disable non-lazy string operation on logging (#4326) Co-authored-by: Danny Guinther --- docs/source/conf.py | 5 +- pyproject.toml | 1 + setup.py | 7 ++- vllm/config.py | 16 ++--- vllm/core/scheduler.py | 10 +-- .../device_communicators/custom_all_reduce.py | 8 +-- .../device_communicators/pynccl.py | 15 ++--- .../device_communicators/pynccl_utils.py | 4 +- vllm/distributed/parallel_state.py | 6 +- vllm/distributed/utils.py | 4 +- vllm/engine/async_llm_engine.py | 18 +++--- vllm/engine/llm_engine.py | 61 +++++++++++-------- vllm/engine/metrics.py | 21 ++++--- vllm/entrypoints/openai/api_server.py | 4 +- vllm/entrypoints/openai/serving_chat.py | 11 ++-- vllm/executor/cpu_executor.py | 2 +- vllm/executor/gpu_executor.py | 4 +- vllm/executor/ray_gpu_executor.py | 4 +- vllm/executor/ray_utils.py | 6 +- vllm/logger.py | 2 +- vllm/lora/models.py | 6 +- .../layers/fused_moe/fused_moe.py | 4 +- .../model_executor/model_loader/tensorizer.py | 8 +-- .../model_loader/weight_utils.py | 14 ++--- vllm/model_executor/models/__init__.py | 10 +-- vllm/model_executor/models/gemma.py | 6 +- vllm/spec_decode/spec_decode_worker.py | 3 +- vllm/transformers_utils/configs/dbrx.py | 13 ++-- vllm/transformers_utils/tokenizer.py | 5 +- vllm/utils.py | 20 +++--- vllm/worker/model_runner.py | 27 ++++---- 31 files changed, 176 insertions(+), 149 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index aac8cbb63ebeb..9da5a4991734d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -98,9 +98,10 @@ def setup(app): for mock_target in autodoc_mock_imports: if mock_target in sys.modules: logger.info( - f"Potentially problematic mock target ({mock_target}) found; " + "Potentially problematic mock target (%s) found; " "autodoc_mock_imports cannot mock modules that have already " - "been loaded into sys.modules when the sphinx build starts.") + "been loaded into sys.modules when the sphinx build starts.", + mock_target) class MockedClassDocumenter(autodoc.ClassDocumenter): diff --git a/pyproject.toml b/pyproject.toml index a171d45b4e064..2e026c1ac8911 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ select = [ "SIM", # isort # "I", + "G", ] ignore = [ # star imports diff --git a/setup.py b/setup.py index 4b672e1af8494..6ba36b85ea318 100644 --- a/setup.py +++ b/setup.py @@ -63,7 +63,7 @@ def compute_num_jobs(self): num_jobs = os.environ.get("MAX_JOBS", None) if num_jobs is not None: num_jobs = int(num_jobs) - logger.info(f"Using MAX_JOBS={num_jobs} as the number of jobs.") + logger.info("Using MAX_JOBS=%d as the number of jobs.", num_jobs) else: try: # os.sched_getaffinity() isn't universally available, so fall @@ -81,8 +81,9 @@ def compute_num_jobs(self): nvcc_threads = os.getenv("NVCC_THREADS", None) if nvcc_threads is not None: nvcc_threads = int(nvcc_threads) - logger.info(f"Using NVCC_THREADS={nvcc_threads} as the number" - " of nvcc threads.") + logger.info( + "Using NVCC_THREADS=%d as the number of nvcc threads.", + nvcc_threads) else: nvcc_threads = 1 num_jobs = max(1, num_jobs // nvcc_threads) diff --git a/vllm/config.py b/vllm/config.py index 311a69f822571..887a73d9462dc 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -167,9 +167,9 @@ def _verify_quantization(self) -> None: f"supported in ROCm.") if self.quantization != "marlin": logger.warning( - f"{self.quantization} quantization is not fully " + "%s quantization is not 
fully " "optimized yet. The speed can be slower than " - "non-quantized models.") + "non-quantized models.", self.quantization) def _verify_cuda_graph(self) -> None: if self.max_context_len_to_capture is None: @@ -360,7 +360,7 @@ def verify_with_parallel_config( if cpu_memory_usage > 0.7 * total_cpu_memory: raise ValueError("Too large swap space. " + msg) elif cpu_memory_usage > 0.4 * total_cpu_memory: - logger.warning("Possibly too large swap space. " + msg) + logger.warning("Possibly too large swap space. %s", msg) @dataclass @@ -898,8 +898,8 @@ def verify_with_model_config(self, model_config: ModelConfig): "awq", "gptq" ]: # TODO support marlin and squeezellm - logger.warning(f"{model_config.quantization} quantization is not " - "tested with LoRA yet.") + logger.warning("%s quantization is not tested with LoRA yet.", + model_config.quantization) def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): if scheduler_config.max_num_batched_tokens > 65528: @@ -1008,7 +1008,7 @@ def _get_and_verify_dtype( pass else: # Casting between float16 and bfloat16 is allowed with a warning. - logger.warning(f"Casting {config_dtype} to {torch_dtype}.") + logger.warning("Casting %s to %s.", config_dtype, torch_dtype) return torch_dtype @@ -1051,8 +1051,8 @@ def _get_and_verify_max_len( logger.warning( "The model's config.json does not contain any of the following " "keys to determine the original maximum length of the model: " - f"{possible_keys}. Assuming the model's maximum length is " - f"{default_max_len}.") + "%d. Assuming the model's maximum length is %d.", possible_keys, + default_max_len) derived_max_model_len = default_max_len rope_scaling = getattr(hf_config, "rope_scaling", None) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 99f7a34d336a4..ac3bd7d228e94 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -617,8 +617,9 @@ def _schedule_prefills( if num_new_tokens > self.prompt_limit: logger.warning( - f"Input prompt ({num_new_tokens} tokens) is too long" - f" and exceeds limit of {self.prompt_limit}") + "Input prompt (%d tokens) is too long" + " and exceeds limit of %d", num_new_tokens, + self.prompt_limit) for seq in waiting_seqs: seq.status = SequenceStatus.FINISHED_IGNORED ignored_seq_groups.append(seq_group) @@ -631,8 +632,9 @@ def _schedule_prefills( break elif can_allocate == AllocStatus.NEVER: logger.warning( - f"Input prompt ({num_new_tokens} tokens) is too long" - f" and exceeds the capacity of block_manager") + "Input prompt (%d tokens) is too long" + " and exceeds the capacity of block_manager", + num_new_tokens) for seq in waiting_seqs: seq.status = SequenceStatus.FINISHED_IGNORED ignored_seq_groups.append(seq_group) diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index 9dbb427d91ff1..ec4533326e841 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -37,7 +37,7 @@ def init_custom_ar() -> None: return if world_size not in _SUPPORTED_WORLD_SIZES: - logger.warn( + logger.warning( "Custom allreduce is disabled due to an unsupported world size: " "%d. Supported world sizes: %s. 
To silence this warning, specify" " disable_custom_all_reduce=True explicitly.", world_size, @@ -47,7 +47,7 @@ def init_custom_ar() -> None: # note: num dev can be larger than world_size if we're only using # first few GPUs if num_dev < world_size: - logger.warn( + logger.warning( "Cannot test GPU P2P because not all GPUs are visible to the " "current process. This might be the case if 'CUDA_VISIBLE_DEVICES'" " is set.") @@ -62,7 +62,7 @@ def init_custom_ar() -> None: # this checks hardware and driver support for NVLink full_nvlink = _is_full_nvlink(device_ids) if world_size > 2 and not full_nvlink: - logger.warn( + logger.warning( "Custom allreduce is disabled because it's not supported on more" " than two PCIe-only GPUs. To silence this warning, specify" " disable_custom_all_reduce=True explicitly.") @@ -71,7 +71,7 @@ def init_custom_ar() -> None: # this is expensive to compute at the first time # then we cache the result if not _can_p2p(rank, world_size): - logger.warn( + logger.warning( "Custom allreduce is disabled because your platform lacks GPU P2P" " capability or P2P test failed. To silence this warning, specify" " disable_custom_all_reduce=True explicitly.") diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index e922beba44bfa..9434867e1b120 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -43,15 +43,16 @@ nccl = ctypes.CDLL(so_file) except Exception as e: logger.error( - f"Failed to load NCCL library from {so_file} ." + "Failed to load NCCL library from %s ." "It is expected if you are not running on NVIDIA/AMD GPUs." "Otherwise, the nccl library might not exist, be corrupted " - f"or it does not support the current platform {platform.platform()}." - f"One solution is to download libnccl2 version 2.18 from " - f"https://developer.download.nvidia.com/compute/cuda/repos/ " - f"and extract the libnccl.so.2 file. If you already have the " - f"library, please set the environment variable VLLM_NCCL_SO_PATH" - " to point to the correct nccl library path.") + "or it does not support the current platform %s." + "One solution is to download libnccl2 version 2.18 from " + "https://developer.download.nvidia.com/compute/cuda/repos/ " + "and extract the libnccl.so.2 file. If you already have the " + "library, please set the environment variable VLLM_NCCL_SO_PATH" + " to point to the correct nccl library path.", so_file, + platform.platform()) raise e # === export types and functions from nccl to Python === diff --git a/vllm/distributed/device_communicators/pynccl_utils.py b/vllm/distributed/device_communicators/pynccl_utils.py index a717fddb695ba..44e4f39217a41 100644 --- a/vllm/distributed/device_communicators/pynccl_utils.py +++ b/vllm/distributed/device_communicators/pynccl_utils.py @@ -14,7 +14,7 @@ except Exception as e: # in non-NVIDIA environments, we can't import the nccl module # e.g. 
when running on machines with AMD GPUs - logger.info(f"Failed to import NCCL library: {e}") + logger.info("Failed to import NCCL library: %s", e) logger.info("It is expected if you are not running on NVIDIA GPUs.") pass @@ -40,7 +40,7 @@ def set_pynccl_stream(stream: torch.cuda.Stream): def init_process_group(group: Optional[ProcessGroup] = None) -> None: assert not is_initialized() global comm - logger.info(f"vLLM is using nccl=={ncclGetVersion()}") + logger.info("vLLM is using nccl==%s", ncclGetVersion()) comm = NCCLCommunicator(group=group) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 515f2212511b7..6ca6fc5b5f9fe 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -57,8 +57,10 @@ def init_distributed_environment( local_rank: int = -1, backend: str = "nccl", ): - logger.debug(f"{world_size=} {rank=} {local_rank=} " - f"{distributed_init_method=} {backend=}") + logger.debug( + "world_size=%d rank=%d local_rank=%d " + "distributed_init_method=%s backend=%s", world_size, rank, local_rank, + distributed_init_method, backend) if not torch.distributed.is_initialized(): assert distributed_init_method is not None, ( "distributed_init_method must be provided when initializing " diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index e0a871ebe1756..9a13b94c3ada1 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -112,7 +112,7 @@ def gpu_p2p_access_check(i: int, j: int) -> bool: and (not os.path.exists(path)): # only the local master process (with local_rank == 0) can # enter this block to calculate the cache - logger.info(f"generating GPU P2P access cache for in {path}") + logger.info("generating GPU P2P access cache for in %s", path) cache = {} for _i in range(num_dev): for _j in range(num_dev): @@ -126,7 +126,7 @@ def gpu_p2p_access_check(i: int, j: int) -> bool: if is_distributed: cpu_world_group = get_cpu_world_group() dist.barrier(cpu_world_group) - logger.info(f"reading GPU P2P access cache from {path}") + logger.info("reading GPU P2P access cache from %s", path) with open(path, "r") as f: cache = json.load(f) _gpu_p2p_access_cache = cache diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 4b007d71e9cfc..518532e4a280d 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -117,7 +117,7 @@ def process_request_output(self, self._request_streams[request_id].put(request_output) if request_output.finished: if verbose: - logger.info(f"Finished request {request_id}.") + logger.info("Finished request %s.", request_id) self.abort_request(request_id) def process_exception(self, @@ -128,7 +128,7 @@ def process_exception(self, """Propagate an exception from the engine.""" self._request_streams[request_id].put(exception) if verbose: - logger.info(f"Finished request {request_id}.") + logger.info("Finished request %s.", request_id) self.abort_request(request_id) def add_request(self, request_id: str, @@ -151,7 +151,7 @@ def add_request(self, request_id: str, def abort_request(self, request_id: str, *, verbose: bool = False) -> None: """Abort a request during next background loop iteration.""" if verbose: - logger.info(f"Aborted request {request_id}.") + logger.info("Aborted request %s.", request_id) self._finished_requests.put_nowait(request_id) @@ -521,11 +521,11 @@ async def add_request( if shortened_token_ids is not None: shortened_token_ids = shortened_token_ids[:self. 
max_log_len] - logger.info(f"Received request {request_id}: " - f"prompt: {shortened_prompt!r}, " - f"sampling_params: {sampling_params}, " - f"prompt_token_ids: {shortened_token_ids}, " - f"lora_request: {lora_request}.") + logger.info( + "Received request %s: prompt: %r, " + "sampling_params: %s, prompt_token_ids: %s, " + "lora_request: %s.", request_id, shortened_prompt, + sampling_params, shortened_token_ids, lora_request) if not self.is_running: if self.start_engine_loop: @@ -717,4 +717,4 @@ async def check_health(self) -> None: raise RuntimeError("Engine is dead.") from e else: await self.engine.check_health_async() - logger.debug(f"Health check took {time.perf_counter()-t}s") + logger.debug("Health check took %fs", time.perf_counter() - t) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 7de60d738113e..d2f5379e621c6 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -96,29 +96,38 @@ def __init__( usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, ) -> None: logger.info( - f"Initializing an LLM engine (v{vllm.__version__}) with config: " - f"model={model_config.model!r}, " - f"speculative_config={speculative_config!r}, " - f"tokenizer={model_config.tokenizer!r}, " - f"skip_tokenizer_init={model_config.skip_tokenizer_init}, " - f"tokenizer_mode={model_config.tokenizer_mode}, " - f"revision={model_config.revision}, " - f"tokenizer_revision={model_config.tokenizer_revision}, " - f"trust_remote_code={model_config.trust_remote_code}, " - f"dtype={model_config.dtype}, " - f"max_seq_len={model_config.max_model_len}, " - f"download_dir={load_config.download_dir!r}, " - f"load_format={load_config.load_format}, " - f"tensor_parallel_size={parallel_config.tensor_parallel_size}, " - f"disable_custom_all_reduce=" - f"{parallel_config.disable_custom_all_reduce}, " - f"quantization={model_config.quantization}, " - f"enforce_eager={model_config.enforce_eager}, " - f"kv_cache_dtype={cache_config.cache_dtype}, " - f"quantization_param_path={model_config.quantization_param_path}, " - f"device_config={device_config.device}, " - f"decoding_config={decoding_config!r}, " - f"seed={model_config.seed})") + "Initializing an LLM engine (v%s) with config: " + "model=%r, speculative_config=%r, tokenizer=%r, " + "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, " + "tokenizer_revision=%s, trust_remote_code=%s, dtype=%s, " + "max_seq_len=%d, download_dir=%r, load_format=%s, " + "tensor_parallel_size=%d, disable_custom_all_reduce=%s" + "quantization=%s, enforce_eager=%s, kv_cache_dtype=%s, " + "quantization_param_path=%s, device_config=%s, " + "decoding_config=%r, seed=%d)", + vllm.__version__, + model_config.model, + speculative_config, + model_config.tokenizer, + model_config.skip_tokenizer_init, + model_config.tokenizer_mode, + model_config.revision, + model_config.tokenizer_revision, + model_config.trust_remote_code, + model_config.dtype, + model_config.max_model_len, + load_config.download_dir, + load_config.load_format, + parallel_config.tensor_parallel_size, + parallel_config.disable_custom_all_reduce, + model_config.quantization, + model_config.enforce_eager, + cache_config.cache_dtype, + model_config.quantization_param_path, + device_config.device, + decoding_config, + model_config.seed, + ) # TODO(woosuk): Print more configs in debug mode. 
self.model_config = model_config @@ -237,8 +246,10 @@ def _initialize_kv_caches(self) -> None: if self.cache_config.num_gpu_blocks_override is not None: num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override - logger.info(f"Overriding {num_gpu_blocks=} with " - f"{num_gpu_blocks_override=}") + logger.info( + "Overriding num_gpu_blocks=%d with " + "num_gpu_blocks_override=%d", num_gpu_blocks, + num_gpu_blocks_override) num_gpu_blocks = num_gpu_blocks_override self.cache_config.num_gpu_blocks = num_gpu_blocks diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 25e96f6c7eaf7..d3560f5fefff1 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -227,14 +227,19 @@ def log(self, stats: Stats) -> None: # Log to stdout. logger.info( - f"Avg prompt throughput: {prompt_throughput:.1f} tokens/s, " - f"Avg generation throughput: " - f"{generation_throughput:.1f} tokens/s, " - f"Running: {stats.num_running} reqs, " - f"Swapped: {stats.num_swapped} reqs, " - f"Pending: {stats.num_waiting} reqs, " - f"GPU KV cache usage: {stats.gpu_cache_usage * 100:.1f}%, " - f"CPU KV cache usage: {stats.cpu_cache_usage * 100:.1f}%") + "Avg prompt throughput: %.1f tokens/s, " + "Avg generation throughput: %.1f tokens/s, " + "Running: %d reqs, Swapped: %d reqs, " + "Pending: %d reqs, GPU KV cache usage: %.1f%, " + "CPU KV cache usage: %.1f%", + prompt_throughput, + generation_throughput, + stats.num_running, + stats.num_swapped, + stats.num_waiting, + stats.gpu_cache_usage * 100, + stats.cpu_cache_usage * 100, + ) # Reset tracked stats for next interval. self.num_prompt_tokens = [] diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 37d76b8e74055..af9ba7a3bc825 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -148,8 +148,8 @@ async def authentication(request: Request, call_next): raise ValueError(f"Invalid middleware {middleware}. " f"Must be a function or a class.") - logger.info(f"vLLM API server version {vllm.__version__}") - logger.info(f"args: {args}") + logger.info("vLLM API server version %s", vllm.__version__) + logger.info("args: %s", args) if args.served_model_name is not None: served_model_names = args.served_model_name diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 2ff335eb71073..f6011b6fc4cb6 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -57,8 +57,7 @@ async def create_chat_completion( tokenize=False, add_generation_prompt=request.add_generation_prompt) except Exception as e: - logger.error( - f"Error in applying chat template from request: {str(e)}") + logger.error("Error in applying chat template from request: %s", e) return self.create_error_response(str(e)) request_id = f"cmpl-{random_uuid()}" @@ -338,11 +337,11 @@ def _load_chat_template(self, chat_template): tokenizer.chat_template = codecs.decode( chat_template, "unicode_escape") - logger.info( - f"Using supplied chat template:\n{tokenizer.chat_template}") + logger.info("Using supplied chat template:\n%s", + tokenizer.chat_template) elif tokenizer.chat_template is not None: - logger.info( - f"Using default chat template:\n{tokenizer.chat_template}") + logger.info("Using default chat template:\n%s", + tokenizer.chat_template) else: logger.warning( "No chat template provided. 
Chat API will not work.") diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 8d6a1fff91fd8..aa810f9743395 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -69,7 +69,7 @@ def initialize_cache(self, num_gpu_blocks: int, # NOTE: `cpu block` for CPU backend is located on CPU memory but is # referred as `gpu block`. Because we want to reuse the existing block # management procedure. - logger.info(f"# CPU blocks: {num_gpu_blocks}") + logger.info("# CPU blocks: %d", num_gpu_blocks) self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) def execute_model(self, diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index d413a7d27ff37..d2c60a3b68e14 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -116,8 +116,8 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: # NOTE: This is logged in the executor because there can be >1 worker # with other executors. We could log in the engine level, but work # remains to abstract away the device for non-GPU configurations. - logger.info(f"# GPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}") + logger.info("# GPU blocks: %d, # CPU blocks: %d", num_gpu_blocks, + num_cpu_blocks) self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 14b3f803782c6..6f72babe14fd5 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -214,8 +214,8 @@ def initialize_cache(self, num_gpu_blocks: int, # NOTE: We log here to avoid multiple logs when number of workers is # greater than one. We could log in the engine, but not all executors # have GPUs. - logger.info(f"# GPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}") + logger.info("# GPU blocks: %d, # CPU blocks: %d", num_gpu_blocks, + num_cpu_blocks) self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index febae42b84549..9db3ae2ff8298 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -43,9 +43,9 @@ def execute_model_compiled_dag_remote(self, ignored): return output except ImportError as e: - logger.warning(f"Failed to import Ray with {e!r}. " - "For distributed inference, please install Ray with " - "`pip install ray`.") + logger.warning( + "Failed to import Ray with %r. For distributed inference, " + "please install Ray with `pip install ray`.", e) ray = None # type: ignore RayWorkerWrapper = None # type: ignore diff --git a/vllm/logger.py b/vllm/logger.py index 341fc473585d7..3928e5367d1e6 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -126,7 +126,7 @@ def enable_trace_function_call(log_file_path: str, "VLLM_TRACE_FUNCTION is enabled. It will record every" " function executed by Python. This will slow down the code. 
It " "is suggested to be used for debugging hang or crashes only.") - logger.info(f"Trace frame log is saved to {log_file_path}") + logger.info("Trace frame log is saved to %s", log_file_path) if root_dir is None: # by default, this is the vllm root directory root_dir = os.path.dirname(os.path.dirname(__file__)) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index c249497a4d893..6a077e9b0c755 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -345,8 +345,8 @@ def activate_lora( index, _ = first_free_slot self._active_loras[lora_id] = None lora_model = self._registered_loras[lora_id] - logger.debug( - f"Activating LoRA. int id: {lora_model.id}, slot index: {index}") + logger.debug("Activating LoRA. int id: %d, slot index: %d", + lora_model.id, index) self.lora_index_to_id[index] = lora_model.id for module_name, module in self.modules.items(): module_lora = lora_model.get_lora(module_name) @@ -567,7 +567,7 @@ def __init__(self, capacity: int, deactivate_lora_fn: Callable[[int], self.deactivate_lora_fn = deactivate_lora_fn def _on_remove(self, key: int, value: LoRAModel): - logger.debug(f"Removing LoRA. int id: {key}") + logger.debug("Removing LoRA. int id: %d", key) self.deactivate_lora_fn(key) return super()._on_remove(key, value) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index ac7c30e2a9727..aed2c350bdd10 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -296,8 +296,8 @@ def get_moe_configs(E: int, N: int, os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name) if os.path.exists(config_file_path): with open(config_file_path) as f: - logger.info( - f"Using configuration from {config_file_path} for MoE layer.") + logger.info("Using configuration from %s for MoE layer.", + config_file_path) # If a configuration has been found, return it return {int(key): val for key, val in json.load(f).items()} diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 16be0ecf9ce07..7e65d54bc522f 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -334,10 +334,10 @@ def deserialize(self): per_second = convert_bytes(deserializer.total_tensor_bytes / duration) after_mem = get_mem_usage() deserializer.close() - logger.info(f"Deserialized {total_bytes_str} in " - f"{end - start:0.2f}s, {per_second}/s") - logger.info(f"Memory usage before: {before_mem}") - logger.info(f"Memory usage after: {after_mem}") + logger.info("Deserialized %s in %0.2fs, %f/s", total_bytes_str, + end - start, per_second) + logger.info("Memory usage before: %s", before_mem) + logger.info("Memory usage after: %s", after_mem) self._check_tensors_on_meta_device() self._resize_lora_embeddings() diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 9995f2afe3cf7..c0905b9051314 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -190,7 +190,7 @@ def download_weights_from_hf(model_name_or_path: str, allow_patterns = [pattern] break - logger.info(f"Using model weights format {allow_patterns}") + logger.info("Using model weights format %s", allow_patterns) # Use file lock to prevent multiple processes from # downloading the same model weights at the same time. 
with get_lock(model_name_or_path, cache_dir): @@ -310,17 +310,17 @@ def kv_cache_scales_loader( return layer_scales_map.items() except FileNotFoundError: - logger.error(f"File or directory '{filename}' not found.") + logger.error("File or directory '%s' not found.", filename) except json.JSONDecodeError: - logger.error(f"Error decoding JSON in file '{filename}'.") + logger.error("Error decoding JSON in file '%s'.", filename) except Exception as e: - logger.error(f"An error occurred while reading '{filename}': {e}") + logger.error("An error occurred while reading '%s': %s", filename, e) # This section is reached if and only if any of the excepts are hit # Return an empty iterable (list) => no KV cache scales are loaded # which ultimately defaults to 1.0 scales - logger.warning("Defaulting to KV cache scaling factors = 1.0 " - f"for all layers in TP rank {tp_rank} " - "as an error occurred during loading.") + logger.warning( + "Defaulting to KV cache scaling factors = 1.0 for all " + "layers in TP rank %d as an error occurred during loading.", tp_rank) return [] diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 6afb2f31c1334..c5cdc059473b3 100755 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -91,8 +91,8 @@ def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]: "ROCm for now.") if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS: logger.warning( - f"Model architecture {model_arch} is partially supported " - "by ROCm: " + _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch]) + "Model architecture %s is partially supported by ROCm: %s", + model_arch, _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch]) module_name, model_cls_name = _MODELS[model_arch] module = importlib.import_module( @@ -107,9 +107,9 @@ def get_supported_archs() -> List[str]: def register_model(model_arch: str, model_cls: Type[nn.Module]): if model_arch in _MODELS: logger.warning( - f"Model architecture {model_arch} is already registered, " - "and will be overwritten by the new model " - f"class {model_cls.__name__}.") + "Model architecture %s is already registered, and will be " + "overwritten by the new model class %s.", model_arch, + model_cls.__name__) global _OOT_MODELS _OOT_MODELS[model_arch] = model_cls diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 6d01537c5c344..c3193258d6418 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -55,10 +55,10 @@ def _get_gemma_act_fn( "in the config JSON file when it was initially released. " "Changing the activation function to approximate GeLU " "(`gelu_pytorch_tanh`). If you want to use the legacy " - f"`{hidden_act}`, edit the config JSON to set " - f"`hidden_activation={hidden_act}` instead of `hidden_act`. " + "`%s`, edit the config JSON to set " + "`hidden_activation=%s` instead of `hidden_act`. 
" "See https://github.com/huggingface/transformers/pull/29402 " - "for more details.") + "for more details.", hidden_act, hidden_act) return GeluAndMul(approximate="tanh") elif hidden_activation == "gelu_pytorch_tanh": return GeluAndMul(approximate="tanh") diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 2c6642f5a3c81..4e70ea9686005 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -183,7 +183,8 @@ def execute_model( "speculative decoding " "requires non-None seq_group_metadata_list") - logger.info(f"spec_decode_worker.execute_model {num_lookahead_slots=}") + logger.info("spec_decode_worker.execute_model num_lookahead_slots=%d", + num_lookahead_slots) # If no spec tokens, call the proposer and scorer workers normally. # Used for prefill. diff --git a/vllm/transformers_utils/configs/dbrx.py b/vllm/transformers_utils/configs/dbrx.py index 1d2724f22abd6..0dc9664723d34 100644 --- a/vllm/transformers_utils/configs/dbrx.py +++ b/vllm/transformers_utils/configs/dbrx.py @@ -72,9 +72,10 @@ def from_pretrained( and config_dict["model_type"] != cls.model_type ): logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) + "You are using a model of type %s to instantiate a model of " + "type %s. This is not supported for all configurations of " + "models and can yield errors.", + config_dict["model_type"], cls.model_type) return cls.from_dict(config_dict, **kwargs) @@ -151,9 +152,9 @@ def from_pretrained( and config_dict["model_type"] != cls.model_type ): logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) + "You are using a model of type %s to instantiate a model of " + "type %s. This is not supported for all " + "configurations of models and can yield errors.", config_dict["model_type"], cls.model_type) return cls.from_dict(config_dict, **kwargs) diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index afc02c434dd43..2fcddc3bea5ab 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -138,9 +138,8 @@ def get_lora_tokenizer(lora_request: LoRARequest, *args, # No tokenizer was found in the LoRA folder, # use base model tokenizer logger.warning( - f"No tokenizer found in {lora_request.lora_local_path}, " - "using base model tokenizer instead. " - f"(Exception: {str(e)})") + "No tokenizer found in %s, using base model tokenizer instead. 
" + "(Exception: %s)", lora_request.lora_local_path, e) tokenizer = None return tokenizer diff --git a/vllm/utils.py b/vllm/utils.py index 79ac1db01fc69..76c2fc66e47c3 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -289,8 +289,9 @@ def get_open_port() -> int: def update_environment_variables(envs: Dict[str, str]): for k, v in envs.items(): if k in os.environ and os.environ[k] != v: - logger.warning(f"Overwriting environment variable {k} " - f"from '{os.environ[k]}' to '{v}'") + logger.warning( + "Overwriting environment variable %s " + "from '%s' to '%s'", k, os.environ[k], v) os.environ[k] = v @@ -310,11 +311,12 @@ def get_nvcc_cuda_version() -> Optional[Version]: if not cuda_home: cuda_home = '/usr/local/cuda' if os.path.isfile(cuda_home + '/bin/nvcc'): - logger.info(f'CUDA_HOME is not found in the environment. ' - f'Using {cuda_home} as CUDA_HOME.') + logger.info( + 'CUDA_HOME is not found in the environment. ' + 'Using %s as CUDA_HOME.', cuda_home) else: - logger.warning( - f'Not found nvcc in {cuda_home}. Skip cuda version check!') + logger.warning('Not found nvcc in %s. Skip cuda version check!', + cuda_home) return None nvcc_output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"], universal_newlines=True) @@ -599,8 +601,8 @@ def find_nccl_library(): # manually load the nccl library if so_file: logger.info( - f"Found nccl from environment variable VLLM_NCCL_SO_PATH={so_file}" - ) + "Found nccl from environment variable VLLM_NCCL_SO_PATH=%s", + so_file) else: if torch.version.cuda is not None: so_file = vllm_nccl_path or find_library("libnccl.so.2") @@ -608,7 +610,7 @@ def find_nccl_library(): so_file = find_library("librccl.so.1") else: raise ValueError("NCCL only supports CUDA and ROCm backends.") - logger.info(f"Found nccl from library {so_file}") + logger.info("Found nccl from library %s", so_file) return so_file diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 33dbf8d90c35d..c6da28f110325 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -170,8 +170,8 @@ def load_model(self) -> None: ) self.model_memory_usage = m.consumed_memory - logger.info(f"Loading model weights took " - f"{self.model_memory_usage / float(2**30):.4f} GB") + logger.info("Loading model weights took %.4f GB", + self.model_memory_usage / float(2**30)) if self.lora_config: assert hasattr(self.model, "supported_lora_modules" @@ -196,18 +196,19 @@ def load_model(self) -> None: self.model.load_kv_cache_scales( self.model_config.quantization_param_path) else: - raise RuntimeError("Using FP8 KV cache and scaling " - "factors provided but model " - f"{self.model.__class__} does not " - "support loading scaling factors.") + raise RuntimeError( + "Using FP8 KV cache and scaling factors provided but " + "model %s does not support loading scaling factors.", + self.model.__class__) else: - logger.warn("Using FP8 KV cache but no scaling factors " - "provided. Defaulting to scaling factors of 1.0. " - "This may lead to less accurate results!") + logger.warning( + "Using FP8 KV cache but no scaling factors " + "provided. Defaulting to scaling factors of 1.0. " + "This may lead to less accurate results!") elif self.model_config.quantization_param_path is not None: - logger.warn("KV cache scaling factors provided, " - "but the KV cache data type is not FP8. " - "KV cache scaling factors will not be used.") + logger.warning("KV cache scaling factors provided, " + "but the KV cache data type is not FP8. 
" + "KV cache scaling factors will not be used.") def set_block_size(self, block_size: int) -> None: self.block_size = block_size @@ -1054,7 +1055,7 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: end_time = time.perf_counter() elapsed_time = end_time - start_time # This usually takes < 10 seconds. - logger.info(f"Graph capturing finished in {elapsed_time:.0f} secs.") + logger.info("Graph capturing finished in %.0f secs.", elapsed_time) def __del__(self) -> None: # Delete the CUDA graphs before deleting the pynccl communicator.
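
The rule family enabled above (`"G"` in pyproject.toml, ruff's port of flake8-logging-format) is what drives every rewrite in this patch: G004 flags f-strings passed to logging calls, because the string is interpolated eagerly even when the log level is disabled. A minimal, self-contained sketch of the before/after pattern, mirroring the setup.py hunk above — the logger name and the value of `num_jobs` below are illustrative stand-ins, not code from this patch:

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("example")
    num_jobs = 4  # illustrative value

    # Eager: the f-string is built before logging.info() runs, even if the
    # INFO level were disabled; ruff's G004 flags this form.
    logger.info(f"Using MAX_JOBS={num_jobs} as the number of jobs.")

    # Lazy: the format arguments are interpolated by the logging framework
    # only when the record is actually emitted; this is the form the patch
    # converts vLLM's call sites to.
    logger.info("Using MAX_JOBS=%d as the number of jobs.", num_jobs)

With the lazy form, the format specifier must still match the argument type (`%d` for integers, `%s` for arbitrary objects), since the mismatch is only detected at emit time rather than at call time.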