From a88081bf768fcc1c662e4f588bd01ca9ddcc6aad Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Fri, 26 Apr 2024 16:16:58 +0900 Subject: [PATCH] [CI] Disable non-lazy string operation on logging (#4326) Co-authored-by: Danny Guinther --- docs/source/conf.py | 5 +- pyproject.toml | 1 + setup.py | 7 ++- vllm/config.py | 16 ++--- vllm/core/scheduler.py | 10 +-- .../device_communicators/custom_all_reduce.py | 8 +-- .../device_communicators/pynccl.py | 15 ++--- .../device_communicators/pynccl_utils.py | 4 +- vllm/distributed/parallel_state.py | 6 +- vllm/distributed/utils.py | 4 +- vllm/engine/async_llm_engine.py | 18 +++--- vllm/engine/llm_engine.py | 61 +++++++++++-------- vllm/engine/metrics.py | 21 ++++--- vllm/entrypoints/openai/api_server.py | 4 +- vllm/entrypoints/openai/serving_chat.py | 11 ++-- vllm/executor/cpu_executor.py | 2 +- vllm/executor/gpu_executor.py | 4 +- vllm/executor/ray_gpu_executor.py | 4 +- vllm/executor/ray_utils.py | 6 +- vllm/logger.py | 2 +- vllm/lora/models.py | 6 +- .../layers/fused_moe/fused_moe.py | 4 +- .../model_executor/model_loader/tensorizer.py | 8 +-- .../model_loader/weight_utils.py | 14 ++--- vllm/model_executor/models/__init__.py | 10 +-- vllm/model_executor/models/gemma.py | 6 +- vllm/spec_decode/spec_decode_worker.py | 3 +- vllm/transformers_utils/configs/dbrx.py | 13 ++-- vllm/transformers_utils/tokenizer.py | 5 +- vllm/utils.py | 20 +++--- vllm/worker/model_runner.py | 27 ++++---- 31 files changed, 176 insertions(+), 149 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index aac8cbb63ebeb..9da5a4991734d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -98,9 +98,10 @@ def setup(app): for mock_target in autodoc_mock_imports: if mock_target in sys.modules: logger.info( - f"Potentially problematic mock target ({mock_target}) found; " + "Potentially problematic mock target (%s) found; " "autodoc_mock_imports cannot mock modules that have already " - "been loaded into sys.modules when the sphinx build starts.") + "been loaded into sys.modules when the sphinx build starts.", + mock_target) class MockedClassDocumenter(autodoc.ClassDocumenter): diff --git a/pyproject.toml b/pyproject.toml index a171d45b4e064..2e026c1ac8911 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ select = [ "SIM", # isort # "I", + "G", ] ignore = [ # star imports diff --git a/setup.py b/setup.py index 4b672e1af8494..6ba36b85ea318 100644 --- a/setup.py +++ b/setup.py @@ -63,7 +63,7 @@ def compute_num_jobs(self): num_jobs = os.environ.get("MAX_JOBS", None) if num_jobs is not None: num_jobs = int(num_jobs) - logger.info(f"Using MAX_JOBS={num_jobs} as the number of jobs.") + logger.info("Using MAX_JOBS=%d as the number of jobs.", num_jobs) else: try: # os.sched_getaffinity() isn't universally available, so fall @@ -81,8 +81,9 @@ def compute_num_jobs(self): nvcc_threads = os.getenv("NVCC_THREADS", None) if nvcc_threads is not None: nvcc_threads = int(nvcc_threads) - logger.info(f"Using NVCC_THREADS={nvcc_threads} as the number" - " of nvcc threads.") + logger.info( + "Using NVCC_THREADS=%d as the number of nvcc threads.", + nvcc_threads) else: nvcc_threads = 1 num_jobs = max(1, num_jobs // nvcc_threads) diff --git a/vllm/config.py b/vllm/config.py index 311a69f822571..887a73d9462dc 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -167,9 +167,9 @@ def _verify_quantization(self) -> None: f"supported in ROCm.") if self.quantization != "marlin": logger.warning( - f"{self.quantization} quantization is not fully " + "%s quantization is not 
fully " "optimized yet. The speed can be slower than " - "non-quantized models.") + "non-quantized models.", self.quantization) def _verify_cuda_graph(self) -> None: if self.max_context_len_to_capture is None: @@ -360,7 +360,7 @@ def verify_with_parallel_config( if cpu_memory_usage > 0.7 * total_cpu_memory: raise ValueError("Too large swap space. " + msg) elif cpu_memory_usage > 0.4 * total_cpu_memory: - logger.warning("Possibly too large swap space. " + msg) + logger.warning("Possibly too large swap space. %s", msg) @dataclass @@ -898,8 +898,8 @@ def verify_with_model_config(self, model_config: ModelConfig): "awq", "gptq" ]: # TODO support marlin and squeezellm - logger.warning(f"{model_config.quantization} quantization is not " - "tested with LoRA yet.") + logger.warning("%s quantization is not tested with LoRA yet.", + model_config.quantization) def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): if scheduler_config.max_num_batched_tokens > 65528: @@ -1008,7 +1008,7 @@ def _get_and_verify_dtype( pass else: # Casting between float16 and bfloat16 is allowed with a warning. - logger.warning(f"Casting {config_dtype} to {torch_dtype}.") + logger.warning("Casting %s to %s.", config_dtype, torch_dtype) return torch_dtype @@ -1051,8 +1051,8 @@ def _get_and_verify_max_len( logger.warning( "The model's config.json does not contain any of the following " "keys to determine the original maximum length of the model: " - f"{possible_keys}. Assuming the model's maximum length is " - f"{default_max_len}.") + "%d. Assuming the model's maximum length is %d.", possible_keys, + default_max_len) derived_max_model_len = default_max_len rope_scaling = getattr(hf_config, "rope_scaling", None) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 99f7a34d336a4..ac3bd7d228e94 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -617,8 +617,9 @@ def _schedule_prefills( if num_new_tokens > self.prompt_limit: logger.warning( - f"Input prompt ({num_new_tokens} tokens) is too long" - f" and exceeds limit of {self.prompt_limit}") + "Input prompt (%d tokens) is too long" + " and exceeds limit of %d", num_new_tokens, + self.prompt_limit) for seq in waiting_seqs: seq.status = SequenceStatus.FINISHED_IGNORED ignored_seq_groups.append(seq_group) @@ -631,8 +632,9 @@ def _schedule_prefills( break elif can_allocate == AllocStatus.NEVER: logger.warning( - f"Input prompt ({num_new_tokens} tokens) is too long" - f" and exceeds the capacity of block_manager") + "Input prompt (%d tokens) is too long" + " and exceeds the capacity of block_manager", + num_new_tokens) for seq in waiting_seqs: seq.status = SequenceStatus.FINISHED_IGNORED ignored_seq_groups.append(seq_group) diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index 9dbb427d91ff1..ec4533326e841 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -37,7 +37,7 @@ def init_custom_ar() -> None: return if world_size not in _SUPPORTED_WORLD_SIZES: - logger.warn( + logger.warning( "Custom allreduce is disabled due to an unsupported world size: " "%d. Supported world sizes: %s. 
To silence this warning, specify" " disable_custom_all_reduce=True explicitly.", world_size, @@ -47,7 +47,7 @@ def init_custom_ar() -> None: # note: num dev can be larger than world_size if we're only using # first few GPUs if num_dev < world_size: - logger.warn( + logger.warning( "Cannot test GPU P2P because not all GPUs are visible to the " "current process. This might be the case if 'CUDA_VISIBLE_DEVICES'" " is set.") @@ -62,7 +62,7 @@ def init_custom_ar() -> None: # this checks hardware and driver support for NVLink full_nvlink = _is_full_nvlink(device_ids) if world_size > 2 and not full_nvlink: - logger.warn( + logger.warning( "Custom allreduce is disabled because it's not supported on more" " than two PCIe-only GPUs. To silence this warning, specify" " disable_custom_all_reduce=True explicitly.") @@ -71,7 +71,7 @@ def init_custom_ar() -> None: # this is expensive to compute at the first time # then we cache the result if not _can_p2p(rank, world_size): - logger.warn( + logger.warning( "Custom allreduce is disabled because your platform lacks GPU P2P" " capability or P2P test failed. To silence this warning, specify" " disable_custom_all_reduce=True explicitly.") diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index e922beba44bfa..9434867e1b120 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -43,15 +43,16 @@ nccl = ctypes.CDLL(so_file) except Exception as e: logger.error( - f"Failed to load NCCL library from {so_file} ." + "Failed to load NCCL library from %s ." "It is expected if you are not running on NVIDIA/AMD GPUs." "Otherwise, the nccl library might not exist, be corrupted " - f"or it does not support the current platform {platform.platform()}." - f"One solution is to download libnccl2 version 2.18 from " - f"https://developer.download.nvidia.com/compute/cuda/repos/ " - f"and extract the libnccl.so.2 file. If you already have the " - f"library, please set the environment variable VLLM_NCCL_SO_PATH" - " to point to the correct nccl library path.") + "or it does not support the current platform %s." + "One solution is to download libnccl2 version 2.18 from " + "https://developer.download.nvidia.com/compute/cuda/repos/ " + "and extract the libnccl.so.2 file. If you already have the " + "library, please set the environment variable VLLM_NCCL_SO_PATH" + " to point to the correct nccl library path.", so_file, + platform.platform()) raise e # === export types and functions from nccl to Python === diff --git a/vllm/distributed/device_communicators/pynccl_utils.py b/vllm/distributed/device_communicators/pynccl_utils.py index a717fddb695ba..44e4f39217a41 100644 --- a/vllm/distributed/device_communicators/pynccl_utils.py +++ b/vllm/distributed/device_communicators/pynccl_utils.py @@ -14,7 +14,7 @@ except Exception as e: # in non-NVIDIA environments, we can't import the nccl module # e.g. 
when running on machines with AMD GPUs - logger.info(f"Failed to import NCCL library: {e}") + logger.info("Failed to import NCCL library: %s", e) logger.info("It is expected if you are not running on NVIDIA GPUs.") pass @@ -40,7 +40,7 @@ def set_pynccl_stream(stream: torch.cuda.Stream): def init_process_group(group: Optional[ProcessGroup] = None) -> None: assert not is_initialized() global comm - logger.info(f"vLLM is using nccl=={ncclGetVersion()}") + logger.info("vLLM is using nccl==%s", ncclGetVersion()) comm = NCCLCommunicator(group=group) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 515f2212511b7..6ca6fc5b5f9fe 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -57,8 +57,10 @@ def init_distributed_environment( local_rank: int = -1, backend: str = "nccl", ): - logger.debug(f"{world_size=} {rank=} {local_rank=} " - f"{distributed_init_method=} {backend=}") + logger.debug( + "world_size=%d rank=%d local_rank=%d " + "distributed_init_method=%s backend=%s", world_size, rank, local_rank, + distributed_init_method, backend) if not torch.distributed.is_initialized(): assert distributed_init_method is not None, ( "distributed_init_method must be provided when initializing " diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index e0a871ebe1756..9a13b94c3ada1 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -112,7 +112,7 @@ def gpu_p2p_access_check(i: int, j: int) -> bool: and (not os.path.exists(path)): # only the local master process (with local_rank == 0) can # enter this block to calculate the cache - logger.info(f"generating GPU P2P access cache for in {path}") + logger.info("generating GPU P2P access cache for in %s", path) cache = {} for _i in range(num_dev): for _j in range(num_dev): @@ -126,7 +126,7 @@ def gpu_p2p_access_check(i: int, j: int) -> bool: if is_distributed: cpu_world_group = get_cpu_world_group() dist.barrier(cpu_world_group) - logger.info(f"reading GPU P2P access cache from {path}") + logger.info("reading GPU P2P access cache from %s", path) with open(path, "r") as f: cache = json.load(f) _gpu_p2p_access_cache = cache diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 4b007d71e9cfc..518532e4a280d 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -117,7 +117,7 @@ def process_request_output(self, self._request_streams[request_id].put(request_output) if request_output.finished: if verbose: - logger.info(f"Finished request {request_id}.") + logger.info("Finished request %s.", request_id) self.abort_request(request_id) def process_exception(self, @@ -128,7 +128,7 @@ def process_exception(self, """Propagate an exception from the engine.""" self._request_streams[request_id].put(exception) if verbose: - logger.info(f"Finished request {request_id}.") + logger.info("Finished request %s.", request_id) self.abort_request(request_id) def add_request(self, request_id: str, @@ -151,7 +151,7 @@ def add_request(self, request_id: str, def abort_request(self, request_id: str, *, verbose: bool = False) -> None: """Abort a request during next background loop iteration.""" if verbose: - logger.info(f"Aborted request {request_id}.") + logger.info("Aborted request %s.", request_id) self._finished_requests.put_nowait(request_id) @@ -521,11 +521,11 @@ async def add_request( if shortened_token_ids is not None: shortened_token_ids = shortened_token_ids[:self. 
max_log_len] - logger.info(f"Received request {request_id}: " - f"prompt: {shortened_prompt!r}, " - f"sampling_params: {sampling_params}, " - f"prompt_token_ids: {shortened_token_ids}, " - f"lora_request: {lora_request}.") + logger.info( + "Received request %s: prompt: %r, " + "sampling_params: %s, prompt_token_ids: %s, " + "lora_request: %s.", request_id, shortened_prompt, + sampling_params, shortened_token_ids, lora_request) if not self.is_running: if self.start_engine_loop: @@ -717,4 +717,4 @@ async def check_health(self) -> None: raise RuntimeError("Engine is dead.") from e else: await self.engine.check_health_async() - logger.debug(f"Health check took {time.perf_counter()-t}s") + logger.debug("Health check took %fs", time.perf_counter() - t) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 7de60d738113e..d2f5379e621c6 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -96,29 +96,38 @@ def __init__( usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, ) -> None: logger.info( - f"Initializing an LLM engine (v{vllm.__version__}) with config: " - f"model={model_config.model!r}, " - f"speculative_config={speculative_config!r}, " - f"tokenizer={model_config.tokenizer!r}, " - f"skip_tokenizer_init={model_config.skip_tokenizer_init}, " - f"tokenizer_mode={model_config.tokenizer_mode}, " - f"revision={model_config.revision}, " - f"tokenizer_revision={model_config.tokenizer_revision}, " - f"trust_remote_code={model_config.trust_remote_code}, " - f"dtype={model_config.dtype}, " - f"max_seq_len={model_config.max_model_len}, " - f"download_dir={load_config.download_dir!r}, " - f"load_format={load_config.load_format}, " - f"tensor_parallel_size={parallel_config.tensor_parallel_size}, " - f"disable_custom_all_reduce=" - f"{parallel_config.disable_custom_all_reduce}, " - f"quantization={model_config.quantization}, " - f"enforce_eager={model_config.enforce_eager}, " - f"kv_cache_dtype={cache_config.cache_dtype}, " - f"quantization_param_path={model_config.quantization_param_path}, " - f"device_config={device_config.device}, " - f"decoding_config={decoding_config!r}, " - f"seed={model_config.seed})") + "Initializing an LLM engine (v%s) with config: " + "model=%r, speculative_config=%r, tokenizer=%r, " + "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, " + "tokenizer_revision=%s, trust_remote_code=%s, dtype=%s, " + "max_seq_len=%d, download_dir=%r, load_format=%s, " + "tensor_parallel_size=%d, disable_custom_all_reduce=%s" + "quantization=%s, enforce_eager=%s, kv_cache_dtype=%s, " + "quantization_param_path=%s, device_config=%s, " + "decoding_config=%r, seed=%d)", + vllm.__version__, + model_config.model, + speculative_config, + model_config.tokenizer, + model_config.skip_tokenizer_init, + model_config.tokenizer_mode, + model_config.revision, + model_config.tokenizer_revision, + model_config.trust_remote_code, + model_config.dtype, + model_config.max_model_len, + load_config.download_dir, + load_config.load_format, + parallel_config.tensor_parallel_size, + parallel_config.disable_custom_all_reduce, + model_config.quantization, + model_config.enforce_eager, + cache_config.cache_dtype, + model_config.quantization_param_path, + device_config.device, + decoding_config, + model_config.seed, + ) # TODO(woosuk): Print more configs in debug mode. 
self.model_config = model_config @@ -237,8 +246,10 @@ def _initialize_kv_caches(self) -> None: if self.cache_config.num_gpu_blocks_override is not None: num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override - logger.info(f"Overriding {num_gpu_blocks=} with " - f"{num_gpu_blocks_override=}") + logger.info( + "Overriding num_gpu_blocks=%d with " + "num_gpu_blocks_override=%d", num_gpu_blocks, + num_gpu_blocks_override) num_gpu_blocks = num_gpu_blocks_override self.cache_config.num_gpu_blocks = num_gpu_blocks diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 25e96f6c7eaf7..d3560f5fefff1 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -227,14 +227,19 @@ def log(self, stats: Stats) -> None: # Log to stdout. logger.info( - f"Avg prompt throughput: {prompt_throughput:.1f} tokens/s, " - f"Avg generation throughput: " - f"{generation_throughput:.1f} tokens/s, " - f"Running: {stats.num_running} reqs, " - f"Swapped: {stats.num_swapped} reqs, " - f"Pending: {stats.num_waiting} reqs, " - f"GPU KV cache usage: {stats.gpu_cache_usage * 100:.1f}%, " - f"CPU KV cache usage: {stats.cpu_cache_usage * 100:.1f}%") + "Avg prompt throughput: %.1f tokens/s, " + "Avg generation throughput: %.1f tokens/s, " + "Running: %d reqs, Swapped: %d reqs, " + "Pending: %d reqs, GPU KV cache usage: %.1f%, " + "CPU KV cache usage: %.1f%", + prompt_throughput, + generation_throughput, + stats.num_running, + stats.num_swapped, + stats.num_waiting, + stats.gpu_cache_usage * 100, + stats.cpu_cache_usage * 100, + ) # Reset tracked stats for next interval. self.num_prompt_tokens = [] diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 37d76b8e74055..af9ba7a3bc825 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -148,8 +148,8 @@ async def authentication(request: Request, call_next): raise ValueError(f"Invalid middleware {middleware}. " f"Must be a function or a class.") - logger.info(f"vLLM API server version {vllm.__version__}") - logger.info(f"args: {args}") + logger.info("vLLM API server version %s", vllm.__version__) + logger.info("args: %s", args) if args.served_model_name is not None: served_model_names = args.served_model_name diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 2ff335eb71073..f6011b6fc4cb6 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -57,8 +57,7 @@ async def create_chat_completion( tokenize=False, add_generation_prompt=request.add_generation_prompt) except Exception as e: - logger.error( - f"Error in applying chat template from request: {str(e)}") + logger.error("Error in applying chat template from request: %s", e) return self.create_error_response(str(e)) request_id = f"cmpl-{random_uuid()}" @@ -338,11 +337,11 @@ def _load_chat_template(self, chat_template): tokenizer.chat_template = codecs.decode( chat_template, "unicode_escape") - logger.info( - f"Using supplied chat template:\n{tokenizer.chat_template}") + logger.info("Using supplied chat template:\n%s", + tokenizer.chat_template) elif tokenizer.chat_template is not None: - logger.info( - f"Using default chat template:\n{tokenizer.chat_template}") + logger.info("Using default chat template:\n%s", + tokenizer.chat_template) else: logger.warning( "No chat template provided. 
Chat API will not work.") diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 8d6a1fff91fd8..aa810f9743395 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -69,7 +69,7 @@ def initialize_cache(self, num_gpu_blocks: int, # NOTE: `cpu block` for CPU backend is located on CPU memory but is # referred as `gpu block`. Because we want to reuse the existing block # management procedure. - logger.info(f"# CPU blocks: {num_gpu_blocks}") + logger.info("# CPU blocks: %d", num_gpu_blocks) self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) def execute_model(self, diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index d413a7d27ff37..d2c60a3b68e14 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -116,8 +116,8 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: # NOTE: This is logged in the executor because there can be >1 worker # with other executors. We could log in the engine level, but work # remains to abstract away the device for non-GPU configurations. - logger.info(f"# GPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}") + logger.info("# GPU blocks: %d, # CPU blocks: %d", num_gpu_blocks, + num_cpu_blocks) self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 14b3f803782c6..6f72babe14fd5 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -214,8 +214,8 @@ def initialize_cache(self, num_gpu_blocks: int, # NOTE: We log here to avoid multiple logs when number of workers is # greater than one. We could log in the engine, but not all executors # have GPUs. - logger.info(f"# GPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}") + logger.info("# GPU blocks: %d, # CPU blocks: %d", num_gpu_blocks, + num_cpu_blocks) self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index febae42b84549..9db3ae2ff8298 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -43,9 +43,9 @@ def execute_model_compiled_dag_remote(self, ignored): return output except ImportError as e: - logger.warning(f"Failed to import Ray with {e!r}. " - "For distributed inference, please install Ray with " - "`pip install ray`.") + logger.warning( + "Failed to import Ray with %r. For distributed inference, " + "please install Ray with `pip install ray`.", e) ray = None # type: ignore RayWorkerWrapper = None # type: ignore diff --git a/vllm/logger.py b/vllm/logger.py index 341fc473585d7..3928e5367d1e6 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -126,7 +126,7 @@ def enable_trace_function_call(log_file_path: str, "VLLM_TRACE_FUNCTION is enabled. It will record every" " function executed by Python. This will slow down the code. 
It " "is suggested to be used for debugging hang or crashes only.") - logger.info(f"Trace frame log is saved to {log_file_path}") + logger.info("Trace frame log is saved to %s", log_file_path) if root_dir is None: # by default, this is the vllm root directory root_dir = os.path.dirname(os.path.dirname(__file__)) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index c249497a4d893..6a077e9b0c755 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -345,8 +345,8 @@ def activate_lora( index, _ = first_free_slot self._active_loras[lora_id] = None lora_model = self._registered_loras[lora_id] - logger.debug( - f"Activating LoRA. int id: {lora_model.id}, slot index: {index}") + logger.debug("Activating LoRA. int id: %d, slot index: %d", + lora_model.id, index) self.lora_index_to_id[index] = lora_model.id for module_name, module in self.modules.items(): module_lora = lora_model.get_lora(module_name) @@ -567,7 +567,7 @@ def __init__(self, capacity: int, deactivate_lora_fn: Callable[[int], self.deactivate_lora_fn = deactivate_lora_fn def _on_remove(self, key: int, value: LoRAModel): - logger.debug(f"Removing LoRA. int id: {key}") + logger.debug("Removing LoRA. int id: %d", key) self.deactivate_lora_fn(key) return super()._on_remove(key, value) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index ac7c30e2a9727..aed2c350bdd10 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -296,8 +296,8 @@ def get_moe_configs(E: int, N: int, os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name) if os.path.exists(config_file_path): with open(config_file_path) as f: - logger.info( - f"Using configuration from {config_file_path} for MoE layer.") + logger.info("Using configuration from %s for MoE layer.", + config_file_path) # If a configuration has been found, return it return {int(key): val for key, val in json.load(f).items()} diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 16be0ecf9ce07..7e65d54bc522f 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -334,10 +334,10 @@ def deserialize(self): per_second = convert_bytes(deserializer.total_tensor_bytes / duration) after_mem = get_mem_usage() deserializer.close() - logger.info(f"Deserialized {total_bytes_str} in " - f"{end - start:0.2f}s, {per_second}/s") - logger.info(f"Memory usage before: {before_mem}") - logger.info(f"Memory usage after: {after_mem}") + logger.info("Deserialized %s in %0.2fs, %f/s", total_bytes_str, + end - start, per_second) + logger.info("Memory usage before: %s", before_mem) + logger.info("Memory usage after: %s", after_mem) self._check_tensors_on_meta_device() self._resize_lora_embeddings() diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 9995f2afe3cf7..c0905b9051314 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -190,7 +190,7 @@ def download_weights_from_hf(model_name_or_path: str, allow_patterns = [pattern] break - logger.info(f"Using model weights format {allow_patterns}") + logger.info("Using model weights format %s", allow_patterns) # Use file lock to prevent multiple processes from # downloading the same model weights at the same time. 
with get_lock(model_name_or_path, cache_dir): @@ -310,17 +310,17 @@ def kv_cache_scales_loader( return layer_scales_map.items() except FileNotFoundError: - logger.error(f"File or directory '{filename}' not found.") + logger.error("File or directory '%s' not found.", filename) except json.JSONDecodeError: - logger.error(f"Error decoding JSON in file '{filename}'.") + logger.error("Error decoding JSON in file '%s'.", filename) except Exception as e: - logger.error(f"An error occurred while reading '{filename}': {e}") + logger.error("An error occurred while reading '%s': %s", filename, e) # This section is reached if and only if any of the excepts are hit # Return an empty iterable (list) => no KV cache scales are loaded # which ultimately defaults to 1.0 scales - logger.warning("Defaulting to KV cache scaling factors = 1.0 " - f"for all layers in TP rank {tp_rank} " - "as an error occurred during loading.") + logger.warning( + "Defaulting to KV cache scaling factors = 1.0 for all " + "layers in TP rank %d as an error occurred during loading.", tp_rank) return [] diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 6afb2f31c1334..c5cdc059473b3 100755 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -91,8 +91,8 @@ def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]: "ROCm for now.") if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS: logger.warning( - f"Model architecture {model_arch} is partially supported " - "by ROCm: " + _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch]) + "Model architecture %s is partially supported by ROCm: %s", + model_arch, _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch]) module_name, model_cls_name = _MODELS[model_arch] module = importlib.import_module( @@ -107,9 +107,9 @@ def get_supported_archs() -> List[str]: def register_model(model_arch: str, model_cls: Type[nn.Module]): if model_arch in _MODELS: logger.warning( - f"Model architecture {model_arch} is already registered, " - "and will be overwritten by the new model " - f"class {model_cls.__name__}.") + "Model architecture %s is already registered, and will be " + "overwritten by the new model class %s.", model_arch, + model_cls.__name__) global _OOT_MODELS _OOT_MODELS[model_arch] = model_cls diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 6d01537c5c344..c3193258d6418 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -55,10 +55,10 @@ def _get_gemma_act_fn( "in the config JSON file when it was initially released. " "Changing the activation function to approximate GeLU " "(`gelu_pytorch_tanh`). If you want to use the legacy " - f"`{hidden_act}`, edit the config JSON to set " - f"`hidden_activation={hidden_act}` instead of `hidden_act`. " + "`%s`, edit the config JSON to set " + "`hidden_activation=%s` instead of `hidden_act`. 
" "See https://github.com/huggingface/transformers/pull/29402 " - "for more details.") + "for more details.", hidden_act, hidden_act) return GeluAndMul(approximate="tanh") elif hidden_activation == "gelu_pytorch_tanh": return GeluAndMul(approximate="tanh") diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 2c6642f5a3c81..4e70ea9686005 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -183,7 +183,8 @@ def execute_model( "speculative decoding " "requires non-None seq_group_metadata_list") - logger.info(f"spec_decode_worker.execute_model {num_lookahead_slots=}") + logger.info("spec_decode_worker.execute_model num_lookahead_slots=%d", + num_lookahead_slots) # If no spec tokens, call the proposer and scorer workers normally. # Used for prefill. diff --git a/vllm/transformers_utils/configs/dbrx.py b/vllm/transformers_utils/configs/dbrx.py index 1d2724f22abd6..0dc9664723d34 100644 --- a/vllm/transformers_utils/configs/dbrx.py +++ b/vllm/transformers_utils/configs/dbrx.py @@ -72,9 +72,10 @@ def from_pretrained( and config_dict["model_type"] != cls.model_type ): logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) + "You are using a model of type %s to instantiate a model of " + "type %s. This is not supported for all configurations of " + "models and can yield errors.", + config_dict["model_type"], cls.model_type) return cls.from_dict(config_dict, **kwargs) @@ -151,9 +152,9 @@ def from_pretrained( and config_dict["model_type"] != cls.model_type ): logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) + "You are using a model of type %s to instantiate a model of " + "type %s. This is not supported for all " + "configurations of models and can yield errors.", config_dict["model_type"], cls.model_type) return cls.from_dict(config_dict, **kwargs) diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index afc02c434dd43..2fcddc3bea5ab 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -138,9 +138,8 @@ def get_lora_tokenizer(lora_request: LoRARequest, *args, # No tokenizer was found in the LoRA folder, # use base model tokenizer logger.warning( - f"No tokenizer found in {lora_request.lora_local_path}, " - "using base model tokenizer instead. " - f"(Exception: {str(e)})") + "No tokenizer found in %s, using base model tokenizer instead. 
" + "(Exception: %s)", lora_request.lora_local_path, e) tokenizer = None return tokenizer diff --git a/vllm/utils.py b/vllm/utils.py index 79ac1db01fc69..76c2fc66e47c3 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -289,8 +289,9 @@ def get_open_port() -> int: def update_environment_variables(envs: Dict[str, str]): for k, v in envs.items(): if k in os.environ and os.environ[k] != v: - logger.warning(f"Overwriting environment variable {k} " - f"from '{os.environ[k]}' to '{v}'") + logger.warning( + "Overwriting environment variable %s " + "from '%s' to '%s'", k, os.environ[k], v) os.environ[k] = v @@ -310,11 +311,12 @@ def get_nvcc_cuda_version() -> Optional[Version]: if not cuda_home: cuda_home = '/usr/local/cuda' if os.path.isfile(cuda_home + '/bin/nvcc'): - logger.info(f'CUDA_HOME is not found in the environment. ' - f'Using {cuda_home} as CUDA_HOME.') + logger.info( + 'CUDA_HOME is not found in the environment. ' + 'Using %s as CUDA_HOME.', cuda_home) else: - logger.warning( - f'Not found nvcc in {cuda_home}. Skip cuda version check!') + logger.warning('Not found nvcc in %s. Skip cuda version check!', + cuda_home) return None nvcc_output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"], universal_newlines=True) @@ -599,8 +601,8 @@ def find_nccl_library(): # manually load the nccl library if so_file: logger.info( - f"Found nccl from environment variable VLLM_NCCL_SO_PATH={so_file}" - ) + "Found nccl from environment variable VLLM_NCCL_SO_PATH=%s", + so_file) else: if torch.version.cuda is not None: so_file = vllm_nccl_path or find_library("libnccl.so.2") @@ -608,7 +610,7 @@ def find_nccl_library(): so_file = find_library("librccl.so.1") else: raise ValueError("NCCL only supports CUDA and ROCm backends.") - logger.info(f"Found nccl from library {so_file}") + logger.info("Found nccl from library %s", so_file) return so_file diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 33dbf8d90c35d..c6da28f110325 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -170,8 +170,8 @@ def load_model(self) -> None: ) self.model_memory_usage = m.consumed_memory - logger.info(f"Loading model weights took " - f"{self.model_memory_usage / float(2**30):.4f} GB") + logger.info("Loading model weights took %.4f GB", + self.model_memory_usage / float(2**30)) if self.lora_config: assert hasattr(self.model, "supported_lora_modules" @@ -196,18 +196,19 @@ def load_model(self) -> None: self.model.load_kv_cache_scales( self.model_config.quantization_param_path) else: - raise RuntimeError("Using FP8 KV cache and scaling " - "factors provided but model " - f"{self.model.__class__} does not " - "support loading scaling factors.") + raise RuntimeError( + "Using FP8 KV cache and scaling factors provided but " + "model %s does not support loading scaling factors.", + self.model.__class__) else: - logger.warn("Using FP8 KV cache but no scaling factors " - "provided. Defaulting to scaling factors of 1.0. " - "This may lead to less accurate results!") + logger.warning( + "Using FP8 KV cache but no scaling factors " + "provided. Defaulting to scaling factors of 1.0. " + "This may lead to less accurate results!") elif self.model_config.quantization_param_path is not None: - logger.warn("KV cache scaling factors provided, " - "but the KV cache data type is not FP8. " - "KV cache scaling factors will not be used.") + logger.warning("KV cache scaling factors provided, " + "but the KV cache data type is not FP8. 
" + "KV cache scaling factors will not be used.") def set_block_size(self, block_size: int) -> None: self.block_size = block_size @@ -1054,7 +1055,7 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: end_time = time.perf_counter() elapsed_time = end_time - start_time # This usually takes < 10 seconds. - logger.info(f"Graph capturing finished in {elapsed_time:.0f} secs.") + logger.info("Graph capturing finished in %.0f secs.", elapsed_time) def __del__(self) -> None: # Delete the CUDA graphs before deleting the pynccl communicator.
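
The rule family enabled above (`"G"` in pyproject.toml, ruff's port of flake8-logging-format) is what drives every rewrite in this patch: G004 flags f-strings passed to logging calls, because the string is interpolated eagerly even when the log level is disabled. A minimal, self-contained sketch of the before/after pattern, mirroring the setup.py hunk above — the logger name and the value of `num_jobs` below are illustrative stand-ins, not code from this patch:

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("example")
    num_jobs = 4  # illustrative value

    # Eager: the f-string is built before logging.info() runs, even if the
    # INFO level were disabled; ruff's G004 flags this form.
    logger.info(f"Using MAX_JOBS={num_jobs} as the number of jobs.")

    # Lazy: the format arguments are interpolated by the logging framework
    # only when the record is actually emitted; this is the form the patch
    # converts vLLM's call sites to.
    logger.info("Using MAX_JOBS=%d as the number of jobs.", num_jobs)

With the lazy form, the format specifier must still match the argument type (`%d` for integers, `%s` for arbitrary objects), since the mismatch is only detected at emit time rather than at call time.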