diff --git a/sdks/python/apache_beam/examples/inference/vllm_text_completion.py b/sdks/python/apache_beam/examples/inference/vllm_text_completion.py
index 26cf99d52c33..367094f13539 100644
--- a/sdks/python/apache_beam/examples/inference/vllm_text_completion.py
+++ b/sdks/python/apache_beam/examples/inference/vllm_text_completion.py
@@ -56,9 +56,7 @@
             role='user', content='What colors are in the rainbow?'),
         OpenAIChatMessage(
             role='system',
-            content=
-            'Red, orange, yellow, green, blue, indigo, and violet.'
-        ),
+            content='Red, orange, yellow, green, blue, indigo, and violet.'),
         OpenAIChatMessage(role='user', content='Do other colors ever appear?')
     ],
     [
@@ -110,7 +108,7 @@ def parse_known_args(argv):
 
 class PostProcessor(beam.DoFn):
   def process(self, element: PredictionResult) -> Iterable[str]:
-    yield element.example + ": " + element.inference
+    yield element.example + ": " + str(element.inference)
 
 
 def run(
diff --git a/sdks/python/apache_beam/ml/inference/vllm_inference.py b/sdks/python/apache_beam/ml/inference/vllm_inference.py
index 02875f5e6ffa..f581ac85b9b8 100644
--- a/sdks/python/apache_beam/ml/inference/vllm_inference.py
+++ b/sdks/python/apache_beam/ml/inference/vllm_inference.py
@@ -126,8 +126,8 @@ def start_server(self, retries=3):
 
     if retries == 0:
       raise Exception(
-          'Failed to start vLLM server, process exited with code %s. '
-          'See worker logs to determine cause.' % self._server_process.poll())
+          "Failed to start vLLM server, process exited with code %s" %
+          self._server_process.poll())
     else:
       self.start_server(retries - 1)
 
@@ -179,6 +179,7 @@ def run_inference(
       An Iterable of type PredictionResult.
     """
     client = getVLLMClient(model.get_server_port())
+    inference_args = inference_args or {}
     predictions = []
     for prompt in batch:
       completion = client.completions.create(
@@ -192,6 +193,9 @@ def share_model_across_processes(self) -> bool:
   def should_skip_batching(self) -> bool:
     # Batching does not help since vllm is already doing dynamic batching and
     # each request is sent one by one anyways
+    # TODO(https://github.com/apache/beam/issues/32528): We should add support
+    # for taking in batches and doing a bunch of async calls. That will end up
+    # being more efficient when we can do in bundle batching.
     return True
 
 
@@ -237,9 +241,10 @@ def run_inference(
       An Iterable of type PredictionResult.
     """
     client = getVLLMClient(model.get_server_port())
+    inference_args = inference_args or {}
     predictions = []
     for messages in batch:
-      completion = client.completions.create(
+      completion = client.chat.completions.create(
           model=self._model_name, messages=messages, **inference_args)
       predictions.append(completion)
     return [PredictionResult(x, y) for x, y in zip(batch, predictions)]
@@ -250,4 +255,7 @@ def share_model_across_processes(self) -> bool:
   def should_skip_batching(self) -> bool:
     # Batching does not help since vllm is already doing dynamic batching and
     # each request is sent one by one anyways
+    # TODO(https://github.com/apache/beam/issues/32528): We should add support
+    # for taking in batches and doing a bunch of async calls. That will end up
+    # being more efficient when we can do in bundle batching.
     return True
diff --git a/sdks/python/test-suites/dataflow/common.gradle b/sdks/python/test-suites/dataflow/common.gradle
index 2c832fc3075e..c9aac452d5de 100644
--- a/sdks/python/test-suites/dataflow/common.gradle
+++ b/sdks/python/test-suites/dataflow/common.gradle
@@ -448,11 +448,11 @@ def vllmTests = tasks.create("vllmTests") {
     // Exec one version with and one version without the chat option
     exec {
       executable 'sh'
-      args '-c', ". ${envdir}/bin/activate && pip install openai && python -m apache_beam.examples.inference.vllm_text_completion $cmdArgs --experiment='worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver'"
+      args '-c', ". ${envdir}/bin/activate && pip install openai && python -m apache_beam.examples.inference.vllm_text_completion $cmdArgs --experiment='worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver:5xx'"
     }
     exec {
       executable 'sh'
-      args '-c', ". ${envdir}/bin/activate && pip install openai && python -m apache_beam.examples.inference.vllm_text_completion $cmdArgs --chat --experiment='worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver'"
+      args '-c', ". ${envdir}/bin/activate && pip install openai && python -m apache_beam.examples.inference.vllm_text_completion $cmdArgs --chat --experiment='worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver:5xx'"
     }
   }
 }