diff --git a/sdks/python/apache_beam/examples/inference/vllm_text_completion.py b/sdks/python/apache_beam/examples/inference/vllm_text_completion.py
index 26cf99d52c33..367094f13539 100644
--- a/sdks/python/apache_beam/examples/inference/vllm_text_completion.py
+++ b/sdks/python/apache_beam/examples/inference/vllm_text_completion.py
@@ -56,9 +56,7 @@
             role='user', content='What colors are in the rainbow?'),
         OpenAIChatMessage(
             role='system',
-            content=
-            'Red, orange, yellow, green, blue, indigo, and violet.'
-        ),
+            content='Red, orange, yellow, green, blue, indigo, and violet.'),
         OpenAIChatMessage(role='user', content='Do other colors ever appear?')
     ],
     [
@@ -110,7 +108,7 @@ def parse_known_args(argv):
 
 class PostProcessor(beam.DoFn):
   def process(self, element: PredictionResult) -> Iterable[str]:
-    yield element.example + ": " + element.inference
+    yield element.example + ": " + str(element.inference)
 
 
 def run(
diff --git a/sdks/python/apache_beam/ml/inference/vllm_inference.py b/sdks/python/apache_beam/ml/inference/vllm_inference.py
index 02875f5e6ffa..f581ac85b9b8 100644
--- a/sdks/python/apache_beam/ml/inference/vllm_inference.py
+++ b/sdks/python/apache_beam/ml/inference/vllm_inference.py
@@ -126,8 +126,8 @@ def start_server(self, retries=3):
 
     if retries == 0:
       raise Exception(
-          'Failed to start vLLM server, process exited with code %s. '
-          'See worker logs to determine cause.' % self._server_process.poll())
+          "Failed to start vLLM server, process exited with code %s" %
+          self._server_process.poll())
     else:
       self.start_server(retries - 1)
 
@@ -179,6 +179,7 @@ def run_inference(
       An Iterable of type PredictionResult.
     """
     client = getVLLMClient(model.get_server_port())
+    inference_args = inference_args or {}
     predictions = []
     for prompt in batch:
       completion = client.completions.create(
@@ -192,6 +193,9 @@ def share_model_across_processes(self) -> bool:
   def should_skip_batching(self) -> bool:
     # Batching does not help since vllm is already doing dynamic batching and
     # each request is sent one by one anyways
+    # TODO(https://github.com/apache/beam/issues/32528): We should add support
+    # for taking in batches and doing a bunch of async calls. That will end up
+    # being more efficient when we can do in bundle batching.
     return True
 
 
@@ -237,9 +241,10 @@ def run_inference(
       An Iterable of type PredictionResult.
     """
     client = getVLLMClient(model.get_server_port())
+    inference_args = inference_args or {}
     predictions = []
     for messages in batch:
-      completion = client.completions.create(
+      completion = client.chat.completions.create(
           model=self._model_name, messages=messages, **inference_args)
       predictions.append(completion)
     return [PredictionResult(x, y) for x, y in zip(batch, predictions)]
@@ -250,4 +255,7 @@ def share_model_across_processes(self) -> bool:
   def should_skip_batching(self) -> bool:
     # Batching does not help since vllm is already doing dynamic batching and
     # each request is sent one by one anyways
+    # TODO(https://github.com/apache/beam/issues/32528): We should add support
+    # for taking in batches and doing a bunch of async calls. That will end up
+    # being more efficient when we can do in bundle batching.
     return True
diff --git a/sdks/python/test-suites/dataflow/common.gradle b/sdks/python/test-suites/dataflow/common.gradle
index 2c832fc3075e..c9aac452d5de 100644
--- a/sdks/python/test-suites/dataflow/common.gradle
+++ b/sdks/python/test-suites/dataflow/common.gradle
@@ -448,11 +448,11 @@ def vllmTests = tasks.create("vllmTests") {
     // Exec one version with and one version without the chat option
     exec {
       executable 'sh'
-      args '-c', ". ${envdir}/bin/activate && pip install openai && python -m apache_beam.examples.inference.vllm_text_completion $cmdArgs --experiment='worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver'"
+      args '-c', ". ${envdir}/bin/activate && pip install openai && python -m apache_beam.examples.inference.vllm_text_completion $cmdArgs --experiment='worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver:5xx'"
     }
     exec {
       executable 'sh'
-      args '-c', ". ${envdir}/bin/activate && pip install openai && python -m apache_beam.examples.inference.vllm_text_completion $cmdArgs --chat --experiment='worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver'"
+      args '-c', ". ${envdir}/bin/activate && pip install openai && python -m apache_beam.examples.inference.vllm_text_completion $cmdArgs --chat --experiment='worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver:5xx'"
     }
   }
 }