
Get tests working with 5xx driver
damccorm committed Sep 21, 2024
1 parent c5bf4f9 commit 000401d
Showing 3 changed files with 15 additions and 9 deletions.
@@ -56,9 +56,7 @@
role='user', content='What colors are in the rainbow?'),
OpenAIChatMessage(
role='system',
- content=
- 'Red, orange, yellow, green, blue, indigo, and violet.'
- ),
+ content='Red, orange, yellow, green, blue, indigo, and violet.'),
OpenAIChatMessage(role='user', content='Do other colors ever appear?')
],
[
@@ -110,7 +108,7 @@ def parse_known_args(argv):

class PostProcessor(beam.DoFn):
def process(self, element: PredictionResult) -> Iterable[str]:
- yield element.example + ": " + element.inference
+ yield element.example + ": " + str(element.inference)


def run(
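
For context on the PostProcessor change above: in the example pipeline (presumably apache_beam/examples/inference/vllm_text_completion.py, given the Gradle task further down), the inference field of each PredictionResult is the completion object returned by the OpenAI client, not a plain string, so concatenating it directly raises a TypeError. A minimal standalone sketch of what the str() cast guards against (not part of this commit; the values are hypothetical):

    from apache_beam.ml.inference.base import PredictionResult

    # Hypothetical element: inference is an arbitrary object, not a str.
    element = PredictionResult('What colors are in the rainbow?', object())

    # element.example + ": " + element.inference       -> TypeError (str + object)
    # element.example + ": " + str(element.inference)  -> works for any inference type
    print(element.example + ": " + str(element.inference))
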
14 changes: 11 additions & 3 deletions sdks/python/apache_beam/ml/inference/vllm_inference.py
@@ -126,8 +126,8 @@ def start_server(self, retries=3):

if retries == 0:
raise Exception(
- 'Failed to start vLLM server, process exited with code %s. '
- 'See worker logs to determine cause.' % self._server_process.poll())
+ "Failed to start vLLM server, process exited with code %s" %
+ self._server_process.poll())
else:
self.start_server(retries - 1)

@@ -179,6 +179,7 @@ def run_inference(
An Iterable of type PredictionResult.
"""
client = getVLLMClient(model.get_server_port())
+ inference_args = inference_args or {}
predictions = []
for prompt in batch:
completion = client.completions.create(
@@ -192,6 +193,9 @@ def share_model_across_processes(self) -> bool:
def should_skip_batching(self) -> bool:
# Batching does not help since vllm is already doing dynamic batching and
# each request is sent one by one anyways
+ # TODO(https://github.com/apache/beam/issues/32528): We should add support
+ # for taking in batches and doing a bunch of async calls. That will end up
+ # being more efficient when we can do in bundle batching.
return True


@@ -237,9 +241,10 @@ def run_inference(
An Iterable of type PredictionResult.
"""
client = getVLLMClient(model.get_server_port())
+ inference_args = inference_args or {}
predictions = []
for messages in batch:
- completion = client.completions.create(
+ completion = client.chat.completions.create(
model=self._model_name, messages=messages, **inference_args)
predictions.append(completion)
return [PredictionResult(x, y) for x, y in zip(batch, predictions)]
@@ -250,4 +255,7 @@ def share_model_across_processes(self) -> bool:
def should_skip_batching(self) -> bool:
# Batching does not help since vllm is already doing dynamic batching and
# each request is sent one by one anyways
+ # TODO(https://github.com/apache/beam/issues/32528): We should add support
+ # for taking in batches and doing a bunch of async calls. That will end up
+ # being more efficient when we can do in bundle batching.
return True
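
The TODO added to both handlers describes a possible follow-up rather than anything this commit implements: accept a whole in-bundle batch and fan the prompts out as concurrent requests, letting the vLLM server batch them dynamically. A rough sketch of that idea, assuming the openai package's AsyncOpenAI client and a locally running vLLM server (the base URL, model name, and helper name are hypothetical):

    import asyncio
    from openai import AsyncOpenAI

    async def _complete_batch(base_url, model_name, prompts, **inference_args):
      # One request per prompt, issued concurrently; the vLLM server can then
      # batch them dynamically on its side.
      client = AsyncOpenAI(base_url=base_url, api_key='unused')
      requests = [
          client.completions.create(
              model=model_name, prompt=prompt, **inference_args)
          for prompt in prompts
      ]
      return await asyncio.gather(*requests)

    # Example usage against a hypothetical local server:
    # completions = asyncio.run(_complete_batch(
    #     'http://localhost:8000/v1', 'facebook/opt-125m',
    #     ['What colors are in the rainbow?', 'Do other colors ever appear?']))
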
4 changes: 2 additions & 2 deletions sdks/python/test-suites/dataflow/common.gradle
@@ -448,11 +448,11 @@ def vllmTests = tasks.create("vllmTests") {
// Exec one version with and one version without the chat option
exec {
executable 'sh'
- args '-c', ". ${envdir}/bin/activate && pip install openai && python -m apache_beam.examples.inference.vllm_text_completion $cmdArgs --experiment='worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver'"
+ args '-c', ". ${envdir}/bin/activate && pip install openai && python -m apache_beam.examples.inference.vllm_text_completion $cmdArgs --experiment='worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver:5xx'"
}
exec {
executable 'sh'
- args '-c', ". ${envdir}/bin/activate && pip install openai && python -m apache_beam.examples.inference.vllm_text_completion $cmdArgs --chat --experiment='worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver'"
+ args '-c', ". ${envdir}/bin/activate && pip install openai && python -m apache_beam.examples.inference.vllm_text_completion $cmdArgs --chat --experiment='worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver:5xx'"
}
}
}
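
The only Gradle change is the install-nvidia-driver:5xx suffix on the worker_accelerator experiment, which asks Dataflow to install a 5xx-series NVIDIA driver on the GPU workers (the driver these tests need, per the commit title). A sketch of passing the same experiment outside Gradle when building pipeline options in Python (project, region, and bucket are placeholders):

    from apache_beam.options.pipeline_options import PipelineOptions

    options = PipelineOptions([
        '--runner=DataflowRunner',
        '--project=my-project',
        '--region=us-central1',
        '--temp_location=gs://my-bucket/tmp',
        # Same accelerator experiment as the Gradle test task above.
        '--experiments=worker_accelerator=type:nvidia-tesla-t4;count:1;'
        'install-nvidia-driver:5xx',
    ])
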
