Skip to content

Commit

Permalink
Rename device to accelerator for Runtime class
Browse files (browse the repository at this point in the history)
Signed-off-by: Andrey Velichkevich <[email protected]>
  • Loading branch information
andreyvelich committed Dec 11, 2024
1 parent 7c4cb22 commit 52f73da
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 29 deletions.
22 changes: 13 additions & 9 deletions sdk_v2/kubeflow/training/api/training_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,12 +123,16 @@ def list_runtimes(self) -> List[types.Runtime]:
ml_policy.torch.num_proc_per_node if ml_policy.torch else None
)

# Get the device count per Trainer node.
# TODO (andreyvelich): Currently, we get the device type from
# Get the total accelerator count across all Trainer nodes.
# TODO (andreyvelich): Currently, we get the accelerator type from
# the runtime labels.
_, device_count = utils.get_container_devices(resources, num_procs)
if device_count != constants.UNKNOWN:
device_count = str(int(device_count) * int(ml_policy.num_nodes))
_, accelerator_count = utils.get_container_devices(
resources, num_procs
)
if accelerator_count != constants.UNKNOWN:
accelerator_count = str(
int(accelerator_count) * int(ml_policy.num_nodes)
)

result.append(
types.Runtime(
Expand All @@ -138,12 +142,12 @@ def list_runtimes(self) -> List[types.Runtime]:
if constants.PHASE_KEY in metadata.labels
else constants.UNKNOWN
),
device=(
metadata.labels[constants.DEVICE_KEY]
if constants.DEVICE_KEY in metadata.labels
accelerator=(
metadata.labels[constants.ACCELERATOR_KEY]
if constants.ACCELERATOR_KEY in metadata.labels
else constants.UNKNOWN
),
device_count=device_count,
accelerator_count=accelerator_count,
)
)

Expand Down
24 changes: 12 additions & 12 deletions sdk_v2/kubeflow/training/constants/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,29 +41,29 @@
# The value indicates that runtime can be used for the model post-training.
PHASE_POST_TRAINING = "post-training"

# The label key to identify the device (e.g. GPU, TPU) that is used for training.
# TODO: Potentially, we should get this data from the Node selectors.
DEVICE_KEY = "training.kubeflow.org/device"
# The label key to identify the accelerator type for model training (e.g. GPU-Tesla-V100-16GB).
# TODO: Potentially, we should take this from the Node selectors.
ACCELERATOR_KEY = "training.kubeflow.org/accelerator"

# Unknown indicates that the value can't be identified.
UNKNOWN = "Unknown"

# The label for CPU device in the container resources.
CPU_DEVICE_LABEL = "cpu"
# The label for CPU in the container resources.
CPU_LABEL = "cpu"

# The default type for CPU.
# The default type for CPU device.
CPU_DEVICE_TYPE = "cpu"

# The label for NVIDIA GPU device in the container resources.
NVIDIA_GPU_DEVICE_LABEL = "nvidia.com/gpu"
# The label for NVIDIA GPU in the container resources.
NVIDIA_GPU_LABEL = "nvidia.com/gpu"

# The default type for GPU
# The default type for GPU device.
GPU_DEVICE_TYPE = "gpu"

# The label for TPU device in the container resources.
TPU_DEVICE_LABEL = "google.com/tpu"
# The label for TPU in the container resources.
TPU_LABEL = "google.com/tpu"

# The default type for TPU.
# The default type for TPU device.
TPU_DEVICE_TYPE = "tpu"

# The Kind name for the TrainJob.
Expand Down
4 changes: 2 additions & 2 deletions sdk_v2/kubeflow/training/types/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@
class Runtime:
name: str
phase: str
device: str
device_count: str
accelerator: str
accelerator_count: str


# Representation for the TrainJob component.
Expand Down
12 changes: 6 additions & 6 deletions sdk_v2/kubeflow/training/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,15 +69,15 @@ def get_container_devices(
return device, device_count

# TODO (andreyvelich): Support other resource labels (e.g. NPUs).
if constants.NVIDIA_GPU_DEVICE_LABEL in resources.limits:
if constants.NVIDIA_GPU_LABEL in resources.limits:
device = constants.GPU_DEVICE_TYPE
device_count = resources.limits[constants.NVIDIA_GPU_DEVICE_LABEL]
elif constants.TPU_DEVICE_LABEL in resources.limits:
device_count = resources.limits[constants.NVIDIA_GPU_LABEL]
elif constants.TPU_LABEL in resources.limits:
device = constants.TPU_DEVICE_TYPE
device_count = resources.limits[constants.TPU_DEVICE_LABEL]
elif constants.CPU_DEVICE_LABEL in resources.limits:
device_count = resources.limits[constants.TPU_LABEL]
elif constants.CPU_LABEL in resources.limits:
device = constants.CPU_DEVICE_TYPE
device_count = resources.limits[constants.CPU_DEVICE_LABEL]
device_count = resources.limits[constants.CPU_LABEL]
else:
raise Exception(
f"Unknown device type in the container resources: {resources.limits}"
Expand Down

0 comments on commit 52f73da

Please sign in to comment.