Skip to content

Commit

Permalink
Name compute resources with the amount of memory and number of cores (#…
Browse files Browse the repository at this point in the history
…265)

Instead of naming both the queue and the compute resource (CR) after the instance type,
name the CR after the instance type's memory size and core count to give cluster
users more information about the compute nodes.
Many users cannot tell how much memory and how many cores are available
from the instance type name alone.

Resolves #264
  • Loading branch information
cartalla authored Oct 18, 2024
1 parent 5da1512 commit 694e464
Showing 1 changed file with 20 additions and 3 deletions.
23 changes: 20 additions & 3 deletions source/cdk/cdk_slurm_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -2497,12 +2497,20 @@ def create_parallel_cluster_config(self):
number_of_queues = 0
number_of_compute_resources = 0

disable_smt = self.config['slurm']['ParallelClusterConfig']['DisableSimultaneousMultithreading']

# Create 1 queue and compute resource for each instance type and purchase option.
# The queue is named after the instance type.
# The CR is named after the amount of memory and number of cores.
for purchase_option in purchase_options:
for instance_type in self.instance_types:
logger.debug(f"Creating queue for {purchase_option} {instance_type}")
efa_supported = self.plugin.get_EfaSupported(self.cluster_region, instance_type) and self.config['slurm']['ParallelClusterConfig']['EnableEfa']
mem_gb = int(self.plugin.get_MemoryInMiB(self.cluster_region, instance_type) / 1024)
core_count = int(self.plugin.get_CoreCount(self.cluster_region, instance_type))
threads_per_core = int(self.plugin.get_DefaultThreadsPerCore(self.cluster_region, instance_type))
if not disable_smt:
core_count *= threads_per_core
if purchase_option == 'ONDEMAND':
queue_name_prefix = "od"
allocation_strategy = 'lowest-price'
Expand All @@ -2524,6 +2532,7 @@ def create_parallel_cluster_config(self):
if number_of_queues >= MAX_NUMBER_OF_QUEUES:
logger.error(f"Can't create {queue_name} queue because MAX_NUMBER_OF_QUEUES=={MAX_NUMBER_OF_QUEUES} and have {number_of_queues} queues.")
exit(1)
# ParallelCluster creates a NodeSet for each queue that contains all NodeNames in the queue.
nodeset = f"{queue_name}_nodes"
if purchase_option_partition not in partition_nodesets:
partition_nodesets[purchase_option_partition] = []
Expand All @@ -2532,12 +2541,20 @@ def create_parallel_cluster_config(self):
if mem_partition not in partition_nodesets:
partition_nodesets[mem_partition] = []
partition_nodesets[mem_partition].append(nodeset)
mem_core_partition = f"{queue_name_prefix}-{mem_gb}-gb-{core_count}-cores"
if mem_core_partition not in partition_nodesets:
partition_nodesets[mem_core_partition] = []
partition_nodesets[mem_core_partition].append(nodeset)
parallel_cluster_queue = self.create_queue_config(queue_name, allocation_strategy, purchase_option)
number_of_queues += 1

compute_resource_name = f"{queue_name_prefix}-{instance_type}".replace('.', '-')
compute_resource_name = compute_resource_name.replace('large', 'l')
compute_resource_name = compute_resource_name.replace('medium', 'm')
if True:
# The CR name must begin with an alphabetic character; that is the only reason the queue_name_prefix is needed here.
compute_resource_name = f"{queue_name_prefix}-{mem_gb}-gb-{core_count}-cores"
else:
compute_resource_name = f"{queue_name_prefix}-{instance_type}".replace('.', '-')
compute_resource_name = compute_resource_name.replace('large', 'l')
compute_resource_name = compute_resource_name.replace('medium', 'm')
if number_of_compute_resources >= MAX_NUMBER_OF_COMPUTE_RESOURCES:
logger.error(f"Can't create {compute_resource_name} compute resource because MAX_NUMBER_OF_COMPUTE_RESOURCES=={MAX_NUMBER_OF_COMPUTE_RESOURCES} and have {number_of_compute_resources} compute resources")
exit(1)
Expand Down

0 comments on commit 694e464

Please sign in to comment.