diff --git a/deployer/__main__.py b/deployer/__main__.py
index 215a348712..6880ee265d 100644
--- a/deployer/__main__.py
+++ b/deployer/__main__.py
@@ -14,7 +14,10 @@
 import deployer.commands.grafana.tokens  # noqa: F401
 import deployer.commands.validate.config  # noqa: F401
 import deployer.keys.decrypt_age  # noqa: F401
-from deployer.cli_app import app
+import deployer.resource_allocation.generate_choices  # noqa: F401
+import deployer.resource_allocation.update_nodeinfo  # noqa: F401
+
+from .cli_app import app
 
 
 def main():
diff --git a/deployer/resource_allocation/__init__.py b/deployer/resource_allocation/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/deployer/resource_allocation/generate_choices.py b/deployer/resource_allocation/generate_choices.py
new file mode 100644
index 0000000000..9b5043e29e
--- /dev/null
+++ b/deployer/resource_allocation/generate_choices.py
@@ -0,0 +1,130 @@
+import json
+import sys
+from enum import Enum
+from pathlib import Path
+
+import typer
+from ruamel.yaml import YAML
+
+from ..cli_app import app
+
+yaml = YAML(typ="rt")
+
+HERE = Path(__file__).parent
+
+
+class ResourceAllocationStrategies(str, Enum):
+    PROPORTIONAL_MEMORY_STRATEGY = "proportional-memory-strategy"
+
+
+def proportional_memory_strategy(
+    instance_type: str, nodeinfo: dict, num_allocations: int
+):
+    """
+    Generate choices for resource allocation based on proportional changes to memory
+
+    Used primarily in research cases where:
+    1. Workloads are more memory constrained than CPU constrained
+    2. End users can be expected to select the appropriate amount of memory they need for a given
+       workload, either from their own intrinsic knowledge or as instructed by an instructor.
+
+    It features:
+    1. No memory overcommit at all, as end users are expected to ask for as much memory as
+       they need.
+    2. CPU *guarantees* are proportional to the amount of memory guaranteed - the more memory you
+       ask for, the more CPU you are guaranteed. This allows end users to pick resources purely
+       based on memory, simplifying the mental model. It also allows for maximum packing of
+       user pods onto a node, as we will *not* run out of CPU on a node before running out of
+       memory.
+    3. No CPU limits at all, as CPU is a more flexible resource. The CPU guarantee will ensure
+       that users will not be starved of CPU.
+    4. Each choice the user can make has approximately half as many resources as the next largest
+       choice, with the largest being a full node. This offers a decent compromise - if you pick
+       the largest option, you will most likely have to wait for a full node spawn, while smaller
+       options are much more likely to be shared.
+    """
+
+    # We operate on *available* memory, which already accounts for system components (like kubelet & systemd)
+    # as well as daemonsets we run on every node. This represents the resources that are available
+    # for user pods.
+
+    # FIXME: Add some more wiggle room here
+    available_node_mem = nodeinfo["available"]["memory"]
+    available_node_cpu = nodeinfo["available"]["cpu"]
+
+    # We always start from the top, and provide a choice that takes up the whole node.
+    mem_limit = available_node_mem
+
+    choices = {}
+    for i in range(num_allocations):
+        # CPU guarantee is proportional to the memory limit for this particular choice.
+        # This makes sure we utilize all the memory on a node all the time.
+        cpu_guarantee = (mem_limit / available_node_mem) * available_node_cpu
+
+        # Memory is in bytes, let's convert it to GB to display
+        mem_display = f"{mem_limit / 1024 / 1024 / 1024:.1f}"
+        display_name = f"{mem_display} GB RAM, up to {available_node_cpu} CPUs"
+
+        choice = {
+            "display_name": display_name,
+            "kubespawner_override": {
+                # Guarantee and Limit are the same - this strategy has no oversubscription
+                "mem_guarantee": int(mem_limit),
+                "mem_limit": int(mem_limit),
+                "cpu_guarantee": cpu_guarantee,
+                # CPU limit is set to the entire available CPU of the node, making sure no single
+                # user can starve the node of critical kubelet / systemd resources.
+                # Leaving it unset sets it to the same as the guarantee, which we do not want.
+                "cpu_limit": available_node_cpu,
+                # Explicitly set node_selector here, so the output can be easily combined
+                # multiple times, with multiple instance types
+                "node_selector": {"node.kubernetes.io/instance-type": instance_type},
+            },
+        }
+
+        # Use the amount of RAM made available as a slug, to allow combining choices from
+        # multiple instance types in the same profile. This does mean you cannot have
+        # the same RAM allocation from multiple node selectors. But that's a feature, not a bug.
+        choices[f"mem_{mem_display.replace('.', '_')}"] = choice
+
+        # Halve the mem_limit for the next choice
+        mem_limit = mem_limit / 2
+
+    # Reverse the choices so the smallest one is first
+    choices = dict(reversed(choices.items()))
+
+    # Make the smallest choice the default explicitly
+    choices[list(choices.keys())[0]]["default"] = True
+
+    return choices
+
+
+@app.command()
+def generate_resource_allocation_choices(
+    instance_type: str = typer.Argument(
+        ..., help="Instance type to generate Resource Allocation options for"
+    ),
+    num_allocations: int = typer.Option(5, help="Number of choices to generate"),
+    strategy: ResourceAllocationStrategies = typer.Option(
+        ResourceAllocationStrategies.PROPORTIONAL_MEMORY_STRATEGY,
+        help="Strategy to use for generating resource allocation choices",
+    ),
+):
+    with open(HERE / "node-capacity-info.json") as f:
+        nodeinfo = json.load(f)
+
+    if instance_type not in nodeinfo:
+        print(
+            f"Capacity information about {instance_type} not available", file=sys.stderr
+        )
+        print("TODO: Provide information on how to update it", file=sys.stderr)
+        sys.exit(1)
+
+    # Call the appropriate function based on what strategy we want to use
+    if strategy == ResourceAllocationStrategies.PROPORTIONAL_MEMORY_STRATEGY:
+        choices = proportional_memory_strategy(
+            instance_type, nodeinfo[instance_type], num_allocations
+        )
+    else:
+        raise ValueError(f"Strategy {strategy} is not currently supported")
+    yaml.dump(choices, sys.stdout)
diff --git a/deployer/resource_allocation/node-capacity-info.json b/deployer/resource_allocation/node-capacity-info.json
new file mode 100644
index 0000000000..758a7d8d48
--- /dev/null
+++ b/deployer/resource_allocation/node-capacity-info.json
@@ -0,0 +1,102 @@
+{
+    "r5.xlarge": {
+        "capacity": {
+            "cpu": 4.0,
+            "memory": 33186611200
+        },
+        "allocatable": {
+            "cpu": 3.92,
+            "memory": 32145375232
+        },
+        "measured_overhead": {
+            "cpu": 0.17,
+            "memory": 262144000
+        },
+        "available": {
+            "cpu": 3.75,
+            "memory": 31883231232
+        }
+    },
+    "r5.16xlarge": {
+        "capacity": {
+            "cpu": 64.0,
+            "memory": 535146246144
+        },
+        "available": {
+            "cpu": 63.6,
+            "memory": 526011052032
+        }
+    },
+    "n2-highmem-4": {
+        "capacity": {
+            "cpu": 4.0,
+            "memory": 33672949760
+        },
+        "allocatable": {
+            "cpu": 3.92,
+            "memory": 29786927104
+        },
"measured_overhead": { + "cpu": 0.435, + "memory": 488636416 + }, + "available": { + "cpu": 3.485, + "memory": 29298290688 + } + }, + "r5.4xlarge": { + "capacity": { + "cpu": 16.0, + "memory": 133545017344 + }, + "allocatable": { + "cpu": 15.89, + "memory": 130473738240 + }, + "measured_overhead": { + "cpu": 0.17, + "memory": 262144000 + }, + "available": { + "cpu": 15.72, + "memory": 130211594240 + } + }, + "n2-highmem-32": { + "capacity": { + "cpu": 32.0, + "memory": 270473359360 + }, + "allocatable": { + "cpu": 31.85, + "memory": 257783492608 + }, + "measured_overhead": { + "cpu": 0.426, + "memory": 457179136 + }, + "available": { + "cpu": 31.424, + "memory": 257326313472 + } + }, + "n1-highmem-4": { + "capacity": { + "cpu": 4.0, + "memory": 27328200704 + }, + "allocatable": { + "cpu": 3.92, + "memory": 23829102592 + }, + "measured_overhead": { + "cpu": 0.441, + "memory": 593494016 + }, + "available": { + "cpu": 3.479, + "memory": 23235608576 + } + } +} \ No newline at end of file diff --git a/deployer/resource_allocation/update_nodeinfo.py b/deployer/resource_allocation/update_nodeinfo.py new file mode 100644 index 0000000000..33fd3e7bec --- /dev/null +++ b/deployer/resource_allocation/update_nodeinfo.py @@ -0,0 +1,153 @@ +import json +import subprocess +from datetime import datetime, timedelta, timezone +from pathlib import Path + +import typer +from dateutil.parser import parse +from kubernetes.utils.quantity import parse_quantity +from ruamel.yaml import YAML + +from ..cli_app import app + +HERE = Path(__file__).parent + +yaml = YAML(typ="rt") + + +def get_node_capacity_info(instance_type: str): + # Get full YAML spec of all nodes with this instance_type + nodes = json.loads( + subprocess.check_output( + [ + "kubectl", + "get", + "node", + "-l", + # Let's make sure we don't accidentally pick up a core node + f"node.kubernetes.io/instance-type={instance_type},hub.jupyter.org/node-purpose=user", + "-o", + "json", + ] + ).decode() + ) + + if not nodes.get("items"): + # No nodes with given instance_type found! + # A node with this instance_type needs to be actively running for us to accurately + # calculate how much resources are available, as it relies on the non-jupyter pods + # running at that time. + raise ValueError( + f"No nodes with instance-type={instance_type} found in current kubernetes cluster" + ) + + # Let's just make sure it is at least 3 minutes old, to give all pods a + # chance to actually schedule on this. + nodes["items"] = [ + n + for n in nodes["items"] + if datetime.now(timezone.utc) - parse(n["metadata"]["creationTimestamp"]) + > timedelta(minutes=3) + ] + + if not nodes.get("items"): + # A node was found, but it was not old enough. + # We want to wait a while before using it, so daemonsets get time to + # be scheduled + raise ValueError( + f"Node with instance-type={instance_type} found in current kubernetes cluster is not 3 minutes old yet. Wait and try again" + ) + # Just pick one node + node = nodes["items"][0] + + # This is the toal amount of RAM and CPU on the node. + capacity = node["status"]["capacity"] + cpu_capacity = parse_quantity(capacity["cpu"]) + mem_capacity = parse_quantity(capacity["memory"]) + + # Total amount of RAM and CPU available to kubernetes as a whole. + # This accounts for things running on the node, such as kubelet, the + # container runtime, systemd, etc. This does *not* count for daemonsets + # and pods runninng on the kubernetes cluster. 
+ allocatable = node["status"]["allocatable"] + cpu_allocatable = parse_quantity(allocatable["cpu"]) + mem_allocatable = parse_quantity(allocatable["memory"]) + + # Find all pods running on this node + all_pods = json.loads( + subprocess.check_output( + [ + "kubectl", + "get", + "pod", + "-A", + "--field-selector", + f'spec.nodeName={node["metadata"]["name"]}', + "-o", + "json", + ] + ).decode() + )["items"] + + # Filter out jupyterhub user pods + # TODO: Filter out dask scheduler and worker pods + pods = [ + p + for p in all_pods + if p["metadata"]["labels"].get("component") not in ("singleuser-server",) + ] + + # This is the amount of resources available for our workloads - jupyter and dask. + # We start with the allocatable resources, and subtract the resource *requirements* + # for all the pods that are running on every node, primarily from kube-system and + # support. The amount left over is what is available for the *scheduler* to put user pods + # on to. + cpu_available = cpu_allocatable + mem_available = mem_allocatable + + for p in pods: + mem_request = 0 + cpu_request = 0 + # Iterate through all the containers in the pod, and count the memory & cpu requests + # they make. We don't count initContainers' requests as they don't overlap with the + # container requests at any point. + for c in p["spec"]["containers"]: + mem_request += parse_quantity( + c.get("resources", {}).get("requests", {}).get("memory", "0") + ) + cpu_request += parse_quantity( + c.get("resources", {}).get("requests", {}).get("cpu", "0") + ) + cpu_available -= cpu_request + mem_available -= mem_request + + return { + # CPU units are in fractions, while memory units are bytes + "capacity": {"cpu": float(cpu_capacity), "memory": int(mem_capacity)}, + "allocatable": {"cpu": float(cpu_allocatable), "memory": int(mem_allocatable)}, + "measured_overhead": { + "cpu": float(cpu_allocatable - cpu_available), + "memory": int(mem_allocatable - mem_available), + }, + "available": {"cpu": float(cpu_available), "memory": int(mem_available)}, + } + + +@app.command() +def update_node_capacity_info( + instance_type: str = typer.Argument( + ..., help="Instance type to generate Resource Allocation options for" + ), +): + try: + with open(HERE / "node-capacity-info.json") as f: + instances_info = json.load(f) + except FileNotFoundError: + instances_info = {} + node_capacity = get_node_capacity_info(instance_type) + + instances_info[instance_type] = node_capacity + with open(HERE / "node-capacity-info.json", "w") as f: + json.dump(instances_info, f, indent=4) + + print(f"Updated node-capacity-info.json for {instance_type}") diff --git a/docs/index.md b/docs/index.md index 7e61d7dc11..1ab70d0e9d 100644 --- a/docs/index.md +++ b/docs/index.md @@ -88,6 +88,7 @@ topic/access-creds/index.md topic/infrastructure/index.md topic/monitoring-alerting/index.md topic/features.md +topic/resource-allocations.md ``` ## Reference diff --git a/docs/topic/resource-allocation.md b/docs/topic/resource-allocation.md new file mode 100644 index 0000000000..479faeaa51 --- /dev/null +++ b/docs/topic/resource-allocation.md @@ -0,0 +1,88 @@ +# Resource Allocation on Profile Lists + +This document lays out general guidelines on how to think about what goes into +the list of choices about resource allocation that are presented to the user +in the profile List, so they can make an informed choice about what they want +without getting overwhelmed. + +This primarily applies just to research hubs, not educational hubs. + +## Factors to balance + +1. 
+1. **Server startup time**
+
+   If everyone gets an instance just for themselves, every server start has to
+   wait for a new instance to come up, so servers take forever to start. Usually,
+   many users are active at the same time, and we can decrease server startup
+   time by putting many users on the same machine in a way that they don't step
+   on each other's toes.
+
+2. **Cloud cost**
+
+   If we pick really large machines, fewer scale-up events need to be
+   triggered, so server startup is much faster. However, we pay for instances
+   regardless of how 'full' they are, so if we have a 64GB instance that only has
+   1GB used, we're paying extra for that. So a trade-off has to be made on
+   *machine size*. This can be quantified, though, and used to help make the
+   trade-off.
+
+3. **Resource *limits*, which the end user can consistently observe.**
+
+   Memory limits are easy to explain to end users - if you go over the memory limit, your
+   kernel dies. If you go over the CPU limit, well, you can't - you get throttled.
+   If we set limits appropriately, they will also helpfully show up in the status
+   bar via
+   [jupyter-resource-usage](https://github.com/jupyter-server/jupyter-resource-usage).
+
+4. **Resource *requests*, which are harder for end users to observe**, as they are primarily
+   meant for the *scheduler*, telling it how to pack user pods onto nodes for higher
+   utilization. This has an 'oversubscription' factor, relying on the fact that
+   most users don't actually use resources up to their limit. However, this factor
+   varies from community to community, and must be carefully tuned. Users may use more
+   resources than they are guaranteed *sometimes*, but then get their kernels
+   killed or their CPU throttled at *other* times, based on what *other* users are
+   doing. This inconsistent behavior is confusing to end users, and we should be
+   careful in how we tune it.
+
+So in summary, there are two kinds of factors:
+
+1. **Noticeable by users**
+   1. Server startup time
+   2. Memory Limit
+   3. CPU Limit
+
+2. **Noticeable by infrastructure & hub admins**
+   1. Cloud cost (proxied via utilization %)
+
+The *variables* available to Infrastructure Engineers and hub admins to tune
+are:
+
+1. Size of instances offered
+
+2. "Oversubscription" factor for memory - this is the ratio of memory limit to
+   memory guarantee. If users are using memory > guarantee but < limit, they *may*
+   get their kernels killed. Based on our knowledge of this community, we can tune
+   this variable to reduce cloud cost while also reducing disruption in terms of
+   kernels being killed.
+
+3. "Oversubscription" factor for CPU. This is easier to handle, as CPUs can be
+   *throttled* easily. A user may use 4 CPUs for a minute, but then go back to 2
+   CPUs the next minute without anything being "killed". This is unlike memory, where
+   memory once given cannot be taken back. If a user is over their memory guarantee and
+   another user who is *under* the guarantee needs the memory, the first user's
+   kernel *will* be killed. Since this doesn't happen with CPUs, we can be more
+   liberal in oversubscribing CPUs.
+
+## UX Goals
+
+The goals when generating the list of resource allocation choices are the following:
+
+1. Profile options should be *automatically* generated by a script, with various
+   options to be tuned by whoever is running it. Engineers should have an easy
+   time making these choices.
+
+2. The *end user* should be able to easily understand the ramifications of the
+   options they choose, and it should be visible to them *after* they start their
+   notebook as well.
+
+3. It's alright for users who want *more resources* to have to wait longer for a
+   server start than users who want fewer resources. This is an incentive to start
+   with fewer resources and then size up.
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 134218453c..86ed322173 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -38,3 +38,5 @@ gspread==5.11.*
 
 # requests is used by deployer/cilogon_app.py
 requests==2.*
+# Used to parse units that kubernetes understands (like GiB)
+kubernetes
\ No newline at end of file
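
As a quick illustration of what `proportional_memory_strategy()` in `deployer/resource_allocation/generate_choices.py` computes, here is a minimal standalone sketch (not part of the deployer CLI) that reproduces only the halving / proportional-guarantee arithmetic, using the r5.xlarge `available` figures recorded in `node-capacity-info.json` and the default of five allocations:

```python
# Minimal standalone sketch of the proportional-memory strategy math,
# using the r5.xlarge "available" figures from node-capacity-info.json.
available_node_mem = 31883231232  # bytes available for user pods on r5.xlarge
available_node_cpu = 3.75

mem_limit = available_node_mem
for _ in range(5):  # num_allocations defaults to 5
    # CPU guarantee scales with the fraction of the node's memory requested
    cpu_guarantee = (mem_limit / available_node_mem) * available_node_cpu
    mem_display = f"{mem_limit / 1024 / 1024 / 1024:.1f}"
    print(f"{mem_display} GB RAM: cpu_guarantee={cpu_guarantee:.2f}, cpu_limit={available_node_cpu}")
    # Each successive choice offers half the memory of the previous one
    mem_limit = mem_limit / 2
```

Running this prints five choices from 29.7 GB (guaranteed the node's full 3.75 CPUs) down to roughly 1.9 GB (guaranteed about 0.23 CPUs), which is the same progression the generated profile list exposes to users, smallest choice first.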