Merge pull request #3337 from consideRatio/pr/update-node-info
deployer: add info generating commands under resource-allocation
consideRatio authored Nov 4, 2023
2 parents 43b6e66 + baaea25 commit 1be1355
Showing 6 changed files with 687 additions and 0 deletions.
20 changes: 20 additions & 0 deletions deployer/README.md
@@ -88,7 +88,11 @@ The `deployer.py` file is the main file, that contains all of the commands regis
│   │   | ├── decision.py
│   │   | └── jobs.py
| | └── resource_allocation
│   │   ├── daemonset_requests.py
│   │   ├── daemonset_requests.yaml
│   │   ├── generate_choices.py
│   │   ├── instance_capacities.py
│   │   ├── instance_capacities.yaml
│   │   ├── node-capacity-info.json
│   │   ├── resource_allocation_app.py
│   │   └── update_nodeinfo.py
@@ -264,6 +268,22 @@ This sub-command can be used to generate the resource allocation choices for giv
##### `generate resource-allocation choices`
This generates a custom number of resource allocation choices for a given instance type, based on a chosen strategy, that can then be used in the profile list of a hub.
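
For example, to generate choices for a single instance type (the instance type shown and the exact arguments are illustrative; consult `deployer generate resource-allocation choices --help` for the actual interface):

deployer generate resource-allocation choices n2-highmem-16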

##### `generate resource-allocation daemonset-requests`
Updates `daemonset_requests.yaml` with a summary of an individual cluster's DaemonSets' requests.

Only DaemonSets with running pods are considered, and GPU-related DaemonSets (with "nvidia" in the name) are ignored.

To run this command for all clusters, `xargs` can be used like this:

ls config/clusters | xargs -I {} deployer generate resource-allocation daemonset-requests {}
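
As a minimal sketch of how the generated file can be consumed (illustrative only, not part of the deployer; it assumes the per-distribution, per-cluster layout this command writes):

```python
# Illustrative sketch: report per-cluster DaemonSet overhead from the
# generated daemonset_requests.yaml (layout: distribution -> cluster -> info).
from pathlib import Path

from kubernetes.utils.quantity import parse_quantity
from ruamel.yaml import YAML

yaml = YAML()
path = Path("deployer/commands/generate/resource_allocation/daemonset_requests.yaml")
data = yaml.load(path.read_text())

for distribution, clusters in data.items():
    for cluster, info in clusters.items():
        cpu_cores = parse_quantity(info["cpu_requests"])      # e.g. "342m" -> 0.342
        mem_bytes = parse_quantity(info["memory_requests"])   # e.g. "566Mi" -> bytes
        print(
            f"{distribution}/{cluster}: {cpu_cores} cores, "
            f"{int(mem_bytes) / 2**20:.0f} MiB reserved by DaemonSets"
        )
```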

##### `generate resource-allocation instance-capacities`
Updates `instance_capacities.yaml` with an individual cluster's running instance types' total and allocatable capacity.

To run this command for all clusters, `xargs` can be used like this:

ls config/clusters | xargs -I {} deployer generate resource-allocation instance-capacities {}
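
To update the file for a single cluster instead, pass the cluster name directly (here the `2i2c` cluster, already listed in `daemonset_requests.yaml`, is just an example):

deployer generate resource-allocation instance-capacities 2i2c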

##### `generate resource-allocation node-info-update`
This updates the JSON file `node-capacity-info.json` with info about the capacity of a node of a certain type. This file is then used when generating the resource allocation choices.
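
A hypothetical invocation for one instance type (the argument shown is an assumption; consult `deployer generate resource-allocation node-info-update --help` for the actual interface):

deployer generate resource-allocation node-info-update n2-highmem-16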

2 changes: 2 additions & 0 deletions deployer/__main__.py
@@ -9,7 +9,9 @@
import deployer.commands.generate.dedicated_cluster.aws # noqa: F401
import deployer.commands.generate.dedicated_cluster.gcp # noqa: F401
import deployer.commands.generate.helm_upgrade.jobs # noqa: F401
import deployer.commands.generate.resource_allocation.daemonset_requests # noqa: F401
import deployer.commands.generate.resource_allocation.generate_choices # noqa: F401
import deployer.commands.generate.resource_allocation.instance_capacities # noqa: F401
import deployer.commands.generate.resource_allocation.update_nodeinfo # noqa: F401
import deployer.commands.grafana.central_grafana # noqa: F401
import deployer.commands.grafana.deploy_dashboards # noqa: F401
157 changes: 157 additions & 0 deletions deployer/commands/generate/resource_allocation/daemonset_requests.py
@@ -0,0 +1,157 @@
import json
import math
import subprocess
from pathlib import Path

import typer
from kubernetes.utils.quantity import parse_quantity
from ruamel.yaml import YAML

from deployer.infra_components.cluster import Cluster
from deployer.utils.file_acquisition import find_absolute_path_to_cluster_file

from .resource_allocation_app import resource_allocation_app

HERE = Path(__file__).parent
yaml = YAML()
yaml.preserve_quotes = True
yaml.indent(mapping=2, sequence=4, offset=2)


def get_k8s_distribution():
"""
    Returns a 2-tuple with the guessed k8s distribution (Google's GKE, Amazon's
    EKS, or Azure's AKS) based on the k8s api-server's reported version, and the
    server's reported gitVersion.
"""
output = subprocess.check_output(
[
"kubectl",
"version",
"--output=json",
],
text=True,
)
version_info = json.loads(output)
server_version_info = version_info["serverVersion"]["gitVersion"]
if "gke" in server_version_info:
return "gke", server_version_info
if "eks" in server_version_info:
return "eks", server_version_info
return "aks", server_version_info


def get_daemon_sets_requests():
"""
    Returns a list of dicts with info about DaemonSets with pods desired to be
    scheduled on some nodes in the k8s cluster.
"""
output = subprocess.check_output(
[
"kubectl",
"get",
"ds",
"--all-namespaces",
"--output=jsonpath-as-json={.items[*]}",
],
text=True,
)
daemon_sets = json.loads(output)

# filter out DaemonSets that aren't desired on any node
daemon_sets = [ds for ds in daemon_sets if ds["status"]["desiredNumberScheduled"]]

info = []
for ds in daemon_sets:
name = ds["metadata"]["name"]
req_mem = req_cpu = lim_mem = lim_cpu = 0
for c in ds["spec"]["template"]["spec"]["containers"]:
resources = c.get("resources", {})
requests = resources.get("requests", {})
limits = resources.get("limits", {})
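            # parse_quantity converts k8s quantity strings such as "100m" or
            # "128Mi" into Decimal values (cpu in cores, memory in bytes)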
req_mem += parse_quantity(requests.get("memory", 0))
lim_mem += parse_quantity(limits.get("memory", 0))
req_cpu += parse_quantity(requests.get("cpu", 0))
lim_cpu += parse_quantity(limits.get("cpu", 0))

info.append(
{
"name": name,
"cpu_request": float(req_cpu),
"cpu_limit": float(lim_cpu),
"memory_request": int(req_mem),
"memory_limit": int(lim_mem),
}
)

return info


def get_daemon_sets_requests_summary():
"""
Returns a summary of the requests from `get_daemon_sets_requests`.
"""
daemon_sets = get_daemon_sets_requests()
# filter out DaemonSets related to nvidia GPUs
daemon_sets = [ds for ds in daemon_sets if "nvidia" not in ds["name"]]
    # separate out DaemonSets without requests, as only requests impact
    # scheduling of pods and reduce a node's remaining allocatable resources
req_daemon_sets = [
ds for ds in daemon_sets if ds["cpu_request"] or ds["memory_request"]
]
other_daemon_sets = [
ds for ds in daemon_sets if not ds["cpu_request"] and not ds["memory_request"]
]

cpu_requests = sum([ds["cpu_request"] for ds in req_daemon_sets])
memory_requests = sum([ds["memory_request"] for ds in req_daemon_sets])
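    # format the totals as k8s quantity strings: cpu in millicores ("m"),
    # memory in mebibytes ("Mi"), rounded up to stay conservative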
info = {
"requesting_daemon_sets": ",".join(
sorted([ds["name"] for ds in req_daemon_sets])
),
"other_daemon_sets": ",".join(sorted([ds["name"] for ds in other_daemon_sets])),
"cpu_requests": str(math.ceil(cpu_requests * 1000)) + "m",
"memory_requests": str(math.ceil(memory_requests / 1024**2)) + "Mi",
}
return info


@resource_allocation_app.command()
def daemonset_requests(
cluster_name: str = typer.Argument(..., help="Name of cluster to operate on"),
):
"""
    Updates `daemonset_requests.yaml` with a summary of an individual
    cluster's DaemonSets' requests.
    Only DaemonSets with running pods are considered, and GPU-related
    DaemonSets (with "nvidia" in the name) are ignored.
To run this command for all clusters, `xargs` can be used like this:
ls config/clusters | xargs -I {} deployer generate resource-allocation daemonset-requests {}
"""
file_path = HERE / "daemonset_requests.yaml"
file_path.touch(exist_ok=True)

# acquire a Cluster object
config_file_path = find_absolute_path_to_cluster_file(cluster_name)
with open(config_file_path) as f:
cluster = Cluster(yaml.load(f), config_file_path.parent)

# auth and inspect cluster
with cluster.auth():
k8s_dist, k8s_version = get_k8s_distribution()
ds_requests = get_daemon_sets_requests_summary()

# read
with open(file_path) as f:
info = yaml.load(f) or {}

# update
ds_requests["k8s_version"] = k8s_version
info.setdefault(k8s_dist, {})[cluster_name] = ds_requests

# write
with open(file_path, "w") as f:
yaml.dump(info, f)
180 changes: 180 additions & 0 deletions deployer/commands/generate/resource_allocation/daemonset_requests.yaml
@@ -0,0 +1,180 @@
# This file contains generated information about cpu/memory requests made by
# DaemonSets with running pods in our clusters. This information is relevant
# when planning cpu/memory requests for other pods, as the DaemonSets' requests
# reduce the available allocatable capacity.
#
# The requests vary between cloud providers, clusters, and k8s versions for
# reasons like:
#
# - Cloud providers' managed k8s provides different DaemonSets by default
# - DaemonSets may be coupled to managed k8s features (calico-node)
# - DaemonSets' requests may be coupled to managed k8s version (netd)
# - DaemonSets may have a vertical autoscaler changing requests dynamically over
# time if needed (calico-node-vertical-autoscaler)
# - We may deploy or change a DaemonSet's requests over time (support-cryptnono,
# support-prometheus-node-exporter)
#
# This file isn't updated by automation, but can easily be updated by manually
# running a command once for each cluster:
#
# ls config/clusters | xargs -I {} deployer generate resource-allocation daemonset-requests {}
#
gke:
2i2c:
requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: binder-staging-dind,binder-staging-image-cleaner,continuous-image-puller,imagebuilding-demo-binderhub-service-docker-api,netd
cpu_requests: 342m
memory_requests: 566Mi
k8s_version: v1.26.5-gke.2100
2i2c-uk:
requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 344m
memory_requests: 596Mi
k8s_version: v1.27.4-gke.900
awi-ciroh:
requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: netd
cpu_requests: 342m
memory_requests: 566Mi
k8s_version: v1.25.10-gke.2700
callysto:
requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 344m
memory_requests: 596Mi
k8s_version: v1.27.4-gke.900
catalystproject-latam:
requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,ip-masq-agent,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 338m
memory_requests: 496Mi
k8s_version: v1.27.3-gke.100
cloudbank:
requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: continuous-image-puller,continuous-image-puller,continuous-image-puller,netd
cpu_requests: 342m
memory_requests: 566Mi
k8s_version: v1.26.5-gke.2100
hhmi:
requesting_daemon_sets: fluentbit-gke,gke-metadata-server,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 228m
memory_requests: 480Mi
k8s_version: v1.27.3-gke.100
leap:
requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: netd
cpu_requests: 342m
memory_requests: 566Mi
k8s_version: v1.25.10-gke.2700
linked-earth:
requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 344m
memory_requests: 596Mi
k8s_version: v1.27.4-gke.900
m2lines:
requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 344m
memory_requests: 596Mi
k8s_version: v1.27.4-gke.900
meom-ige:
requesting_daemon_sets: fluentbit-gke,gke-metadata-server,gke-metrics-agent,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 234m
memory_requests: 580Mi
k8s_version: v1.27.4-gke.900
pangeo-hubs:
requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: netd
cpu_requests: 342m
memory_requests: 566Mi
k8s_version: v1.26.5-gke.2100
qcl:
requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,ip-masq-agent,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: continuous-image-puller,continuous-image-puller,netd
cpu_requests: 336m
memory_requests: 466Mi
k8s_version: v1.25.10-gke.2700
eks:
2i2c-aws-us:
requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 170m
memory_requests: 250Mi
k8s_version: v1.25.12-eks-2d98532
carbonplan:
requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 170m
memory_requests: 250Mi
k8s_version: v1.24.16-eks-2d98532
catalystproject-africa:
requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 170m
memory_requests: 250Mi
k8s_version: v1.27.4-eks-2d98532
gridsst:
requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 170m
memory_requests: 250Mi
k8s_version: v1.25.12-eks-2d98532
jupyter-meets-the-earth:
requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 170m
memory_requests: 250Mi
k8s_version: v1.25.12-eks-2d98532
nasa-cryo:
requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 170m
memory_requests: 250Mi
k8s_version: v1.25.12-eks-2d98532
nasa-ghg:
requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 170m
memory_requests: 250Mi
k8s_version: v1.27.4-eks-2d98532
nasa-veda:
requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 170m
memory_requests: 250Mi
k8s_version: v1.25.12-eks-2d98532
openscapes:
requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 170m
memory_requests: 250Mi
k8s_version: v1.24.16-eks-2d98532
smithsonian:
requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 170m
memory_requests: 250Mi
k8s_version: v1.25.12-eks-2d98532
ubc-eoas:
requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 170m
memory_requests: 250Mi
k8s_version: v1.24.17-eks-f8587cb
victor:
requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 170m
memory_requests: 250Mi
k8s_version: v1.25.12-eks-2d98532
aks:
utoronto:
requesting_daemon_sets: cloud-node-manager,csi-azuredisk-node,csi-azurefile-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: calico-node,continuous-image-puller,continuous-image-puller,continuous-image-puller,continuous-image-puller
cpu_requests: 226m
memory_requests: 300Mi
k8s_version: v1.26.3