Merge pull request #3337 from consideRatio/pr/update-node-info

deployer: add info generating commands under resource-allocation

Showing 6 changed files with 687 additions and 0 deletions.
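Based on the command definition in the diff below, the new subcommand is run once per cluster; for example, using one of the cluster names recorded in daemonset_requests.yaml:

    deployer generate resource-allocation daemonset-requests 2i2c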
157 changes: 157 additions & 0 deletions
deployer/commands/generate/resource_allocation/daemonset_requests.py
@@ -0,0 +1,157 @@
import json
import math
import subprocess
from pathlib import Path

import typer
from kubernetes.utils.quantity import parse_quantity
from ruamel.yaml import YAML

from deployer.infra_components.cluster import Cluster
from deployer.utils.file_acquisition import find_absolute_path_to_cluster_file

from .resource_allocation_app import resource_allocation_app

HERE = Path(__file__).parent
yaml = YAML()
yaml.preserve_quotes = True
yaml.indent(mapping=2, sequence=4, offset=2)


def get_k8s_distribution():
    """
    Returns a 2-tuple with the guessed k8s distribution based on the k8s
    api-server's reported version, either Google's GKE, Amazon's EKS, or
    Azure's AKS, and the server's reported gitVersion.
    """
    output = subprocess.check_output(
        [
            "kubectl",
            "version",
            "--output=json",
        ],
        text=True,
    )
    version_info = json.loads(output)
    server_version_info = version_info["serverVersion"]["gitVersion"]
    if "gke" in server_version_info:
        return "gke", server_version_info
    if "eks" in server_version_info:
        return "eks", server_version_info
    return "aks", server_version_info


def get_daemon_sets_requests():
    """
    Returns a list of dicts with info about DaemonSets with pods desired to be
    scheduled on some nodes in the k8s cluster.
    """
    output = subprocess.check_output(
        [
            "kubectl",
            "get",
            "ds",
            "--all-namespaces",
            "--output=jsonpath-as-json={.items[*]}",
        ],
        text=True,
    )
    daemon_sets = json.loads(output)

    # filter out DaemonSets that aren't desired on any node
    daemon_sets = [ds for ds in daemon_sets if ds["status"]["desiredNumberScheduled"]]

    info = []
    for ds in daemon_sets:
        name = ds["metadata"]["name"]
        req_mem = req_cpu = lim_mem = lim_cpu = 0
        for c in ds["spec"]["template"]["spec"]["containers"]:
            resources = c.get("resources", {})
            requests = resources.get("requests", {})
            limits = resources.get("limits", {})
            req_mem += parse_quantity(requests.get("memory", 0))
            lim_mem += parse_quantity(limits.get("memory", 0))
            req_cpu += parse_quantity(requests.get("cpu", 0))
            lim_cpu += parse_quantity(limits.get("cpu", 0))

        info.append(
            {
                "name": name,
                "cpu_request": float(req_cpu),
                "cpu_limit": float(lim_cpu),
                "memory_request": int(req_mem),
                "memory_limit": int(lim_mem),
            }
        )

    return info


def get_daemon_sets_requests_summary():
    """
    Returns a summary of the requests from `get_daemon_sets_requests`.
    """
    daemon_sets = get_daemon_sets_requests()
    # filter out DaemonSets related to nvidia GPUs
    daemon_sets = [ds for ds in daemon_sets if "nvidia" not in ds["name"]]
    # separate DaemonSets without requests, as only requests are what impacts
    # scheduling of pods and reduces a node's remaining allocatable resources
    req_daemon_sets = [
        ds for ds in daemon_sets if ds["cpu_request"] or ds["memory_request"]
    ]
    other_daemon_sets = [
        ds for ds in daemon_sets if not ds["cpu_request"] and not ds["memory_request"]
    ]

    cpu_requests = sum([ds["cpu_request"] for ds in req_daemon_sets])
    memory_requests = sum([ds["memory_request"] for ds in req_daemon_sets])
    info = {
        "requesting_daemon_sets": ",".join(
            sorted([ds["name"] for ds in req_daemon_sets])
        ),
        "other_daemon_sets": ",".join(sorted([ds["name"] for ds in other_daemon_sets])),
        "cpu_requests": str(math.ceil(cpu_requests * 1000)) + "m",
        "memory_requests": str(math.ceil(memory_requests / 1024**2)) + "Mi",
    }
    return info


@resource_allocation_app.command()
def daemonset_requests(
    cluster_name: str = typer.Argument(..., help="Name of cluster to operate on"),
):
    """
    Updates `daemonset_requests.yaml` with an individual cluster's DaemonSets'
    requests summarized.

    Only DaemonSets with running pods are considered, and GPU related
    DaemonSets (with "nvidia" in the name) are also ignored.

    To run this command for all clusters, `xargs` can be used like this:

        ls config/clusters | xargs -I {} deployer generate resource-allocation daemonset-requests {}
    """
    file_path = HERE / "daemonset_requests.yaml"
    file_path.touch(exist_ok=True)

    # acquire a Cluster object
    config_file_path = find_absolute_path_to_cluster_file(cluster_name)
    with open(config_file_path) as f:
        cluster = Cluster(yaml.load(f), config_file_path.parent)

    # auth and inspect cluster
    with cluster.auth():
        k8s_dist, k8s_version = get_k8s_distribution()
        ds_requests = get_daemon_sets_requests_summary()

    # read
    with open(file_path) as f:
        info = yaml.load(f) or {}

    # update
    ds_requests["k8s_version"] = k8s_version
    info.setdefault(k8s_dist, {})[cluster_name] = ds_requests

    # write
    with open(file_path, "w") as f:
        yaml.dump(info, f)
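A note on the unit handling in get_daemon_sets_requests_summary above: parse_quantity from the kubernetes client parses k8s quantity strings like "100m" or "200Mi" into decimal.Decimal values in base units (cores and bytes), and the summary re-renders the rounded-up totals as millicores and mebibytes. A minimal sketch of that round trip, using illustrative request values rather than data from a real cluster:

    import math

    from kubernetes.utils.quantity import parse_quantity

    # illustrative container requests, not data from a real cluster
    cpu = parse_quantity("100m") + parse_quantity("0.25")  # Decimal("0.35") cores
    mem = parse_quantity("200Mi") + parse_quantity("128974848")  # bytes

    # re-render the totals the way the summary does, rounded up
    print(str(math.ceil(cpu * 1000)) + "m")      # 350m
    print(str(math.ceil(mem / 1024**2)) + "Mi")  # 323Mi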
180 changes: 180 additions & 0 deletions
deployer/commands/generate/resource_allocation/daemonset_requests.yaml
@@ -0,0 +1,180 @@
# This file contains generated information about cpu/memory requests made by
# DaemonSets with running pods in our clusters. This information is relevant
# when planning cpu/memory requests for other pods, as the daemonsets' requests
# reduce the available allocatable capacity.
#
# The requests vary between cloud providers, clusters, and k8s versions for
# reasons like:
#
# - Cloud providers' managed k8s provides different DaemonSets by default
# - DaemonSets may be coupled to managed k8s features (calico-node)
# - DaemonSets' requests may be coupled to managed k8s version (netd)
# - DaemonSets may have a vertical autoscaler changing requests dynamically over
#   time if needed (calico-node-vertical-autoscaler)
# - We may deploy or change a DaemonSet's requests over time (support-cryptnono,
#   support-prometheus-node-exporter)
#
# This file isn't updated by automation, but can easily be updated by manually
# running a command once for each cluster:
#
#   ls config/clusters | xargs -I {} deployer generate resource-allocation daemonset-requests {}
#
gke:
  2i2c:
    requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: binder-staging-dind,binder-staging-image-cleaner,continuous-image-puller,imagebuilding-demo-binderhub-service-docker-api,netd
    cpu_requests: 342m
    memory_requests: 566Mi
    k8s_version: v1.26.5-gke.2100
  2i2c-uk:
    requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 344m
    memory_requests: 596Mi
    k8s_version: v1.27.4-gke.900
  awi-ciroh:
    requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: netd
    cpu_requests: 342m
    memory_requests: 566Mi
    k8s_version: v1.25.10-gke.2700
  callysto:
    requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 344m
    memory_requests: 596Mi
    k8s_version: v1.27.4-gke.900
  catalystproject-latam:
    requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,ip-masq-agent,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 338m
    memory_requests: 496Mi
    k8s_version: v1.27.3-gke.100
  cloudbank:
    requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: continuous-image-puller,continuous-image-puller,continuous-image-puller,netd
    cpu_requests: 342m
    memory_requests: 566Mi
    k8s_version: v1.26.5-gke.2100
  hhmi:
    requesting_daemon_sets: fluentbit-gke,gke-metadata-server,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 228m
    memory_requests: 480Mi
    k8s_version: v1.27.3-gke.100
  leap:
    requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: netd
    cpu_requests: 342m
    memory_requests: 566Mi
    k8s_version: v1.25.10-gke.2700
  linked-earth:
    requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 344m
    memory_requests: 596Mi
    k8s_version: v1.27.4-gke.900
  m2lines:
    requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 344m
    memory_requests: 596Mi
    k8s_version: v1.27.4-gke.900
  meom-ige:
    requesting_daemon_sets: fluentbit-gke,gke-metadata-server,gke-metrics-agent,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 234m
    memory_requests: 580Mi
    k8s_version: v1.27.4-gke.900
  pangeo-hubs:
    requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: netd
    cpu_requests: 342m
    memory_requests: 566Mi
    k8s_version: v1.26.5-gke.2100
  qcl:
    requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,ip-masq-agent,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: continuous-image-puller,continuous-image-puller,netd
    cpu_requests: 336m
    memory_requests: 466Mi
    k8s_version: v1.25.10-gke.2700
eks:
  2i2c-aws-us:
    requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 170m
    memory_requests: 250Mi
    k8s_version: v1.25.12-eks-2d98532
  carbonplan:
    requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 170m
    memory_requests: 250Mi
    k8s_version: v1.24.16-eks-2d98532
  catalystproject-africa:
    requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 170m
    memory_requests: 250Mi
    k8s_version: v1.27.4-eks-2d98532
  gridsst:
    requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 170m
    memory_requests: 250Mi
    k8s_version: v1.25.12-eks-2d98532
  jupyter-meets-the-earth:
    requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 170m
    memory_requests: 250Mi
    k8s_version: v1.25.12-eks-2d98532
  nasa-cryo:
    requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 170m
    memory_requests: 250Mi
    k8s_version: v1.25.12-eks-2d98532
  nasa-ghg:
    requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 170m
    memory_requests: 250Mi
    k8s_version: v1.27.4-eks-2d98532
  nasa-veda:
    requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 170m
    memory_requests: 250Mi
    k8s_version: v1.25.12-eks-2d98532
  openscapes:
    requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 170m
    memory_requests: 250Mi
    k8s_version: v1.24.16-eks-2d98532
  smithsonian:
    requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 170m
    memory_requests: 250Mi
    k8s_version: v1.25.12-eks-2d98532
  ubc-eoas:
    requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 170m
    memory_requests: 250Mi
    k8s_version: v1.24.17-eks-f8587cb
  victor:
    requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 170m
    memory_requests: 250Mi
    k8s_version: v1.25.12-eks-2d98532
aks:
  utoronto:
    requesting_daemon_sets: cloud-node-manager,csi-azuredisk-node,csi-azurefile-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: calico-node,continuous-image-puller,continuous-image-puller,continuous-image-puller,continuous-image-puller
    cpu_requests: 226m
    memory_requests: 300Mi
    k8s_version: v1.26.3
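To illustrate how these recorded figures can be used when planning requests for other pods, the sketch below subtracts a cluster's daemonset totals from a node's allocatable capacity; the allocatable numbers are hypothetical, not taken from this file:

    from kubernetes.utils.quantity import parse_quantity

    # hypothetical allocatable capacity, as a node might report via
    # `kubectl get node <name> --output=jsonpath={.status.allocatable}`
    allocatable_cpu = parse_quantity("3920m")    # ~3.92 cores
    allocatable_mem = parse_quantity("12121Mi")  # ~11.8 GiB

    # daemonset request totals recorded above for the gke 2i2c cluster
    ds_cpu = parse_quantity("342m")
    ds_mem = parse_quantity("566Mi")

    # what remains allocatable for user pods and other workloads
    print(float(allocatable_cpu - ds_cpu))            # 3.578 cores
    print(int((allocatable_mem - ds_mem) / 1024**2))  # 11555 Mi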