[pull] master from 2i2c-org:master #541

Merged 9 commits on Nov 21, 2023
62 changes: 0 additions & 62 deletions config/clusters/jupyter-meets-the-earth/common.values.yaml
@@ -284,65 +284,3 @@ dask-gateway:
memory:
request: 2G
limit: 500G

# Note that we are overriding options provided in 2i2c's helm chart that has
# default values for these config entries.
#
extraConfig:
# This configuration represents options that can be presented to users
# that want to create a Dask cluster using dask-gateway. For more
# details, see https://gateway.dask.org/cluster-options.html
#
# The goal is to provide a simple configuration that allows the user some
# flexibility while also fitting well on AWS nodes, which all have a
# 1:4 ratio between CPU and GB of memory. By providing the
# username label, we help administrators track user pods.
option_handler: |
from dask_gateway_server.options import Options, Select, String, Mapping
def cluster_options(user):
def option_handler(options):
if ":" not in options.image:
raise ValueError("When specifying an image you must also provide a tag")
extra_labels = {}
scheduler_extra_pod_annotations = {
"prometheus.io/scrape": "true",
"prometheus.io/port": "8787",
}
chosen_worker_cpu = int(options.worker_specification.split("CPU")[0])
chosen_worker_memory = 4 * chosen_worker_cpu
# We multiply the requests by a fraction to ensure that the
# workers fit well within a node that needs some resources
# reserved for system pods.
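# Illustrative example (not in the original file): picking the
# "4CPU, 16GB" worker_specification yields chosen_worker_cpu=4 and
# chosen_worker_memory=16, so the worker requests 0.85*4 = 3.4 cores and
# 0.85*16 = 13.6G of memory, with limits of 4 cores and 16G.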
return {
# A default image is suggested via DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE env variable
"image": options.image,
"scheduler_extra_pod_labels": extra_labels,
"scheduler_extra_pod_annotations": scheduler_extra_pod_annotations,
"worker_extra_pod_labels": extra_labels,
"worker_cores": 0.85 * chosen_worker_cpu,
"worker_cores_limit": chosen_worker_cpu,
"worker_memory": "%fG" % (0.85 * chosen_worker_memory),
"worker_memory_limit": "%fG" % chosen_worker_memory,
"environment": options.environment,
}
return Options(
Select(
"worker_specification",
[
"1CPU, 4GB",
"2CPU, 8GB",
"4CPU, 16GB",
"8CPU, 32GB",
"16CPU, 64GB",
"32CPU, 128GB",
"64CPU, 256GB",
],
default="1CPU, 4GB",
label="Worker specification",
),
# The default image is set via DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE env variable
String("image", label="Image"),
Mapping("environment", {}, label="Environment variables"),
handler=option_handler,
)
c.Backend.cluster_options = cluster_options
19 changes: 19 additions & 0 deletions helm-charts/basehub/templates/configmap-cluster-info.yaml
@@ -0,0 +1,19 @@
kind: ConfigMap
apiVersion: v1
metadata:
name: basehub-cluster-info
labels:
helm.sh/chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }}
app.kubernetes.io/name: basehub
app.kubernetes.io/instance: {{ .Release.Name }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
data:
{{- $k8s_dist := "" }}
{{- if (.Capabilities.KubeVersion.Version | contains "gke") }}
{{- $k8s_dist = "gke" }}
{{- else if (.Capabilities.KubeVersion.Version | contains "eks") }}
{{- $k8s_dist = "eks" }}
{{- else }}
{{- $k8s_dist = "aks" }}
{{- end }}
K8S_DIST: {{ $k8s_dist }}
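The template infers the distribution from the server-reported Kubernetes version string, since managed distributions embed their name in it. A minimal sketch of the same detection in Python (illustrative only, not part of the chart), assuming typical version strings like "v1.27.3-gke.100" on GKE and "v1.27.4-eks-2d98532" on EKS:

def detect_k8s_dist(kube_version: str) -> str:
    # GKE and EKS embed their name in the version string; anything else is
    # assumed to be AKS, mirroring the template's else branch.
    if "gke" in kube_version:
        return "gke"
    if "eks" in kube_version:
        return "eks"
    return "aks"

assert detect_k8s_dist("v1.27.3-gke.100") == "gke"
assert detect_k8s_dist("v1.27.4-eks-2d98532") == "eks"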
6 changes: 6 additions & 0 deletions helm-charts/basehub/values.yaml
@@ -461,6 +461,12 @@ jupyterhub:
- value: "/rstudio"
title: RStudio
description: An IDE For R, created by the RStudio company
extraEnv:
BASEHUB_K8S_DIST:
valueFrom:
configMapKeyRef:
name: basehub-cluster-info
key: K8S_DIST
initContainers:
- name: templates-clone
image: alpine/git:2.40.1
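With this in place, hub configuration can branch on the Kubernetes distribution. A hypothetical snippet (not part of this diff) reading the injected variable inside the hub container:

import os

# BASEHUB_K8S_DIST is injected from the basehub-cluster-info ConfigMap
# defined above and is one of "gke", "eks", or "aks".
k8s_dist = os.environ.get("BASEHUB_K8S_DIST", "")
if k8s_dist == "gke":
    pass  # apply GKE-specific configuration here
elif k8s_dist == "eks":
    pass  # apply EKS-specific configuration here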
142 changes: 126 additions & 16 deletions helm-charts/daskhub/values.yaml
@@ -146,13 +146,32 @@ dask-gateway:
nodeSelector:
# Dask workers get their own pre-emptible pool
k8s.dask.org/node-purpose: worker
env:
- name: BASEHUB_K8S_DIST
valueFrom:
configMapKeyRef:
name: basehub-cluster-info
key: K8S_DIST

# TODO: figure out a replacement for userLimits.
extraConfig:
# This configuration represents options that can be presented to users
# who want to create a Dask cluster using the dask-gateway client.
#
# This configuration is meant to enable the user to request dask worker
# pods that fit well on 2i2c's clusters. Currently the only instance
# types used are n2-highmem-16 and r5.4xlarge.
#
# - Documentation about exposing cluster options to users:
# https://gateway.dask.org/cluster-options.html
# - Reference for KubeClusterConfig, which is what can be configured:
# https://gateway.dask.org/api-server.html#kubeclusterconfig.
#
optionHandler: |
from dask_gateway_server.options import Options, Integer, Float, String, Mapping
import os
import string

from dask_gateway_server.options import Integer, Mapping, Options, Select, String

# Escape a string to be dns-safe in the same way that KubeSpawner does it.
# Reference https://github.com/jupyterhub/kubespawner/blob/616f72c4aee26c3d2127c6af6086ec50d6cda383/kubespawner/spawner.py#L1828-L1835
# Adapted from https://github.com/minrk/escapism to avoid installing the package
@@ -177,40 +177,131 @@
chars.append(escaped_hex_char)
return u''.join(chars)
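# For illustration (not part of the chart): with the hex escaping above,
# escape_string_label_safe("user@email") returns "user-40email", since "@"
# is not a label-safe character and hex(ord("@")) is 0x40.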

# Decide which instance types and resource allocation choices to
# expose based on the cloud provider. Each daskhub hub managed by 2i2c
# should have these instance types available.
#
cloud_provider = os.environ["BASEHUB_K8S_DIST"] # gke, eks, or aks
instance_types = {
"gke": ["n2-highmem-16"],
"eks": ["r5.4xlarge"],
# 2i2c doesn't yet manage any dask-gateway installations on AKS, so
# this hasn't been configured yet and may cause an error - but that
# is good, as we really should configure this if we set up dask-gateway for
# AKS anyhow.
# aks: [],
}

# NOTE: Data mentioned below comes from manual inspection of data
# collected and currently only available at
# https://github.com/2i2c-org/infrastructure/pull/3337.
#
resource_allocations = {
# n2-highmem-16 nodes in our clusters have 15.89 allocatable cores
# and 116.549Gi allocatable memory, and daemonsets are expected to
# not add more than 400m cores and 800Mi (0.781Gi) memory with some
# margin, so we get 15.49 cores and 115.768Gi available for worker
# pods to request.
#
# This is an initial conservative strategy, allowing a slight
# oversubscription of CPU but not any oversubscription of memory.
#
# To workaround https://github.com/dask/dask-gateway/issues/765, we
# round worker_cores down from [0.968, 1.936, 3.872, 7.745, 15.49]
# to [0.9, 1.9, 3.8, 7.7, 15.4].
#
"n2-highmem-16": {
"1CPU, 7.2Gi": {"worker_cores": 0.9, "worker_cores_limit": 1, "worker_memory": "7.235G", "worker_memory_limit": "7.235G"},
"2CPU, 14.5Gi": {"worker_cores": 1.9, "worker_cores_limit": 2, "worker_memory": "14.471G", "worker_memory_limit": "14.471G"},
"4CPU, 28.9Gi": {"worker_cores": 3.8, "worker_cores_limit": 4, "worker_memory": "28.942G", "worker_memory_limit": "28.942G"},
"8CPU, 57.9Gi": {"worker_cores": 7.7, "worker_cores_limit": 8, "worker_memory": "57.884G", "worker_memory_limit": "57.884G"},
"16CPU, 115.8Gi": {"worker_cores": 15.4, "worker_cores_limit": 16, "worker_memory": "115.768G", "worker_memory_limit": "115.768G"},
},
# r5.4xlarge nodes in our clusters have 15.89 allocatable cores and
# 121.504Gi allocatable memory, and daemonsets are expected to not
# add more than 400m cores and 800Mi (0.781Gi) memory with some
# margin, so we get 15.49 cores and 120.723Gi available for worker
# pods to request.
#
# This is an initial conservative strategy, allowing a slight
# oversubscription of CPU but not any oversubscription of memory.
#
# To workaround https://github.com/dask/dask-gateway/issues/765, we
# round worker_cores down from [0.968, 1.936, 3.872, 7.745, 15.49]
# to [0.9, 1.9, 3.8, 7.7, 15.4].
#
"r5.4xlarge": {
"1CPU, 7.5Gi": {"worker_cores": 0.9, "worker_cores_limit": 1, "worker_memory": "7.545G", "worker_memory_limit": "7.545G"},
"2CPU, 15.1Gi": {"worker_cores": 1.9, "worker_cores_limit": 2, "worker_memory": "15.090G", "worker_memory_limit": "15.090G"},
"4CPU, 30.2Gi": {"worker_cores": 3.8, "worker_cores_limit": 4, "worker_memory": "30.180G", "worker_memory_limit": "30.180G"},
"8CPU, 60.4Gi": {"worker_cores": 7.7, "worker_cores_limit": 8, "worker_memory": "60.361G", "worker_memory_limit": "60.361G"},
"16CPU, 120.7Gi": {"worker_cores": 15.4, "worker_cores_limit": 16, "worker_memory": "120.723G", "worker_memory_limit": "120.723G"},
},
}
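# Worked example (illustrative only): the "4CPU, 28.9Gi" allocation for
# n2-highmem-16 above is a quarter of the node's available capacity:
# 4/16 * 15.49 = 3.872 cores (rounded down to 3.8 per the note above,
# with a limit of 4) and 4/16 * 115.768 = 28.942G of memory, used for
# both the memory request and the limit.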

# For now we only support one instance type per cluster; listing it
# as an option is a way to help convey how things work a bit better.
it = instance_types[cloud_provider][0]
ra = resource_allocations[it]
ra_keys = list(ra.keys())

def cluster_options(user):
safe_username = escape_string_label_safe(user.name)
def option_handler(options):
if ":" not in options.image:
raise ValueError("When specifying an image you must also provide a tag")
extra_labels = {
"hub.jupyter.org/username": escape_string_label_safe(user.name),
}
scheduler_extra_pod_annotations = {
"hub.jupyter.org/username": safe_username,
"hub.jupyter.org/username": user.name,
"prometheus.io/scrape": "true",
"prometheus.io/port": "8787",
}
extra_labels = {
"hub.jupyter.org/username": safe_username,
worker_extra_pod_annotations = {
"hub.jupyter.org/username": user.name,
}
picked_ra = ra[options.worker_resource_allocation]

return {
"worker_cores_limit": options.worker_cores,
"worker_cores": options.worker_cores,
"worker_memory": "%fG" % options.worker_memory,
# A default image is suggested via DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE env variable
"image": options.image,
"scheduler_extra_pod_annotations": scheduler_extra_pod_annotations,
"scheduler_extra_pod_labels": extra_labels,
"scheduler_extra_pod_annotations": scheduler_extra_pod_annotations,
"worker_extra_pod_labels": extra_labels,
"worker_extra_pod_annotations": worker_extra_pod_annotations,
"worker_cores": picked_ra["worker_cores"],
"worker_cores_limit": picked_ra["worker_cores_limit"],
"worker_memory": picked_ra["worker_memory"],
"worker_memory_limit": picked_ra["worker_memory_limit"],
"environment": options.environment,
"idle_timeout": options.idle_timeout_minutes * 60,
}
return Options(
Integer("worker_cores", 2, min=1, label="Worker Cores"),
Float("worker_memory", 4, min=1, label="Worker Memory (GiB)"),
# The default image is set via DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE env variable
Select(
"instance_type",
[it],
default=it,
label="Instance type running worker containers",
),
Select(
"worker_resource_allocation",
ra_keys,
default=ra_keys[0],
label="Resources per worker container",
),
# The default image is pre-specified by the dask-gateway client
# via the env var DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE set on
# the jupyterhub user pods
String("image", label="Image"),
Mapping("environment", {}, label="Environment Variables"),
Mapping("environment", {}, label="Environment variables (YAML)"),
Integer("idle_timeout_minutes", 30, min=0, label="Idle cluster terminated after (minutes)"),
handler=option_handler,
)
c.Backend.cluster_options = cluster_options
idle: |
# timeout after 30 minutes of inactivity

# timeout after 30 minutes of inactivity by default, keep this in sync
# with the user exposed option idle_timeout_minutes's default value
# configured above
c.KubeClusterConfig.idle_timeout = 1800
prefix: "/services/dask-gateway" # Users connect to the Gateway through the JupyterHub service.
auth:
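For context, a rough sketch of how a hub user would consume these options from a notebook with the dask-gateway client (the option names match the configuration above; the chosen values are illustrative):

from dask_gateway import Gateway

gateway = Gateway()
options = gateway.cluster_options()

# Pick one of the per-worker resource allocations exposed above (here a
# quarter-node worker on n2-highmem-16) and shorten the idle timeout.
options.worker_resource_allocation = "4CPU, 28.9Gi"
options.idle_timeout_minutes = 15

cluster = gateway.new_cluster(options)
cluster.scale(4)
client = cluster.get_client()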