
Commit 771f39f
PoC benchmark to track speed of k8s HPA reaction
PiperOrigin-RevId: 698831891
p3rf Team authored and copybara-github committed Dec 11, 2024
1 parent 7e263a0 commit 771f39f
Showing 12 changed files with 537 additions and 1 deletion.
2 changes: 1 addition & 1 deletion perfkitbenchmarker/benchmark_spec.py
@@ -179,7 +179,7 @@ def __init__(
     self.uuid = '%s-%s' % (FLAGS.run_uri, uuid.uuid4())
     self.always_call_cleanup = pkb_flags.ALWAYS_CALL_CLEANUP.value
     self.dpb_service: dpb_service.BaseDpbService = None
-    self.container_cluster = None
+    self.container_cluster: container_service.BaseContainerCluster = None
     self.key = None
     self.relational_db = None
     self.non_relational_db = None
17 changes: 17 additions & 0 deletions perfkitbenchmarker/container_service.py
@@ -955,6 +955,23 @@ def WaitForResource(
      run_cmd.append('--all')
    RunKubectlCommand(run_cmd, timeout=timeout + 10)

  @staticmethod
  def WaitForSucceeded(
      resource_name: str,
      namespace: str | None = None,
      timeout: int = vm_util.DEFAULT_TIMEOUT,
  ):
    """Waits for a resource to complete (i.e. .status.phase=='Succeeded')."""
    run_cmd = [
        'wait',
        '--for=jsonpath={.status.phase}=Succeeded',
        f'--timeout={timeout}s',
        resource_name,
    ]
    if namespace:
      run_cmd.append(f'--namespace={namespace}')
    RunKubectlCommand(run_cmd, timeout=timeout + 10)

  @staticmethod
  def WaitForRollout(
      resource_name: str, timeout: int = vm_util.DEFAULT_TIMEOUT
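For illustration, a minimal usage sketch of the new helper. The resource names here are hypothetical, not from this commit; the call shells out to `kubectl wait` with a jsonpath condition:

```python
# Hypothetical usage (names are illustrative, not from this commit).
# Given a KubernetesCluster instance `cluster`, this blocks until the
# named resource reports .status.phase == 'Succeeded', i.e. it runs:
#   kubectl wait --for=jsonpath={.status.phase}=Succeeded \
#     --timeout=600s pod/locust-runner --namespace=fib
cluster.WaitForSucceeded('pod/locust-runner', namespace='fib', timeout=600)
```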
88 changes: 88 additions & 0 deletions perfkitbenchmarker/data/container/kubernetes_hpa/fib.yaml.j2
@@ -0,0 +1,88 @@
apiVersion: v1
kind: Namespace
metadata:
  name: fib
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: fib
  namespace: fib
spec:
  selector:
    matchLabels:
      app: "fib"
  template:
    metadata:
      labels:
        app: "fib"
    spec:
      containers:
        - name: "fib"
          image: {{ fib_image }}
          imagePullPolicy: "Always"
          resources:
            requests:
              cpu: "1000m"
              memory: "128Mi"
            limits:
              cpu: "2000m"
              memory: "128Mi"
          ports:
            - containerPort: 5000
              name: "web"
              protocol: "TCP"
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: "fib"
  namespace: "fib"
spec:
  scaleTargetRef:
    apiVersion: "apps/v1"
    kind: "Deployment"
    name: "fib"
  minReplicas: 5
  maxReplicas: 250
  metrics:
    - type: "Resource"
      resource:
        name: "cpu"
        target:
          type: "Utilization"
          averageUtilization: 70
  behavior:
    scaleDown:
      stabilizationWindowSeconds: 60
      policies:
        - periodSeconds: 15
          type: "Percent"
          value: 100
      selectPolicy: "Min"
    scaleUp:
      stabilizationWindowSeconds: 0
      policies:
        - periodSeconds: 15
          type: "Percent"
          value: 100
        - periodSeconds: 15
          type: "Pods"
          value: 1000
      selectPolicy: "Max"
---
apiVersion: v1
kind: Service
metadata:
  name: "fib"
  namespace: "fib"
spec:
  selector:
    app: "fib"
  type: LoadBalancer
  externalTrafficPolicy: Cluster
  ports:
    - name: "tcp-port"
      protocol: "TCP"
      port: 5000
      targetPort: 5000
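As background on what this manifest drives: the HPA controller's documented core formula is desiredReplicas = ceil(currentReplicas * currentMetric / targetMetric). A minimal sketch with this manifest's 70% CPU target:

```python
import math


def desired_replicas(current: int, utilization: float, target: float = 70.0) -> int:
  """Kubernetes HPA core formula: ceil(current * utilization / target)."""
  return math.ceil(current * utilization / target)


# E.g. the manifest's 5 minReplicas running at 140% average CPU utilization
# would be scaled to ceil(5 * 140 / 70) = 10 replicas, subject to the
# scaleUp policies above (Max of 100%/15s and 1000 pods/15s) and the
# maxReplicas cap of 250.
print(desired_replicas(5, 140.0))  # -> 10
```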
11 changes: 11 additions & 0 deletions perfkitbenchmarker/data/docker/fibonacci/Dockerfile
@@ -0,0 +1,11 @@
# Some combinations of python 3.13/C++17 cause build failures in pandas:
# https://github.com/cython/cython/issues/5790
# Avoid it by just picking 3.12.
FROM --platform=linux/amd64 python:3.12 as build

WORKDIR /
COPY requirements.txt requirements.txt
RUN pip install -r requirements.txt
EXPOSE 5000
COPY . .
ENTRYPOINT [ "./entrypoint.sh" ]
2 changes: 2 additions & 0 deletions perfkitbenchmarker/data/docker/fibonacci/entrypoint.sh
@@ -0,0 +1,2 @@
#!/bin/sh
gunicorn perf_server:app -w 4 --threads 2 --bind 0.0.0.0:5000
38 changes: 38 additions & 0 deletions perfkitbenchmarker/data/docker/fibonacci/perf_server.py
@@ -0,0 +1,38 @@
"""Toy flask app to inefficiently calculate Fibonacci numbers."""

import socket
import time
from flask import Flask

app = Flask(__name__)
hostname = socket.gethostname()


def calculate_fibonacci(n):
"""Returns the nth Fibonacci number (inefficient for the sake of CPU load).
Args:
n: nth Fibonacci number to be calculated.
"""
if n <= 1:
return n
else:
return calculate_fibonacci(n - 1) + calculate_fibonacci(n - 2)


@app.route('/calculate')
def do_calculation():
start_time = time.time()
result = calculate_fibonacci(30) # Adjust the Fibonacci number for load
end_time = time.time()

return [{
'result': result,
'calculation_time': end_time - start_time,
'timestamp': start_time,
'pod_id': hostname,
}]


if __name__ == '__main__':
app.run(debug=True, host='0.0.0.0', port=5000)

Check failure: Code scanning / CodeQL

Flask app is run in debug mode (High): A Flask app appears to be run in debug mode. This may allow an attacker to run arbitrary code through the debugger.
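Note that in the container the app is served by gunicorn via entrypoint.sh, so the `__main__` block (and its `debug=True`) only executes when the script is launched directly. A conventional remediation for the CodeQL finding, not part of this commit, would gate the debugger behind an environment variable:

```python
# Hedged sketch (not in this commit): enable the Werkzeug debugger only
# when explicitly requested, e.g. for local development.
import os

if __name__ == '__main__':
  app.run(debug=os.environ.get('FLASK_DEBUG') == '1',
          host='0.0.0.0', port=5000)
```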
16 changes: 16 additions & 0 deletions perfkitbenchmarker/data/docker/fibonacci/requirements.txt
@@ -0,0 +1,16 @@
blinker==1.7.0
click==8.1.7
Flask==3.0.2
gunicorn==21.2.0
itsdangerous==2.1.2
Jinja2==3.1.3
MarkupSafe==2.1.5
numpy==1.26.4
packaging==23.2
pandas==2.2.1
python-dateutil==2.9.0.post0
pytz==2024.1
six==1.16.0
tzdata==2024.1
Werkzeug==3.0.1

47 changes: 47 additions & 0 deletions perfkitbenchmarker/data/locust/rampup.py
@@ -0,0 +1,47 @@
"""Locust file to simulate a "stepped" rampup of load."""

import locust


class Rampup(locust.HttpUser):
# Send 1QPS (per user)
wait_time = locust.constant_throughput(1)

@locust.task
def rampup(self):
# Close the connection after each request (or else users won't get load
# balanced to new pods.)
headers = {"Connection": "close"}

self.client.get("/calculate", headers=headers)


class StagesShape(locust.LoadTestShape):
"""Locust LoadTestShape to simulate a "stepped" rampup."""

# pyformat: disable
# pylint: disable=bad-whitespace
_stages = [
{"endtime": 60, "users": 1}, # 1 rps for 1m
{"endtime": 360, "users": 20}, # 20 rps for 5m
{"endtime": 420, "users": 40}, # 40 rps for 1m
{"endtime": 480, "users": 60}, # 60 rps for 1m
{"endtime": 540, "users": 90}, # 90 rps for 1m
{"endtime": 660, "users": 120}, # 120 rps for 2m
{"endtime": 780, "users": 150}, # 150 rps for 2m
{"endtime": 900, "users": 1}, # 1 rps for 2m
# --------------
# Total: 15m
]
# pyformat: enable

def tick(self):
run_time = self.get_run_time()

for stage in self._stages:
if run_time < stage["endtime"]:
user_count = stage["users"]
spawn_rate = 100 # spawn all new users roughly immediately (over 1s)
return (user_count, spawn_rate)

return None
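For reference, locust polls `tick()` roughly once a second with the elapsed run time and applies the returned `(user_count, spawn_rate)`; returning `None` ends the test, here at the 900s mark. A small sketch of how the stage table resolves, stubbing `get_run_time` (which locust normally provides):

```python
# Hedged sketch: resolve the stage table at a given elapsed time.
shape = StagesShape()
shape.get_run_time = lambda: 400   # pretend we are 400s into the run
print(shape.tick())   # -> (40, 100): the '40 rps for 1m' stage is active
shape.get_run_time = lambda: 1000  # past the last stage
print(shape.tick())   # -> None: locust stops the test
```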
15 changes: 15 additions & 0 deletions perfkitbenchmarker/data/locust/simple.py
@@ -0,0 +1,15 @@
"""Locust file to flood the SUT."""

from locust import HttpUser
from locust import task


class Simple(HttpUser):

@task
def simple(self):
# Close the connection after each request (or else users won't get load
# balanced to new pods.)
headers = {"Connection": "close"}

self.client.get("/calculate", headers=headers)
141 changes: 141 additions & 0 deletions perfkitbenchmarker/linux_benchmarks/kubernetes_hpa_benchmark.py
@@ -0,0 +1,141 @@
# Copyright 2019 PerfKitBenchmarker Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Runs a locust-based hpa benchmark on a k8s cluster."""

import functools
from typing import Any, Dict, List

from absl import flags
from perfkitbenchmarker import background_tasks
from perfkitbenchmarker import benchmark_spec as bm_spec
from perfkitbenchmarker import configs
from perfkitbenchmarker import container_service
from perfkitbenchmarker.linux_packages import locust
from perfkitbenchmarker.sample import Sample

FLAGS = flags.FLAGS

flags.DEFINE_string(
    'kubernetes_hpa_runtime_class_name',
    None,
    'A custom runtimeClassName to apply to the pods.',
)

BENCHMARK_NAME = 'kubernetes_hpa'
BENCHMARK_CONFIG = """
kubernetes_hpa:
  description: Benchmarks how quickly hpa reacts to load
  vm_groups:
    default:
      vm_spec: *default_dual_core
      vm_count: 1
  container_specs:
    kubernetes_fib:
      image: fibonacci
  container_registry: {}
  container_cluster:
    cloud: GCP
    type: Kubernetes
    vm_count: 1
    vm_spec: *default_dual_core
    nodepools:
      fibpool:
        vm_count: 3
        vm_spec:
          GCP:
            machine_type: n2-standard-4
          AWS:
            machine_type: m6i.xlarge
          Azure:
            machine_type: Standard_D4s_v5
"""


def GetConfig(user_config: Dict[str, Any]) -> Dict[str, Any]:
  """Load and return benchmark config.

  Args:
    user_config: user supplied configuration (flags and config file)

  Returns:
    loaded benchmark configuration
  """
  config = configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME)

  return config


def _PrepareCluster(benchmark_spec: bm_spec.BenchmarkSpec):
  """Prepares a cluster to run the hpa benchmark."""
  cluster: container_service.KubernetesCluster = (
      benchmark_spec.container_cluster
  )
  fib_image = benchmark_spec.container_specs['kubernetes_fib'].image

  cluster.ApplyManifest(
      'container/kubernetes_hpa/fib.yaml.j2',
      fib_image=fib_image,
      runtime_class_name=FLAGS.kubernetes_hpa_runtime_class_name,
  )

  cluster.WaitForResource('deploy/fib', 'available', namespace='fib')


def _PrepareLocust(benchmark_spec: bm_spec.BenchmarkSpec):
  """Prepares a vm to run locust."""
  vm = benchmark_spec.vms[0]
  locust.Install(vm)
  locust.Prep(vm, locust.Locustfile.RAMPUP)


def Prepare(benchmark_spec: bm_spec.BenchmarkSpec):
  """Install fib workload (and associated hpa) on the K8s Cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
      required to run the benchmark.
  """

  prepare_fns = [
      functools.partial(_PrepareCluster, benchmark_spec),
      functools.partial(_PrepareLocust, benchmark_spec),
  ]

  background_tasks.RunThreaded(lambda f: f(), prepare_fns)


def Run(benchmark_spec: bm_spec.BenchmarkSpec) -> List[Sample]:
  """Runs locust against the fib service and returns its samples."""

  # Get the SUT address
  stdout, _, _ = container_service.RunKubectlCommand([
      'get',
      '-n',
      'fib',
      'svc/fib',
      '-o',
      "jsonpath='{.status.loadBalancer.ingress[0].ip}'",
  ])
  addr = 'http://' + stdout.strip() + ':5000'

  # Run locust against the SUT
  vm = benchmark_spec.vms[0]
  samples = locust.Run(vm, addr)

  return list(samples)


def Cleanup(benchmark_spec):
  """Cleanup."""
  del benchmark_spec
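For context, PKB benchmarks like this one are typically launched with `./pkb.py --benchmarks=kubernetes_hpa`. One portability caveat in `Run`: `.status.loadBalancer.ingress[0].ip` is populated on GCP, but some providers (notably AWS ELBs) report a `hostname` field instead. A hedged sketch of a fallback lookup, not part of this commit (`_GetServiceAddress` is a hypothetical helper):

```python
def _GetServiceAddress() -> str:
  """Sketch: prefer the ingress IP, fall back to hostname (e.g. on EKS)."""
  for field in ('ip', 'hostname'):
    stdout, _, _ = container_service.RunKubectlCommand([
        'get', '-n', 'fib', 'svc/fib', '-o',
        f'jsonpath={{.status.loadBalancer.ingress[0].{field}}}',
    ])
    if stdout.strip():
      return stdout.strip()
  raise ValueError('Service fib has no external address yet.')
```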
