From e68bee1284237db0bdfbe94c1d7f32a72b4980e1 Mon Sep 17 00:00:00 2001 From: Rail Aliiev Date: Wed, 22 Apr 2020 00:04:42 -0400 Subject: [PATCH] Refactor SLO calculations * Simplify the main logic by using absolute replica counts instead of relative. * Rename `get_new_worker_count` to `get_target_replica_count`. * Move `max_replicas` and `min_replicas` a level higher in the config. The limits are generic enough to be handled by the main logic. * Do not call the Kubernetes autoscaling API if no change is needed. Fixes #38 --- config-example.yaml | 4 +- configs/relengworker-nonprod/config.yml | 60 +++---- configs/relengworker-prod/config.yml | 200 ++++++++++++------------ configs/schema.yml | 12 +- src/k8s_autoscale/main.py | 53 +++---- src/k8s_autoscale/slo.py | 32 ++-- tests/test_sla.py | 30 ++-- 7 files changed, 194 insertions(+), 197 deletions(-) diff --git a/config-example.yaml b/config-example.yaml index aff326a..ac7b3ec 100644 --- a/config-example.yaml +++ b/config-example.yaml @@ -8,10 +8,10 @@ worker_type: kube_connfig_context: xx autoscale: algorithm: slo + max_replicas: 20 + min_replicas: 1 args: - max_replicas: 20 avg_task_duration: 60 slo_seconds: 300 # cover 100% of pending capacity_ratio: 1.0 - min_replicas: 1 diff --git a/configs/relengworker-nonprod/config.yml b/configs/relengworker-nonprod/config.yml index 07530e9..e8112f9 100644 --- a/configs/relengworker-nonprod/config.yml +++ b/configs/relengworker-nonprod/config.yml @@ -6,12 +6,12 @@ worker_types: deployment_name: beetmover-dev-relengworker-firefoxci-gecko-1-1 autoscale: algorithm: slo + max_replicas: 20 + min_replicas: 0 args: - max_replicas: 20 avg_task_duration: 120 slo_seconds: 240 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-1-balrog-dev provisioner: scriptworker-k8s @@ -20,12 +20,12 @@ worker_types: deployment_name: balrog-dev-relengworker-firefoxci-gecko-1-1 autoscale: algorithm: slo + max_replicas: 10 + min_replicas: 0 args: - max_replicas: 10 avg_task_duration: 60 
slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-t-signing-dev provisioner: scriptworker-k8s @@ -34,12 +34,12 @@ worker_types: deployment_name: signing-dev-relengworker-firefoxci-gecko-t-1 autoscale: algorithm: slo + max_replicas: 20 + min_replicas: 0 args: - max_replicas: 20 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-1-addon-dev provisioner: scriptworker-k8s @@ -48,12 +48,12 @@ worker_types: deployment_name: addon-dev-relengworker-firefoxci-gecko-1-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-1-bouncer-dev provisioner: scriptworker-k8s @@ -62,12 +62,12 @@ worker_types: deployment_name: bouncer-dev-relengworker-firefoxci-gecko-1-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-1-pushapk-dev provisioner: scriptworker-k8s @@ -76,12 +76,12 @@ worker_types: deployment_name: pushapk-dev-relengworker-firefoxci-gecko-1-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-1-pushflatpak-dev provisioner: scriptworker-k8s @@ -90,12 +90,12 @@ worker_types: deployment_name: pushflatpak-dev-relengworker-firefoxci-gecko-1-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-1-pushsnap-dev provisioner: scriptworker-k8s @@ -104,12 +104,12 @@ worker_types: deployment_name: pushsnap-dev-relengworker-firefoxci-gecko-1-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - 
min_replicas: 0 - worker_type: gecko-1-shipit-dev provisioner: scriptworker-k8s @@ -118,12 +118,12 @@ worker_types: deployment_name: shipit-dev-relengworker-firefoxci-gecko-1-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: mobile-1-shipit-dev provisioner: scriptworker-k8s @@ -132,12 +132,12 @@ worker_types: deployment_name: shipit-dev-relengworker-firefoxci-mobile-1-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-1-tree-dev provisioner: scriptworker-k8s @@ -146,12 +146,12 @@ worker_types: deployment_name: tree-dev-relengworker-firefoxci-gecko-1-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: xpi-t-signing-dev provisioner: scriptworker-k8s @@ -160,12 +160,12 @@ worker_types: deployment_name: signing-dev-relengworker-firefoxci-xpi-t-1 autoscale: algorithm: slo + max_replicas: 10 + min_replicas: 0 args: - max_replicas: 10 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: xpi-1-shipit-dev provisioner: scriptworker-k8s @@ -174,12 +174,12 @@ worker_types: deployment_name: shipit-dev-relengworker-firefoxci-xpi-1-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: mpd001-t-signing-dev provisioner: scriptworker-k8s @@ -188,12 +188,12 @@ worker_types: deployment_name: signing-dev-relengworker-firefoxci-mpd001-t-1 autoscale: algorithm: slo + max_replicas: 10 + min_replicas: 0 args: - max_replicas: 10 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: adhoc-t-signing-dev 
provisioner: scriptworker-k8s @@ -202,9 +202,9 @@ worker_types: deployment_name: signing-dev-relengworker-firefoxci-adhoc-t-1 autoscale: algorithm: slo + max_replicas: 10 + min_replicas: 0 args: - max_replicas: 10 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 diff --git a/configs/relengworker-prod/config.yml b/configs/relengworker-prod/config.yml index 02b9e65..c168bcb 100644 --- a/configs/relengworker-prod/config.yml +++ b/configs/relengworker-prod/config.yml @@ -7,12 +7,12 @@ worker_types: deployment_name: beetmover-prod-relengworker-firefoxci-applicationservices-3-1 autoscale: algorithm: slo + max_replicas: 20 + min_replicas: 0 args: - max_replicas: 20 avg_task_duration: 120 slo_seconds: 240 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-3-beetmover provisioner: scriptworker-k8s @@ -21,12 +21,12 @@ worker_types: deployment_name: beetmover-prod-relengworker-firefoxci-gecko-3-1 autoscale: algorithm: slo + max_replicas: 80 + min_replicas: 0 args: - max_replicas: 80 avg_task_duration: 120 slo_seconds: 240 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: mobile-3-beetmover @@ -36,12 +36,12 @@ worker_types: deployment_name: beetmover-prod-relengworker-firefoxci-mobile-3-1 autoscale: algorithm: slo + max_replicas: 10 + min_replicas: 0 args: - max_replicas: 10 avg_task_duration: 120 slo_seconds: 240 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: comm-1-beetmover provisioner: scriptworker-k8s @@ -50,12 +50,12 @@ worker_types: deployment_name: beetmover-prod-relengworker-firefoxci-comm-1-1 autoscale: algorithm: slo + max_replicas: 5 + min_replicas: 0 args: - max_replicas: 5 avg_task_duration: 120 slo_seconds: 240 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: comm-3-beetmover provisioner: scriptworker-k8s @@ -64,12 +64,12 @@ worker_types: deployment_name: beetmover-prod-relengworker-firefoxci-comm-3-1 autoscale: algorithm: slo + max_replicas: 20 + min_replicas: 0 args: - max_replicas: 20 avg_task_duration: 120 
slo_seconds: 240 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-1-beetmover provisioner: scriptworker-k8s @@ -78,12 +78,12 @@ worker_types: deployment_name: beetmover-prod-relengworker-firefoxci-gecko-1-1 autoscale: algorithm: slo + max_replicas: 20 + min_replicas: 0 args: - max_replicas: 20 avg_task_duration: 120 slo_seconds: 240 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-3-balrog provisioner: scriptworker-k8s @@ -92,12 +92,12 @@ worker_types: deployment_name: balrog-prod-relengworker-firefoxci-gecko-3-1 autoscale: algorithm: slo + max_replicas: 15 + min_replicas: 0 args: - max_replicas: 15 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-1-balrog provisioner: scriptworker-k8s @@ -106,12 +106,12 @@ worker_types: deployment_name: balrog-prod-relengworker-firefoxci-gecko-1-1 autoscale: algorithm: slo + max_replicas: 10 + min_replicas: 0 args: - max_replicas: 10 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: comm-3-balrog provisioner: scriptworker-k8s @@ -120,12 +120,12 @@ worker_types: deployment_name: balrog-prod-relengworker-firefoxci-comm-3-1 autoscale: algorithm: slo + max_replicas: 10 + min_replicas: 0 args: - max_replicas: 10 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: comm-1-balrog provisioner: scriptworker-k8s @@ -134,12 +134,12 @@ worker_types: deployment_name: balrog-prod-relengworker-firefoxci-comm-1-1 autoscale: algorithm: slo + max_replicas: 10 + min_replicas: 0 args: - max_replicas: 10 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-3-addon provisioner: scriptworker-k8s @@ -148,12 +148,12 @@ worker_types: deployment_name: addon-prod-relengworker-firefoxci-gecko-3-1 autoscale: algorithm: slo + max_replicas: 10 + min_replicas: 0 args: - max_replicas: 10 avg_task_duration: 240 slo_seconds: 480 capacity_ratio: 1.0 - min_replicas: 0 - 
worker_type: gecko-1-addon provisioner: scriptworker-k8s @@ -162,12 +162,12 @@ worker_types: deployment_name: addon-prod-relengworker-firefoxci-gecko-1-1 autoscale: algorithm: slo + max_replicas: 10 + min_replicas: 0 args: - max_replicas: 10 avg_task_duration: 240 slo_seconds: 480 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-t-signing provisioner: scriptworker-k8s @@ -176,12 +176,12 @@ worker_types: deployment_name: signing-prod-relengworker-firefoxci-gecko-t-1 autoscale: algorithm: slo + max_replicas: 20 + min_replicas: 0 args: - max_replicas: 20 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-3-signing provisioner: scriptworker-k8s @@ -190,12 +190,12 @@ worker_types: deployment_name: signing-prod-relengworker-firefoxci-gecko-3-1 autoscale: algorithm: slo + max_replicas: 100 + min_replicas: 0 args: - max_replicas: 100 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: mobile-3-signing provisioner: scriptworker-k8s @@ -204,12 +204,12 @@ worker_types: deployment_name: signing-prod-relengworker-firefoxci-mobile-3-1 autoscale: algorithm: slo + max_replicas: 20 + min_replicas: 0 args: - max_replicas: 20 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: comm-3-signing provisioner: scriptworker-k8s @@ -218,12 +218,12 @@ worker_types: deployment_name: signing-prod-relengworker-firefoxci-comm-3-1 autoscale: algorithm: slo + max_replicas: 20 + min_replicas: 0 args: - max_replicas: 20 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: appservices-1-beetmover provisioner: scriptworker-k8s @@ -232,12 +232,12 @@ worker_types: deployment_name: beetmover-prod-relengworker-firefoxci-applicationservices-1-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: 
mobile-1-beetmover provisioner: scriptworker-k8s @@ -246,12 +246,12 @@ worker_types: deployment_name: beetmover-prod-relengworker-firefoxci-mobile-1-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: comm-1-bouncer provisioner: scriptworker-k8s @@ -260,12 +260,12 @@ worker_types: deployment_name: bouncer-prod-relengworker-firefoxci-comm-1-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-1-bouncer provisioner: scriptworker-k8s @@ -274,12 +274,12 @@ worker_types: deployment_name: bouncer-prod-relengworker-firefoxci-gecko-1-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: comm-3-bouncer provisioner: scriptworker-k8s @@ -288,12 +288,12 @@ worker_types: deployment_name: bouncer-prod-relengworker-firefoxci-comm-3-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-3-bouncer provisioner: scriptworker-k8s @@ -302,12 +302,12 @@ worker_types: deployment_name: bouncer-prod-relengworker-firefoxci-gecko-3-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-1-pushapk provisioner: scriptworker-k8s @@ -316,12 +316,12 @@ worker_types: deployment_name: pushapk-prod-relengworker-firefoxci-gecko-1-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-3-pushapk provisioner: scriptworker-k8s @@ -330,12 
+330,12 @@ worker_types: deployment_name: pushapk-prod-relengworker-firefoxci-gecko-3-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: mobile-1-pushapk provisioner: scriptworker-k8s @@ -344,12 +344,12 @@ worker_types: deployment_name: pushapk-prod-relengworker-firefoxci-mobile-1-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: mobile-3-pushapk provisioner: scriptworker-k8s @@ -358,12 +358,12 @@ worker_types: deployment_name: pushapk-prod-relengworker-firefoxci-mobile-3-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-3-pushflatpak provisioner: scriptworker-k8s @@ -372,12 +372,12 @@ worker_types: deployment_name: pushflatpak-prod-relengworker-firefoxci-gecko-3-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-1-pushflatpak provisioner: scriptworker-k8s @@ -386,12 +386,12 @@ worker_types: deployment_name: pushflatpak-prod-relengworker-firefoxci-gecko-1-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-3-pushsnap provisioner: scriptworker-k8s @@ -400,12 +400,12 @@ worker_types: deployment_name: pushsnap-prod-relengworker-firefoxci-gecko-3-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-1-pushsnap provisioner: scriptworker-k8s @@ -414,12 +414,12 @@ worker_types: 
deployment_name: pushsnap-prod-relengworker-firefoxci-gecko-1-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: comm-1-shipit provisioner: scriptworker-k8s @@ -428,12 +428,12 @@ worker_types: deployment_name: shipit-prod-relengworker-firefoxci-comm-1-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: comm-3-shipit provisioner: scriptworker-k8s @@ -442,12 +442,12 @@ worker_types: deployment_name: shipit-prod-relengworker-firefoxci-comm-3-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-1-shipit provisioner: scriptworker-k8s @@ -456,12 +456,12 @@ worker_types: deployment_name: shipit-prod-relengworker-firefoxci-gecko-1-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-3-shipit provisioner: scriptworker-k8s @@ -470,12 +470,12 @@ worker_types: deployment_name: shipit-prod-relengworker-firefoxci-gecko-3-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: mobile-1-shipit provisioner: scriptworker-k8s @@ -484,12 +484,12 @@ worker_types: deployment_name: shipit-prod-relengworker-firefoxci-mobile-1-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: mobile-3-shipit provisioner: scriptworker-k8s @@ -498,12 +498,12 @@ worker_types: deployment_name: shipit-prod-relengworker-firefoxci-mobile-3-1 
autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: appservices-3-signing provisioner: scriptworker-k8s @@ -512,12 +512,12 @@ worker_types: deployment_name: signing-prod-relengworker-firefoxci-applicationservices-3-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: appservices-t-signing provisioner: scriptworker-k8s @@ -526,12 +526,12 @@ worker_types: deployment_name: signing-prod-relengworker-firefoxci-applicationservices-t-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: comm-t-signing provisioner: scriptworker-k8s @@ -540,12 +540,12 @@ worker_types: deployment_name: signing-prod-relengworker-firefoxci-comm-t-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: mobile-t-signing provisioner: scriptworker-k8s @@ -554,12 +554,12 @@ worker_types: deployment_name: signing-prod-relengworker-firefoxci-mobile-t-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: comm-1-tree provisioner: scriptworker-k8s @@ -568,12 +568,12 @@ worker_types: deployment_name: tree-prod-relengworker-firefoxci-comm-1-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: comm-3-tree provisioner: scriptworker-k8s @@ -582,12 +582,12 @@ worker_types: deployment_name: tree-prod-relengworker-firefoxci-comm-3-1 autoscale: algorithm: slo + 
max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-1-tree provisioner: scriptworker-k8s @@ -596,12 +596,12 @@ worker_types: deployment_name: tree-prod-relengworker-firefoxci-gecko-1-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: gecko-3-tree provisioner: scriptworker-k8s @@ -610,12 +610,12 @@ worker_types: deployment_name: tree-prod-relengworker-firefoxci-gecko-3-1 autoscale: algorithm: slo + max_replicas: 3 + min_replicas: 0 args: - max_replicas: 3 avg_task_duration: 300 slo_seconds: 600 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: xpi-t-signing provisioner: scriptworker-k8s @@ -624,12 +624,12 @@ worker_types: deployment_name: signing-prod-relengworker-firefoxci-xpi-t-1 autoscale: algorithm: slo + max_replicas: 10 + min_replicas: 0 args: - max_replicas: 10 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: xpi-3-signing provisioner: scriptworker-k8s @@ -638,12 +638,12 @@ worker_types: deployment_name: signing-prod-relengworker-firefoxci-xpi-3-1 autoscale: algorithm: slo + max_replicas: 10 + min_replicas: 0 args: - max_replicas: 10 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: xpi-1-shipit provisioner: scriptworker-k8s @@ -652,12 +652,12 @@ worker_types: deployment_name: shipit-prod-relengworker-firefoxci-xpi-1-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: xpi-3-shipit provisioner: scriptworker-k8s @@ -666,12 +666,12 @@ worker_types: deployment_name: shipit-prod-relengworker-firefoxci-xpi-3-1 autoscale: algorithm: slo + max_replicas: 1 + min_replicas: 0 args: - max_replicas: 1 avg_task_duration: 60 
slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: mpd001-t-signing provisioner: scriptworker-k8s @@ -680,12 +680,12 @@ worker_types: deployment_name: signing-prod-relengworker-firefoxci-mpd001-t-1 autoscale: algorithm: slo + max_replicas: 10 + min_replicas: 0 args: - max_replicas: 10 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 - worker_type: mpd001-3-signing provisioner: scriptworker-k8s @@ -694,9 +694,9 @@ worker_types: deployment_name: signing-prod-relengworker-firefoxci-mpd001-3-1 autoscale: algorithm: slo + max_replicas: 10 + min_replicas: 0 args: - max_replicas: 10 avg_task_duration: 60 slo_seconds: 120 capacity_ratio: 1.0 - min_replicas: 0 diff --git a/configs/schema.yml b/configs/schema.yml index cf89624..22ae890 100644 --- a/configs/schema.yml +++ b/configs/schema.yml @@ -46,11 +46,17 @@ definitions: required: - algorithm - args + - max_replicas + - min_replicas properties: algorithm: type: string enum: - slo + max_replicas: + type: integer + min_replicas: + type: integer args: schema: $ref: "#/definitions/args" @@ -61,17 +67,11 @@ definitions: required: - avg_task_duration - capacity_ratio - - max_replicas - - min_replicas - slo_seconds properties: avs_task_duration: type: integer capacity_ratio: type: number - max_replicas: - type: integer - min_replicas: - type: integer slo_seconds: type: integer diff --git a/src/k8s_autoscale/main.py b/src/k8s_autoscale/main.py index 4ea4ba6..82553e0 100644 --- a/src/k8s_autoscale/main.py +++ b/src/k8s_autoscale/main.py @@ -7,7 +7,7 @@ from taskcluster import Queue from taskcluster.exceptions import TaskclusterRestFailure -from k8s_autoscale.slo import get_new_worker_count +from k8s_autoscale.slo import get_target_replica_count logger = logging.getLogger(__name__) @@ -60,7 +60,7 @@ def get_pending(queue, provisioner, worker_type): def handle_worker_type(cfg): - min_replicas = cfg["autoscale"]["args"]["min_replicas"] + min_replicas = cfg["autoscale"]["min_replicas"] 
log_env = dict( worker_type=cfg["worker_type"], provisioner=cfg["provisioner"], @@ -77,41 +77,28 @@ def handle_worker_type(cfg): ) log_env["running"] = running logger.info("Calculating capacity", extra=log_env) - capacity = cfg["autoscale"]["args"]["max_replicas"] - running - log_env["capacity"] = capacity + max_replicas = cfg["autoscale"]["max_replicas"] + min_replicas = cfg["autoscale"]["min_replicas"] + log_env["max_replicas"] = max_replicas + log_env["min_replicas"] = min_replicas logger.info("Checking pending", extra=log_env) queue = Queue({"rootUrl": cfg["root_url"]}) pending = get_pending(queue, cfg["provisioner"], cfg["worker_type"]) log_env["pending"] = pending - logger.info("Calculated desired replica count", extra=log_env) - desired = get_new_worker_count(pending, running, cfg["autoscale"]["args"]) - log_env["desired"] = desired - if desired == 0: - logger.info("Zero replicas needed", extra=log_env) - if running < min_replicas: - logger.info("Using min_replicas", extra=log_env) - adjust_scale(api, min_replicas, cfg["deployment_namespace"], cfg["deployment_name"]) - return - if desired < 0: - logger.info("Need to remove %s of %s", abs(desired), running, extra=log_env) - target_replicas = running + desired - log_env["target_replicas"] = target_replicas - if target_replicas < 0: - logger.info("Target is negative, setting to zero", extra=log_env) - target_replicas = 0 - log_env["target_replicas"] = target_replicas - if target_replicas < min_replicas: - logger.info("Using min_replicas instead of target", extra=log_env) - target_replicas = min_replicas - log_env["target_replicas"] = target_replicas - adjust_scale(api, target_replicas, cfg["deployment_namespace"], cfg["deployment_name"]) + logger.info("Calculating target replica count", extra=log_env) + target_replicas = get_target_replica_count(pending, running, cfg["autoscale"]["args"]) + target_replicas = max(min(target_replicas, max_replicas), min_replicas) + log_env["target_replicas"] = target_replicas + 
if target_replicas == running: + logger.info("Zero new replicas needed", extra=log_env) else: - adjustment = min([capacity, desired]) - log_env["adjustment"] = adjustment - logger.info("Need to increase capacity from %s running by %s", running, adjustment, extra=log_env) - if capacity <= 0: - logger.info("Maximum capacity reached", extra=log_env) - return - adjust_scale(api, running + adjustment, cfg["deployment_namespace"], cfg["deployment_name"]) + if target_replicas < running: + logger.info(f"Need to remove {running-target_replicas} of {running}", extra=log_env) + else: + logger.info( + f"Need to increase capacity from {running} running by {target_replicas-running}", + extra=log_env, + ) + adjust_scale(api, target_replicas, cfg["deployment_namespace"], cfg["deployment_name"]) logger.info("Done handling worker type", extra=log_env) diff --git a/src/k8s_autoscale/slo.py b/src/k8s_autoscale/slo.py index 13e0abc..5db65b9 100644 --- a/src/k8s_autoscale/slo.py +++ b/src/k8s_autoscale/slo.py @@ -1,23 +1,31 @@ import math -def get_new_worker_count(pending, running, args): +def get_target_replica_count(pending, running, args): # TODO: verify all the args assert args["slo_seconds"] > args["avg_task_duration"] # In case we don't want to cover all the pending tasks pending = int(math.ceil(pending * args["capacity_ratio"])) # Scale down only when we have no pending tasks - if pending == 0: - return -running + if pending < 1: + return 0 + # Assume that all running workers have a task + outstanding = pending + running # How many tasks a replica can process within our tolerance period - new_tasks_per_replica = math.floor(args["slo_seconds"] / args["avg_task_duration"]) + tasks_per_replica = math.floor(args["slo_seconds"] / args["avg_task_duration"]) # how many tasks can be covered by the running replicas, assuming they are # busy and can only take new tasks after they are done with the current one - running_tasks_per_replica = math.floor(args["slo_seconds"] / 
args["avg_task_duration"]) - 1 - running_can_cover = running * running_tasks_per_replica - still_pending = pending - running_can_cover - if still_pending > 0: - new_replicas_needed = math.ceil(still_pending / new_tasks_per_replica) - return min([new_replicas_needed, args["max_replicas"]]) - else: - return 0 + # target_replicas = running + new_replicas_needed + # = running + (still_pending / tasks_per_replica) + # = running + ((pending - running_can_cover) / tasks_per_replica) + # = running + ((pending - (running*running_tasks_per_replica)) / tasks_per_replica) + # = running + ((pending - (running*(tasks_per_replica-1))) / tasks_per_replica) + # = running + ((pending + running - running*tasks_per_replica) / tasks_per_replica) + # = running + ((pending + running) / tasks_per_replica - running) + # = (pending + running) / tasks_per_replica + needed_replicas = math.ceil(outstanding / tasks_per_replica) + # Do not scale down in case we have pending, because some workers will + # receive SIGUSR1 and won't take any tasks after that. 
+ needed_replicas = max(needed_replicas, running) + + return needed_replicas diff --git a/tests/test_sla.py b/tests/test_sla.py index 926871f..9872e4f 100644 --- a/tests/test_sla.py +++ b/tests/test_sla.py @@ -1,10 +1,8 @@ import pytest -from k8s_autoscale.slo import get_new_worker_count +from k8s_autoscale.slo import get_target_replica_count -args = {"max_replicas": 10, "avg_task_duration": 60, "slo_seconds": 300, "capacity_ratio": 1.0} -args_capacity = args.copy() -args_capacity["capacity_ratio"] = 0.5 +args = {"avg_task_duration": 60, "slo_seconds": 300, "capacity_ratio": 1.0} @pytest.mark.parametrize( @@ -12,19 +10,23 @@ [ (0, 0, args, 0), (1, 0, args, 1), - (10000, 0, args, 10), - (0, 10, args, -10), - (10, 20, args, 0), + (100, 0, args, 20), + (0, 1, args, 0), + (0, 10, args, 0), + (0, 100, args, 0), + (10, 20, args, 20), + (20, 20, args, 20), + (80, 20, args, 20), (30, 0, args, 6), - (30, 2, args, 5), - (30, 5, args, 2), - (30, 6, args, 2), - (30, 7, args, 1), - (30, 8, args, 0), + (30, 2, args, 7), + (30, 5, args, 7), + (30, 6, args, 8), + (30, 7, args, 8), + (30, 8, args, 8), ], ) def test_process(pending, running, args, expected): - assert get_new_worker_count(pending, running, args) == expected + assert get_target_replica_count(pending, running, args) == expected @pytest.mark.parametrize( @@ -33,4 +35,4 @@ def test_process(pending, running, args, expected): ) def test_process_raises(pending, running, args, exception_type): with pytest.raises(exception_type): - get_new_worker_count(pending, running, args) + get_target_replica_count(pending, running, args)