From db618e86ff7653364a7c40530581ee46210aeb01 Mon Sep 17 00:00:00 2001 From: Yuri Nikolic Date: Mon, 7 Oct 2024 23:38:54 +0200 Subject: [PATCH 1/8] Jsonnet: Add support to deploy distributors in multi availability zones Signed-off-by: Yuri Nikolic --- CHANGELOG.md | 1 + ...test-multi-zone-distributor-generated.yaml | 1333 +++++++++++++++++ .../test-multi-zone-distributor.jsonnet | 30 + .../mimir/multi-zone-distributor.libsonnet | 172 +++ 4 files changed, 1536 insertions(+) create mode 100644 operations/mimir-tests/test-multi-zone-distributor-generated.yaml create mode 100644 operations/mimir-tests/test-multi-zone-distributor.jsonnet create mode 100644 operations/mimir/multi-zone-distributor.libsonnet diff --git a/CHANGELOG.md b/CHANGELOG.md index 836993c0e65..1fd1f4dbc3c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,6 +37,7 @@ ### Jsonnet +* [FEATURE] Add support to deploy distributors in multi availability zones. * [ENHANCEMENT] Add `ingest_storage_ingester_autoscaling_triggers` option to specify multiple triggers in ScaledObject created for ingest-store ingester autoscaling. #9422 * [ENHANCEMENT] Add `ingest_storage_ingester_autoscaling_scale_up_stabilization_window_seconds` and `ingest_storage_ingester_autoscaling_scale_down_stabilization_window_seconds` config options to make stabilization window for ingester autoscaling when using ingest-storage configurable. #9445 * [ENHANCEMENT] Make label-selector in ReplicaTemplate/ingester-zone-a object configurable when using ingest-storage. #9480 diff --git a/operations/mimir-tests/test-multi-zone-distributor-generated.yaml b/operations/mimir-tests/test-multi-zone-distributor-generated.yaml new file mode 100644 index 00000000000..2dbd47f7924 --- /dev/null +++ b/operations/mimir-tests/test-multi-zone-distributor-generated.yaml @@ -0,0 +1,1333 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: default +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: compactor + name: compactor + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: compactor +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor + name: distributor + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: ingester + name: ingester + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: ingester +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: memcached + name: memcached + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: memcached +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: memcached-frontend + name: memcached-frontend + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: memcached-frontend +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: memcached-index-queries + name: memcached-index-queries + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: memcached-index-queries +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: memcached-metadata + name: memcached-metadata + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: memcached-metadata +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: querier + name: querier + namespace: default 
+spec: + maxUnavailable: 1 + selector: + matchLabels: + name: querier +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: query-frontend + name: query-frontend + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: query-frontend +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: query-scheduler + name: query-scheduler + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: query-scheduler +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: store-gateway + name: store-gateway + namespace: default +spec: + maxUnavailable: 2 + selector: + matchLabels: + name: store-gateway +--- +apiVersion: v1 +data: + overrides.yaml: | + overrides: {} +kind: ConfigMap +metadata: + name: overrides + namespace: default +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: compactor + name: compactor + namespace: default +spec: + clusterIP: None + ports: + - name: compactor-http-metrics + port: 8080 + targetPort: 8080 + - name: compactor-grpc + port: 9095 + targetPort: 9095 + - name: compactor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: compactor +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor + name: distributor + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor +--- +apiVersion: v1 +kind: Service +metadata: + name: gossip-ring + namespace: default +spec: + clusterIP: None + ports: + - appProtocol: tcp + name: gossip-ring + port: 7946 + protocol: TCP + targetPort: 7946 + selector: + gossip_ring_member: "true" +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: ingester + name: ingester + namespace: default +spec: + ports: + - name: ingester-http-metrics + port: 8080 + targetPort: 8080 + - name: ingester-grpc + port: 9095 + targetPort: 9095 + - name: ingester-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: ingester +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: memcached + name: memcached + namespace: default +spec: + clusterIP: None + ports: + - name: memcached-client + port: 11211 + targetPort: 11211 + - name: exporter-http-metrics + port: 9150 + targetPort: 9150 + selector: + name: memcached +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: memcached-frontend + name: memcached-frontend + namespace: default +spec: + clusterIP: None + ports: + - name: memcached-client + port: 11211 + targetPort: 11211 + - name: exporter-http-metrics + port: 9150 + targetPort: 9150 + selector: + name: memcached-frontend +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: memcached-index-queries + name: memcached-index-queries + namespace: default +spec: + clusterIP: None + ports: + - name: memcached-client + port: 11211 + targetPort: 11211 + - name: exporter-http-metrics + port: 9150 + targetPort: 9150 + selector: + name: memcached-index-queries +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: memcached-metadata + name: memcached-metadata + namespace: default +spec: + clusterIP: None + ports: + - name: memcached-client + port: 11211 + targetPort: 11211 + - name: exporter-http-metrics + port: 9150 + targetPort: 9150 + selector: + name: memcached-metadata +--- +apiVersion: v1 +kind: Service +metadata: + 
labels: + name: querier + name: querier + namespace: default +spec: + ports: + - name: querier-http-metrics + port: 8080 + targetPort: 8080 + - name: querier-grpc + port: 9095 + targetPort: 9095 + - name: querier-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: querier +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: query-frontend + name: query-frontend + namespace: default +spec: + ports: + - name: query-frontend-http-metrics + port: 8080 + targetPort: 8080 + - name: query-frontend-grpc + port: 9095 + targetPort: 9095 + selector: + name: query-frontend +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: query-scheduler + name: query-scheduler + namespace: default +spec: + ports: + - name: query-scheduler-http-metrics + port: 8080 + targetPort: 8080 + - name: query-scheduler-grpc + port: 9095 + targetPort: 9095 + selector: + name: query-scheduler +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: query-scheduler + name: query-scheduler-discovery + namespace: default +spec: + clusterIP: None + ports: + - name: query-scheduler-http-metrics + port: 8080 + targetPort: 8080 + - name: query-scheduler-grpc + port: 9095 + targetPort: 9095 + publishNotReadyAddresses: true + selector: + name: query-scheduler +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: store-gateway + name: store-gateway + namespace: default +spec: + ports: + - name: store-gateway-http-metrics + port: 8080 + targetPort: 8080 + - name: store-gateway-grpc + port: 9095 + targetPort: 9095 + - name: store-gateway-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: store-gateway +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor + namespace: default +spec: + minReadySeconds: 10 + replicas: 3 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor + spec: + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent + name: distributor + ports: + - containerPort: 8080 + name: http-metrics 
+ - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: querier + namespace: default +spec: + minReadySeconds: 10 + replicas: 6 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: querier + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: querier + spec: + containers: + - args: + - -blocks-storage.bucket-store.metadata-cache.backend=memcached + - -blocks-storage.bucket-store.metadata-cache.memcached.addresses=dnssrvnoa+memcached-metadata.default.svc.cluster.local.:11211 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-async-concurrency=50 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-item-size=1048576 + - -blocks-storage.bucket-store.sync-dir=/data/tsdb + - -blocks-storage.bucket-store.sync-interval=15m + - -blocks-storage.gcs.bucket-name=blocks-bucket + - -common.storage.backend=gcs + - -distributor.health-check-ingesters=true + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -mem-ballast-size-bytes=268435456 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -querier.frontend-client.grpc-max-send-msg-size=104857600 + - -querier.max-concurrent=8 + - -querier.max-partial-query-length=768h + - -querier.scheduler-address=query-scheduler-discovery.default.svc.cluster.local.:9095 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -store-gateway.sharding-ring.heartbeat-timeout=4m + - -store-gateway.sharding-ring.prefix= + - -store-gateway.sharding-ring.replication-factor=3 + - -store-gateway.sharding-ring.store=memberlist + - -target=querier + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "5" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "5000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent + name: querier + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 24Gi + requests: + cpu: "1" + memory: 12Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 180 + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: querier + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: query-frontend + namespace: default +spec: + minReadySeconds: 10 + replicas: 2 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: 
query-frontend + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + name: query-frontend + spec: + containers: + - args: + - -query-frontend.cache-results=true + - -query-frontend.max-cache-freshness=10m + - -query-frontend.max-total-query-length=12000h + - -query-frontend.query-sharding-target-series-per-shard=2500 + - -query-frontend.results-cache.backend=memcached + - -query-frontend.results-cache.memcached.addresses=dnssrvnoa+memcached-frontend.default.svc.cluster.local.:11211 + - -query-frontend.results-cache.memcached.max-item-size=5242880 + - -query-frontend.results-cache.memcached.timeout=500ms + - -query-frontend.scheduler-address=query-scheduler-discovery.default.svc.cluster.local.:9095 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=30s + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=query-frontend + - -usage-stats.installation-mode=jsonnet + env: + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "5000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent + name: query-frontend + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 1200Mi + requests: + cpu: "2" + memory: 600Mi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 390 + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: query-frontend + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: query-scheduler + namespace: default +spec: + minReadySeconds: 10 + replicas: 2 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: query-scheduler + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + template: + metadata: + labels: + name: query-scheduler + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + name: query-scheduler + topologyKey: kubernetes.io/hostname + containers: + - args: + - -query-scheduler.max-outstanding-requests-per-tenant=100 + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -target=query-scheduler + - -usage-stats.installation-mode=jsonnet + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent + name: query-scheduler + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 2Gi + requests: + cpu: "2" + memory: 1Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 180 + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + labels: + name: compactor + name: compactor + namespace: default +spec: + podManagementPolicy: Parallel + replicas: 1 + selector: + matchLabels: + name: compactor + serviceName: compactor + template: + metadata: + labels: + gossip_ring_member: "true" + name: compactor + spec: + containers: + - args: 
+ - -blocks-storage.gcs.bucket-name=blocks-bucket + - -common.storage.backend=gcs + - -compactor.block-ranges=2h,12h,24h + - -compactor.blocks-retention-period=0 + - -compactor.cleanup-interval=15m + - -compactor.compaction-concurrency=1 + - -compactor.compaction-interval=30m + - -compactor.compactor-tenant-shard-size=1 + - -compactor.data-dir=/data + - -compactor.deletion-delay=2h + - -compactor.first-level-compaction-wait-period=25m + - -compactor.max-closing-blocks-concurrency=2 + - -compactor.max-opening-blocks-concurrency=4 + - -compactor.ring.heartbeat-period=1m + - -compactor.ring.heartbeat-timeout=4m + - -compactor.ring.prefix= + - -compactor.ring.store=memberlist + - -compactor.ring.wait-stability-min-duration=1m + - -compactor.split-and-merge-shards=0 + - -compactor.split-groups=1 + - -compactor.symbols-flushers-concurrency=4 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -target=compactor + - -usage-stats.installation-mode=jsonnet + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent + name: compactor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 6Gi + requests: + cpu: 1 + memory: 6Gi + volumeMounts: + - mountPath: /data + name: compactor-data + - mountPath: /etc/mimir + name: overrides + securityContext: + runAsUser: 0 + terminationGracePeriodSeconds: 900 + volumes: + - configMap: + name: overrides + name: overrides + updateStrategy: + type: RollingUpdate + volumeClaimTemplates: + - apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: compactor-data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 250Gi + storageClassName: standard +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + labels: + name: ingester + name: ingester + namespace: default +spec: + podManagementPolicy: Parallel + replicas: 3 + selector: + matchLabels: + name: ingester + serviceName: ingester + template: + metadata: + labels: + gossip_ring_member: "true" + name: ingester + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + name: ingester + topologyKey: kubernetes.io/hostname + containers: + - args: + - -blocks-storage.gcs.bucket-name=blocks-bucket + - -blocks-storage.tsdb.block-ranges-period=2h + - -blocks-storage.tsdb.dir=/data/tsdb + - -blocks-storage.tsdb.head-compaction-interval=15m + - -blocks-storage.tsdb.ship-interval=1m + - -blocks-storage.tsdb.wal-replay-concurrency=3 + - -common.storage.backend=gcs + - -distributor.health-check-ingesters=true + - -ingester.max-global-metadata-per-metric=10 + - -ingester.max-global-metadata-per-user=30000 + - -ingester.max-global-series-per-user=150000 + - -ingester.ring.heartbeat-period=2m + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.num-tokens=512 + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.tokens-file-path=/data/tokens + - -ingester.ring.unregister-on-shutdown=true + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - 
-runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc-max-concurrent-streams=500 + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -target=ingester + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "9" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent + name: ingester + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 25Gi + requests: + cpu: "4" + memory: 15Gi + volumeMounts: + - mountPath: /data + name: ingester-data + - mountPath: /etc/mimir + name: overrides + securityContext: + runAsUser: 0 + terminationGracePeriodSeconds: 1200 + volumes: + - configMap: + name: overrides + name: overrides + updateStrategy: + type: RollingUpdate + volumeClaimTemplates: + - apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: ingester-data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 100Gi + storageClassName: fast +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: memcached + namespace: default +spec: + replicas: 3 + selector: + matchLabels: + name: memcached + serviceName: memcached + template: + metadata: + labels: + name: memcached + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + name: memcached + topologyKey: kubernetes.io/hostname + containers: + - args: + - -m 6144 + - -I 1m + - -c 16384 + - -v + - --extended=track_sizes + image: memcached:1.6.28-alpine + imagePullPolicy: IfNotPresent + name: memcached + ports: + - containerPort: 11211 + name: client + resources: + limits: + memory: 9Gi + requests: + cpu: 500m + memory: 6552Mi + - args: + - --memcached.address=localhost:11211 + - --web.listen-address=0.0.0.0:9150 + image: prom/memcached-exporter:v0.14.4 + imagePullPolicy: IfNotPresent + name: exporter + ports: + - containerPort: 9150 + name: http-metrics + updateStrategy: + type: RollingUpdate +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: memcached-frontend + namespace: default +spec: + replicas: 3 + selector: + matchLabels: + name: memcached-frontend + serviceName: memcached-frontend + template: + metadata: + labels: + name: memcached-frontend + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + name: memcached-frontend + topologyKey: kubernetes.io/hostname + containers: + - args: + - -m 1024 + - -I 5m + - -c 16384 + - -v + - --extended=track_sizes + image: memcached:1.6.28-alpine + imagePullPolicy: IfNotPresent + name: memcached + ports: + - containerPort: 11211 + name: client + resources: + limits: + memory: 1536Mi + requests: + cpu: 500m + memory: 1176Mi + - args: + - --memcached.address=localhost:11211 + - --web.listen-address=0.0.0.0:9150 + image: prom/memcached-exporter:v0.14.4 + imagePullPolicy: IfNotPresent + name: exporter + ports: + - containerPort: 9150 + name: http-metrics + updateStrategy: + type: RollingUpdate +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: memcached-index-queries + namespace: default +spec: + replicas: 3 + selector: + matchLabels: + name: memcached-index-queries + serviceName: memcached-index-queries + 
template: + metadata: + labels: + name: memcached-index-queries + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + name: memcached-index-queries + topologyKey: kubernetes.io/hostname + containers: + - args: + - -m 1024 + - -I 5m + - -c 16384 + - -v + - --extended=track_sizes + image: memcached:1.6.28-alpine + imagePullPolicy: IfNotPresent + name: memcached + ports: + - containerPort: 11211 + name: client + resources: + limits: + memory: 1536Mi + requests: + cpu: 500m + memory: 1176Mi + - args: + - --memcached.address=localhost:11211 + - --web.listen-address=0.0.0.0:9150 + image: prom/memcached-exporter:v0.14.4 + imagePullPolicy: IfNotPresent + name: exporter + ports: + - containerPort: 9150 + name: http-metrics + updateStrategy: + type: RollingUpdate +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: memcached-metadata + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + name: memcached-metadata + serviceName: memcached-metadata + template: + metadata: + labels: + name: memcached-metadata + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + name: memcached-metadata + topologyKey: kubernetes.io/hostname + containers: + - args: + - -m 512 + - -I 1m + - -c 16384 + - -v + - --extended=track_sizes + image: memcached:1.6.28-alpine + imagePullPolicy: IfNotPresent + name: memcached + ports: + - containerPort: 11211 + name: client + resources: + limits: + memory: 768Mi + requests: + cpu: 500m + memory: 638Mi + - args: + - --memcached.address=localhost:11211 + - --web.listen-address=0.0.0.0:9150 + image: prom/memcached-exporter:v0.14.4 + imagePullPolicy: IfNotPresent + name: exporter + ports: + - containerPort: 9150 + name: http-metrics + updateStrategy: + type: RollingUpdate +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + labels: + name: store-gateway + name: store-gateway + namespace: default +spec: + podManagementPolicy: Parallel + replicas: 3 + selector: + matchLabels: + name: store-gateway + serviceName: store-gateway + template: + metadata: + labels: + gossip_ring_member: "true" + name: store-gateway + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + name: store-gateway + topologyKey: kubernetes.io/hostname + containers: + - args: + - -blocks-storage.bucket-store.chunks-cache.backend=memcached + - -blocks-storage.bucket-store.chunks-cache.memcached.addresses=dnssrvnoa+memcached.default.svc.cluster.local.:11211 + - -blocks-storage.bucket-store.chunks-cache.memcached.max-async-concurrency=50 + - -blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency=100 + - -blocks-storage.bucket-store.chunks-cache.memcached.max-idle-connections=150 + - -blocks-storage.bucket-store.chunks-cache.memcached.max-item-size=1048576 + - -blocks-storage.bucket-store.chunks-cache.memcached.timeout=750ms + - -blocks-storage.bucket-store.index-cache.backend=memcached + - -blocks-storage.bucket-store.index-cache.memcached.addresses=dnssrvnoa+memcached-index-queries.default.svc.cluster.local.:11211 + - -blocks-storage.bucket-store.index-cache.memcached.max-async-concurrency=50 + - -blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency=100 + - -blocks-storage.bucket-store.index-cache.memcached.max-idle-connections=150 + - -blocks-storage.bucket-store.index-cache.memcached.max-item-size=5242880 + - 
-blocks-storage.bucket-store.index-cache.memcached.timeout=750ms + - -blocks-storage.bucket-store.metadata-cache.backend=memcached + - -blocks-storage.bucket-store.metadata-cache.memcached.addresses=dnssrvnoa+memcached-metadata.default.svc.cluster.local.:11211 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-async-concurrency=50 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency=100 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-idle-connections=150 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-item-size=1048576 + - -blocks-storage.bucket-store.sync-dir=/data/tsdb + - -blocks-storage.bucket-store.sync-interval=15m + - -blocks-storage.gcs.bucket-name=blocks-bucket + - -common.storage.backend=gcs + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -store-gateway.sharding-ring.heartbeat-period=1m + - -store-gateway.sharding-ring.heartbeat-timeout=4m + - -store-gateway.sharding-ring.prefix= + - -store-gateway.sharding-ring.replication-factor=3 + - -store-gateway.sharding-ring.store=memberlist + - -store-gateway.sharding-ring.tokens-file-path=/data/tokens + - -store-gateway.sharding-ring.unregister-on-shutdown=false + - -store-gateway.sharding-ring.wait-stability-min-duration=1m + - -target=store-gateway + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "5" + - name: GOMEMLIMIT + value: "12884901888" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent + name: store-gateway + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 18Gi + requests: + cpu: "1" + memory: 12Gi + volumeMounts: + - mountPath: /data + name: store-gateway-data + - mountPath: /etc/mimir + name: overrides + securityContext: + runAsUser: 0 + terminationGracePeriodSeconds: 120 + volumes: + - configMap: + name: overrides + name: overrides + updateStrategy: + type: RollingUpdate + volumeClaimTemplates: + - apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: store-gateway-data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + storageClassName: standard +--- +apiVersion: etcd.database.coreos.com/v1beta2 +kind: EtcdCluster +metadata: + annotations: + etcd.database.coreos.com/scope: clusterwide + name: etcd + namespace: default +spec: + pod: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + etcd_cluster: etcd + topologyKey: kubernetes.io/hostname + annotations: + prometheus.io/port: "2379" + prometheus.io/scrape: "true" + etcdEnv: + - name: ETCD_AUTO_COMPACTION_RETENTION + value: 1h + labels: + name: etcd + resources: + limits: + memory: 512Mi + requests: + cpu: 500m + memory: 512Mi + size: 3 + version: 3.3.13 diff --git a/operations/mimir-tests/test-multi-zone-distributor.jsonnet b/operations/mimir-tests/test-multi-zone-distributor.jsonnet new file mode 100644 index 00000000000..26634758270 --- /dev/null +++ b/operations/mimir-tests/test-multi-zone-distributor.jsonnet @@ -0,0 +1,30 @@ +local 
mimir = import 'mimir/mimir.libsonnet';
+
+mimir {
+  local availabilityZones = ['us-east-2a', 'us-east-2b'],
+
+  _config+:: {
+    namespace: 'default',
+    external_url: 'http://test',
+
+    storage_backend: 'gcs',
+    blocks_storage_bucket_name: 'blocks-bucket',
+
+    multi_zone_distributor_enabled: true,
+    multi_zone_distributor_availability_zones: availabilityZones,
+  },
+
+  distributor_zone_a_args+:: {
+    'ingest-storage.kafka.address': 'warpstream-agent-write-zone-a.%(namespace)s.svc.cluster.local.:9092' % $._config,
+    'ingest-storage.kafka.client-id': $.mimirKafkaClientID($.ingest_storage_distributor_kafka_client_id_settings {
+      warpstream_az: availabilityZones[0],
+    }),
+  },
+
+  distributor_zone_b_args+:: {
+    'ingest-storage.kafka.address': 'warpstream-agent-write-zone-b.%(namespace)s.svc.cluster.local.:9092' % $._config,
+    'ingest-storage.kafka.client-id': $.mimirKafkaClientID($.ingest_storage_distributor_kafka_client_id_settings {
+      warpstream_az: availabilityZones[1],
+    }),
+  },
+}
diff --git a/operations/mimir/multi-zone-distributor.libsonnet b/operations/mimir/multi-zone-distributor.libsonnet
new file mode 100644
index 00000000000..00aeabb59ce
--- /dev/null
+++ b/operations/mimir/multi-zone-distributor.libsonnet
@@ -0,0 +1,172 @@
+// This file contains the experimental configuration to deploy distributors in multi-AZ.
+{
+  _config+:: {
+    multi_zone_distributor_enabled: false,
+    multi_zone_distributor_availability_zones: [],
+    multi_zone_distributor_replicas: std.length($._config.multi_zone_distributor_availability_zones),
+  },
+
+  local container = $.core.v1.container,
+  local deployment = $.apps.v1.deployment,
+  local service = $.core.v1.service,
+
+  local isMultiZoneEnabled = $._config.multi_zone_distributor_enabled,
+  local isZoneAEnabled = isMultiZoneEnabled && std.length($._config.multi_zone_distributor_availability_zones) >= 1,
+  local isZoneBEnabled = isMultiZoneEnabled && std.length($._config.multi_zone_distributor_availability_zones) >= 2,
+  local isZoneCEnabled = isMultiZoneEnabled && std.length($._config.multi_zone_distributor_availability_zones) >= 3,
+
+  local gossipLabel = if !$._config.memberlist_ring_enabled then {} else
+    $.apps.v1.statefulSet.spec.template.metadata.withLabelsMixin({ [$._config.gossip_member_label]: 'true' }),
+
+  distributor_zone_a_args:: $.distributor_args,
+  distributor_zone_b_args:: $.distributor_args,
+  distributor_zone_c_args:: $.distributor_args,
+
+  distributor_zone_a_env_map:: {},
+  distributor_zone_b_env_map:: {},
+  distributor_zone_c_env_map:: {},
+
+  distributor_zone_a_node_affinity_matchers:: $.distributor_node_affinity_matchers + [$.newMimirNodeAffinityMatcherAZ($._config.multi_zone_distributor_availability_zones[0])],
+  distributor_zone_b_node_affinity_matchers:: $.distributor_node_affinity_matchers + [$.newMimirNodeAffinityMatcherAZ($._config.multi_zone_distributor_availability_zones[1])],
+  distributor_zone_c_node_affinity_matchers:: $.distributor_node_affinity_matchers + [$.newMimirNodeAffinityMatcherAZ($._config.multi_zone_distributor_availability_zones[2])],
+
+  distributor_zone_a_container:: if !isZoneAEnabled then null else
+    $.newDistributorZoneContainer('a', $.distributor_zone_a_args, $.distributor_zone_a_env_map),
+
+  distributor_zone_b_container:: if !isZoneBEnabled then null else
+    $.newDistributorZoneContainer('b', $.distributor_zone_b_args, $.distributor_zone_b_env_map),
+
+  distributor_zone_c_container:: if !isZoneCEnabled then null else
+    $.newDistributorZoneContainer('c', $.distributor_zone_c_args, $.distributor_zone_c_env_map),
+
+  distributor_zone_a_deployment: if !isZoneAEnabled then null else
+    $.newDistributorZoneDeployment('a', $.distributor_zone_a_container, $.distributor_zone_a_node_affinity_matchers),
+
+  distributor_zone_b_deployment: if !isZoneBEnabled then null else
+    $.newDistributorZoneDeployment('b', $.distributor_zone_b_container, $.distributor_zone_b_node_affinity_matchers),
+
+  distributor_zone_c_deployment: if !isZoneCEnabled then null else
+    $.newDistributorZoneDeployment('c', $.distributor_zone_c_container, $.distributor_zone_c_node_affinity_matchers),
+
+  distributor_zone_a_service: if !isZoneAEnabled then null else
+    $.util.serviceFor($.distributor_zone_a_deployment, $._config.service_ignored_labels) +
+    service.mixin.spec.withClusterIp('None'),
+
+  distributor_zone_b_service: if !isZoneBEnabled then null else
+    $.util.serviceFor($.distributor_zone_b_deployment, $._config.service_ignored_labels) +
+    service.mixin.spec.withClusterIp('None'),
+
+  distributor_zone_c_service: if !isZoneCEnabled then null else
+    $.util.serviceFor($.distributor_zone_c_deployment, $._config.service_ignored_labels) +
+    service.mixin.spec.withClusterIp('None'),
+
+  distributor_zone_a_pdb: if !isZoneAEnabled then null else
+    $.newMimirPdb('distributor-zone-a'),
+
+  distributor_zone_b_pdb: if !isZoneBEnabled then null else
+    $.newMimirPdb('distributor-zone-b'),
+
+  distributor_zone_c_pdb: if !isZoneCEnabled then null else
+    $.newMimirPdb('distributor-zone-c'),
+
+  distributor_zone_a_scaled_object: if !isZoneAEnabled || !$._config.autoscaling_distributor_enabled then null else
+    $.newDistributorScaledObject('distributor-zone-a', 'distributor-zone-a.*'),
+
+  distributor_zone_b_scaled_object: if !isZoneBEnabled || !$._config.autoscaling_distributor_enabled then null else
+    $.newDistributorScaledObject('distributor-zone-b', 'distributor-zone-b.*'),
+
+  distributor_zone_c_scaled_object: if !isZoneCEnabled || !$._config.autoscaling_distributor_enabled then null else
+    $.newDistributorScaledObject('distributor-zone-c', 'distributor-zone-c.*'),
+
+  newDistributorZoneContainer(zone, args, extraEnvVarMap={})::
+    $.distributor_container +
+    container.withArgs($.util.mapToFlags(args)) +
+    (if std.length(extraEnvVarMap) > 0 then container.withEnvMixin(std.prune(extraEnvVarMap)) else {}),
+
+  newDistributorZoneDeployment(zone, container, nodeAffinityMatchers=[])::
+    local name = 'distributor-zone-%s' % zone;
+
+    $.newDistributorDeployment(name, container, nodeAffinityMatchers) +
+    deployment.mixin.spec.withReplicas(std.ceil($._config.multi_zone_distributor_replicas / std.length($._config.multi_zone_distributor_availability_zones))) +
+    deployment.spec.template.spec.withTolerationsMixin([
+      $.core.v1.toleration.withKey('topology') +
+      $.core.v1.toleration.withOperator('Equal') +
+      $.core.v1.toleration.withValue('multi-az') +
+      $.core.v1.toleration.withEffect('NoSchedule'),
+    ]) +
+    gossipLabel +
+    (if !$._config.autoscaling_distributor_enabled then {} else $.removeReplicasFromSpec),
+
+  newDistributorScaledObject(name, pod_regex)::
+    $.newResourceScaledObject(
+      name=name,
+      container_name='distributor',
+      cpu_requests=$.distributor_container.resources.requests.cpu,
+      memory_requests=$.distributor_container.resources.requests.memory,
+      min_replicas=$._config.autoscaling_distributor_min_replicas,
+      max_replicas=$._config.autoscaling_distributor_max_replicas,
+      cpu_target_utilization=$._config.autoscaling_distributor_cpu_target_utilization,
+      memory_target_utilization=$._config.autoscaling_distributor_memory_target_utilization,
+      with_cortex_prefix=true,
+      with_ready_trigger=true,
+      pod_regex=pod_regex,
+    ) + (
+      {
+        spec+: {
+          advanced: {
+            horizontalPodAutoscalerConfig: {
+              behavior: {
+                scaleUp: {
+                  // When multiple policies are specified, the policy that allows the largest amount of change is
+                  // the one selected by default.
+                  policies: [
+                    {
+                      // Allow scaling up at most 50% of pods every 2m. The 2m period gives new pods enough time
+                      // to start handling load and be counted in the 15m lookback window.
+                      //
+                      // This policy covers the case where we already have a high number of pods running, so adding
+                      // +50% in the span of 2m means adding a significant number of pods.
+                      type: 'Percent',
+                      value: 50,
+                      periodSeconds: 60 * 2,
+                    },
+                    {
+                      // Allow scaling up at most 15 pods every 2m. The 2m period gives new pods enough time
+                      // to start handling load and be counted in the 15m lookback window.
+                      //
+                      // This policy covers the case where we currently have a small number of pods (e.g. < 10) and
+                      // limiting the scaling by percentage may be too slow when scaling up.
+                      type: 'Pods',
+                      value: 15,
+                      periodSeconds: 60 * 2,
+                    },
+                  ],
+                  // After a scale-up we should wait at least 2 minutes to observe the effect.
+                  stabilizationWindowSeconds: 60 * 2,
+                },
+                scaleDown: {
+                  policies: [{
+                    // Allow scaling down at most 10% of pods every 2m.
+                    type: 'Percent',
+                    value: 10,
+                    periodSeconds: 120,
+                  }],
+                  // Reduce the likelihood of flapping replicas. When the metrics indicate that the target should be
+                  // scaled down, HPA looks at previously computed desired states and uses the highest value from the
+                  // last 30m. This window is particularly long for distributors due to their reasonably stable load
+                  // and their long shutdown-delay + grace period.
+                  stabilizationWindowSeconds: 60 * 30,
+                },
+              },
+            },
+          },
+        },
+      }
+    ),
+
+  // Remove the single-zone distributor objects when multi-zone is enabled.
+  distributor_deployment: if isMultiZoneEnabled then null else super.distributor_deployment,
+  distributor_service: if isMultiZoneEnabled then null else super.distributor_service,
+  distributor_pdb: if isMultiZoneEnabled then null else super.distributor_pdb,
+  distributor_scaled_object: if isMultiZoneEnabled then null else super.distributor_scaled_object,
+}

From f1def1f0679b4a5a72462dbac4279a781e55846e Mon Sep 17 00:00:00 2001
From: Yuri Nikolic
Date: Tue, 8 Oct 2024 12:23:46 +0200
Subject: [PATCH 2/8] Fixing review findings

Signed-off-by: Yuri Nikolic
---
 CHANGELOG.md                                   |    2 +-
 ...custom-stabilization-window-generated.yaml  |  362 ++++-
 ...toscaling-multiple-triggers-generated.yaml  |  362 ++++-
 ...age-autoscaling-one-trigger-generated.yaml  |  362 ++++-
 ...st-storage-migration-step-0-generated.yaml  |  355 ++++-
 ...st-storage-migration-step-1-generated.yaml  |  356 ++++-
 ...t-storage-migration-step-10-generated.yaml  |  362 ++++-
 ...t-storage-migration-step-11-generated.yaml  |  362 ++++-
 ...st-storage-migration-step-2-generated.yaml  |  363 ++++-
 ...st-storage-migration-step-3-generated.yaml  |  363 ++++-
 ...st-storage-migration-step-4-generated.yaml  |  362 ++++-
 ...t-storage-migration-step-5a-generated.yaml  |  362 ++++-
 ...t-storage-migration-step-5b-generated.yaml  |  362 ++++-
 ...st-storage-migration-step-6-generated.yaml  |  362 ++++-
 ...st-storage-migration-step-7-generated.yaml  |  362 ++++-
 ...st-storage-migration-step-8-generated.yaml  |  362 ++++-
 ...st-storage-migration-step-9-generated.yaml  |  362 ++++-
 ...torage-migration-step-final-generated.yaml  |  362 ++++-
 ...test-multi-zone-distributor-generated.yaml  | 1333 -----------------
 .../test-multi-zone-distributor.jsonnet        |   30 -
 .../test-multi-zone-generated.yaml             |  354 ++++-
 ...teway-automated-downscaling-generated.yaml  |  354 ++++-
 operations/mimir-tests/test-multi-zone.jsonnet |    8 +
 operations/mimir/autoscaling.libsonnet         |   41 +-
 operations/mimir/common.libsonnet              |    6 +
 operations/mimir/memberlist.libsonnet          |   15 +
 operations/mimir/mimir.libsonnet               |    1 +
 .../mimir/multi-zone-distributor.libsonnet     |   81 +-
 28 files changed, 6731 insertions(+), 1637 deletions(-)
 delete mode 100644 operations/mimir-tests/test-multi-zone-distributor-generated.yaml
 delete mode 100644 operations/mimir-tests/test-multi-zone-distributor.jsonnet

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1fd1f4dbc3c..60b5f3a1aee 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -37,7 +37,7 @@
 
 ### Jsonnet
 
-* [FEATURE] Add support to deploy distributors in multi availability zones.
+* [FEATURE] Add support to deploy distributors in multi availability zones. #9548
 * [ENHANCEMENT] Add `ingest_storage_ingester_autoscaling_triggers` option to specify multiple triggers in ScaledObject created for ingest-store ingester autoscaling. #9422
 * [ENHANCEMENT] Add `ingest_storage_ingester_autoscaling_scale_up_stabilization_window_seconds` and `ingest_storage_ingester_autoscaling_scale_down_stabilization_window_seconds` config options to make stabilization window for ingester autoscaling when using ingest-storage configurable. #9445
 * [ENHANCEMENT] Make label-selector in ReplicaTemplate/ingester-zone-a object configurable when using ingest-storage.
#9480 diff --git a/operations/mimir-tests/test-ingest-storage-autoscaling-custom-stabilization-window-generated.yaml b/operations/mimir-tests/test-ingest-storage-autoscaling-custom-stabilization-window-generated.yaml index e3562950763..618cf612aa1 100644 --- a/operations/mimir-tests/test-ingest-storage-autoscaling-custom-stabilization-window-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-autoscaling-custom-stabilization-window-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -391,8 +404,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: clusterIP: None @@ -407,7 +420,29 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -831,15 +866,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 - replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -848,8 +882,135 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ingestion-tenant-shard-size=3 + - -distributor.remote-timeout=5s + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingest-storage.enabled=true + - -ingest-storage.ingestion-partition-tenant-shard-size=1 + - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 + - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 + - -ingest-storage.kafka.topic=ingest + - -ingester.partition-ring.prefix= + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix=partition-ingesters/ + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - 
-ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -921,10 +1082,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2941,6 +3107,182 @@ spec: --- apiVersion: keda.sh/v1alpha1 kind: ScaledObject +metadata: + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + 
count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-b + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_b_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + 
present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_b_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject metadata: name: ingester-zone-a namespace: default diff --git a/operations/mimir-tests/test-ingest-storage-autoscaling-multiple-triggers-generated.yaml b/operations/mimir-tests/test-ingest-storage-autoscaling-multiple-triggers-generated.yaml index 3d71ac653bf..fd9b480bd20 100644 --- a/operations/mimir-tests/test-ingest-storage-autoscaling-multiple-triggers-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-autoscaling-multiple-triggers-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -391,8 +404,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: clusterIP: None @@ -407,7 +420,29 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -831,15 +866,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 - replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -848,8 +882,135 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ingestion-tenant-shard-size=3 + - -distributor.remote-timeout=5s + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingest-storage.enabled=true + - -ingest-storage.ingestion-partition-tenant-shard-size=1 + - 
-ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 + - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 + - -ingest-storage.kafka.topic=ingest + - -ingester.partition-ring.prefix= + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix=partition-ingesters/ + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -921,10 +1082,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2941,6 +3107,182 @@ spec: --- apiVersion: keda.sh/v1alpha1 kind: ScaledObject +metadata: + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + 
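+      # Each prometheus trigger polls serverAddress every pollingInterval
+      # (10s); the generated HPA scales the zone toward query value divided
+      # by threshold. Setting ignoreNullValues to "false" turns an empty
+      # query result into a scaler error instead of a silent zero, so a
+      # broken metric cannot collapse the zone to minReplicaCount.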
ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-b + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_b_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) 
(max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_b_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject metadata: name: ingester-zone-a namespace: default diff --git a/operations/mimir-tests/test-ingest-storage-autoscaling-one-trigger-generated.yaml b/operations/mimir-tests/test-ingest-storage-autoscaling-one-trigger-generated.yaml index bb11bd645a1..68e194b235f 100644 --- a/operations/mimir-tests/test-ingest-storage-autoscaling-one-trigger-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-autoscaling-one-trigger-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -391,8 +404,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: clusterIP: None @@ -407,7 +420,29 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -831,15 +866,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 - replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -848,8 +882,135 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + 
- -distributor.ingestion-rate-limit=10000 + - -distributor.ingestion-tenant-shard-size=3 + - -distributor.remote-timeout=5s + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingest-storage.enabled=true + - -ingest-storage.ingestion-partition-tenant-shard-size=1 + - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 + - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 + - -ingest-storage.kafka.topic=ingest + - -ingester.partition-ring.prefix= + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix=partition-ingesters/ + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -921,10 +1082,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2941,6 +3107,182 @@ spec: --- apiVersion: keda.sh/v1alpha1 kind: ScaledObject +metadata: + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - 
periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-b + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_b_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_memory_hpa_default 
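+      # The memory metric is the 95th percentile over 15m of working-set
+      # bytes summed across ready distributor pods, plus the memory requests
+      # of pods that restarted after an OOM kill in the last 15m, so
+      # scale-up pressure survives a crashing pod. The closing
+      # "and count(...) >= 15" clause only lets the metric through once 15
+      # consecutive 1m samples exist, preventing scaling on partial data.
+      # The threshold 2147483648 bytes is 2GiB, matching the per-pod
+      # memory request.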
+ query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_b_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject metadata: name: ingester-zone-a namespace: default diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-0-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-0-generated.yaml index b86c1fd5032..503937febc2 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-0-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-0-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -332,8 +345,30 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b namespace: default spec: clusterIP: None @@ -348,7 +383,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -772,15 +807,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 - replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -789,8 +823,128 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - 
-distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ingestion-tenant-shard-size=3 + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -855,10 +1009,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2775,3 +2934,179 @@ spec: memory: 512Mi size: 3 version: 3.3.13 +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + 
type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-b + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_b_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_memory_hpa_default + query: | + 
quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_b_memory_hpa_default + type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-1-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-1-generated.yaml index 705760da7a2..b0f5b79d4ad 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-1-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-1-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -332,8 +345,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: clusterIP: None @@ -348,7 +361,29 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -841,15 +876,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 - replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -858,8 +892,129 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - 
-distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ingestion-tenant-shard-size=3 + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingest-storage.ingestion-partition-tenant-shard-size=1 + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -925,10 +1080,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3277,3 +3437,179 @@ spec: memory: 512Mi size: 3 version: 3.3.13 +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: 
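+          # When several scale-up policies are set, the HPA applies the one
+          # permitting the larger change (selectPolicy defaults to Max):
+          # per 120s window, up to 50% more pods or 15 pods, whichever is
+          # greater. Scale-down above is capped at 10% per 120s behind a
+          # 1800s stabilization window, so zones shrink slowly.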
+ - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-b + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_b_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) 
(container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_b_memory_hpa_default + type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-10-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-10-generated.yaml index 5493a610ba9..df2305fc88d 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-10-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-10-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -391,8 +404,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: clusterIP: None @@ -407,7 +420,29 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -808,15 +843,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 - replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -825,8 +859,135 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - 
-distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ingestion-tenant-shard-size=3 + - -distributor.remote-timeout=5s + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingest-storage.enabled=true + - -ingest-storage.ingestion-partition-tenant-shard-size=1 + - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 + - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 + - -ingest-storage.kafka.topic=ingest + - -ingester.partition-ring.prefix= + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix=partition-ingesters/ + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -898,10 +1059,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2774,6 +2940,182 @@ spec: --- apiVersion: keda.sh/v1alpha1 kind: ScaledObject +metadata: + name: distributor-zone-a 
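+  # One ScaledObject per zone replaces the fixed "replicas: 3" removed from
+  # the Deployments above: KEDA now owns each zone's replica count
+  # independently, between minReplicaCount 3 and maxReplicaCount 30.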
+ namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-b + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_b_cpu_hpa_default + 
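+    # The CPU query is expressed in milli-cores (rate * 1000), so the
+    # "2000" threshold equals the 2-CPU request of each distributor pod:
+    # roughly one replica is demanded per 2 cores of measured usage.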
type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_b_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject metadata: name: ingester-zone-a namespace: default diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-11-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-11-generated.yaml index 04b7c5145cf..c8d2a21c763 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-11-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-11-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -391,8 +404,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: clusterIP: None @@ -407,7 +420,29 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -808,15 +843,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 - replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -825,8 +859,135 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + 
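+              # Zone pinning: each Deployment schedules only onto nodes in
+              # its own topology.kubernetes.io/zone, tolerates the
+              # topology=multi-az NoSchedule taint used by this test, and
+              # the hostname topologySpreadConstraint below spreads its
+              # replicas across nodes within the zone.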
values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ingestion-tenant-shard-size=3 + - -distributor.remote-timeout=5s + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingest-storage.enabled=true + - -ingest-storage.ingestion-partition-tenant-shard-size=1 + - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 + - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 + - -ingest-storage.kafka.topic=ingest + - -ingester.partition-ring.prefix= + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix=partition-ingesters/ + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -898,10 +1059,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - 
labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2778,6 +2944,182 @@ spec: --- apiVersion: keda.sh/v1alpha1 kind: ScaledObject +metadata: + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-b + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + 
count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_b_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_b_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject metadata: name: ingester-zone-a namespace: default diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-2-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-2-generated.yaml index f1dd8905fe4..1006420691d 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-2-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-2-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -332,8 +345,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: clusterIP: None @@ -348,7 +361,29 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -841,15 +876,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 - replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ 
-858,8 +892,136 @@ spec:
     metadata:
       labels:
         gossip_ring_member: "true"
+        name: distributor-zone-a
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: topology.kubernetes.io/zone
+                operator: In
+                values:
+                - us-east-2a
+      containers:
+      - args:
+        - -distributor.ha-tracker.enable=true
+        - -distributor.ha-tracker.enable-for-all-users=true
+        - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379
+        - -distributor.ha-tracker.prefix=prom_ha/
+        - -distributor.ha-tracker.store=etcd
+        - -distributor.health-check-ingesters=true
+        - -distributor.ingestion-burst-size=200000
+        - -distributor.ingestion-rate-limit=10000
+        - -distributor.ingestion-tenant-shard-size=3
+        - -distributor.remote-timeout=5s
+        - -distributor.ring.heartbeat-period=1m
+        - -distributor.ring.heartbeat-timeout=4m
+        - -distributor.ring.prefix=
+        - -distributor.ring.store=memberlist
+        - -ingest-storage.enabled=true
+        - -ingest-storage.ingestion-partition-tenant-shard-size=1
+        - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092
+        - -ingest-storage.kafka.auto-create-topic-default-partitions=1000
+        - -ingest-storage.kafka.topic=ingest
+        - -ingest-storage.migration.distributor-send-to-ingesters-enabled=true
+        - -ingester.partition-ring.prefix=
+        - -ingester.ring.heartbeat-timeout=10m
+        - -ingester.ring.prefix=
+        - -ingester.ring.replication-factor=3
+        - -ingester.ring.store=memberlist
+        - -ingester.ring.zone-awareness-enabled=true
+        - -mem-ballast-size-bytes=1073741824
+        - -memberlist.bind-port=7946
+        - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946
+        - -runtime-config.file=/etc/mimir/overrides.yaml
+        - -server.grpc.keepalive.max-connection-age=60s
+        - -server.grpc.keepalive.max-connection-age-grace=5m
+        - -server.grpc.keepalive.max-connection-idle=1m
+        - -server.grpc.keepalive.min-time-between-pings=10s
+        - -server.grpc.keepalive.ping-without-stream-allowed=true
+        - -server.http-listen-port=8080
+        - -shutdown-delay=90s
+        - -target=distributor
+        - -usage-stats.installation-mode=jsonnet
+        env:
+        - name: GOMAXPROCS
+          value: "8"
+        - name: JAEGER_REPORTER_MAX_QUEUE_SIZE
+          value: "1000"
+        image: grafana/mimir:2.13.0
+        imagePullPolicy: IfNotPresent
+        name: distributor
+        ports:
+        - containerPort: 8080
+          name: http-metrics
+        - containerPort: 9095
+          name: grpc
+        - containerPort: 7946
+          name: gossip-ring
+        readinessProbe:
+          httpGet:
+            path: /ready
+            port: 8080
+          initialDelaySeconds: 15
+          timeoutSeconds: 1
+        resources:
+          limits:
+            memory: 4Gi
+          requests:
+            cpu: "2"
+            memory: 2Gi
+        volumeMounts:
+        - mountPath: /etc/mimir
+          name: overrides
+      terminationGracePeriodSeconds: 100
+      tolerations:
+      - effect: NoSchedule
+        key: topology
+        operator: Equal
+        value: multi-az
+      topologySpreadConstraints:
+      - labelSelector:
+          matchLabels:
+            name: distributor-zone-a
+        maxSkew: 1
+        topologyKey: kubernetes.io/hostname
+        whenUnsatisfiable: ScheduleAnyway
+      volumes:
+      - configMap:
+          name: overrides
+        name: overrides
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: distributor-zone-b
+  namespace: default
+spec:
+  minReadySeconds: 10
+  revisionHistoryLimit: 10
+  selector:
+    matchLabels:
+      name: distributor-zone-b
+  strategy:
+    rollingUpdate:
+      maxSurge: 15%
+      maxUnavailable: 0
+  template:
+    metadata:
+      labels:
+        gossip_ring_member: "true"
+        name: distributor-zone-b
     spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: 
topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -932,10 +1094,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3290,3 +3457,179 @@ spec: memory: 512Mi size: 3 version: 3.3.13 +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-b + triggers: + - 
metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_b_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_b_memory_hpa_default + type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-3-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-3-generated.yaml index 8c66ea46ecc..f789b04d721 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-3-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-3-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -332,8 +345,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: clusterIP: None @@ -348,7 +361,29 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -841,15 +876,14 @@ spec: 
apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: distributor
+  name: distributor-zone-a
   namespace: default
 spec:
   minReadySeconds: 10
-  replicas: 3
   revisionHistoryLimit: 10
   selector:
     matchLabels:
-      name: distributor
+      name: distributor-zone-a
   strategy:
     rollingUpdate:
       maxSurge: 15%
@@ -858,8 +892,136 @@ spec:
     metadata:
       labels:
         gossip_ring_member: "true"
+        name: distributor-zone-a
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: topology.kubernetes.io/zone
+                operator: In
+                values:
+                - us-east-2a
+      containers:
+      - args:
+        - -distributor.ha-tracker.enable=true
+        - -distributor.ha-tracker.enable-for-all-users=true
+        - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379
+        - -distributor.ha-tracker.prefix=prom_ha/
+        - -distributor.ha-tracker.store=etcd
+        - -distributor.health-check-ingesters=true
+        - -distributor.ingestion-burst-size=200000
+        - -distributor.ingestion-rate-limit=10000
+        - -distributor.ingestion-tenant-shard-size=3
+        - -distributor.remote-timeout=5s
+        - -distributor.ring.heartbeat-period=1m
+        - -distributor.ring.heartbeat-timeout=4m
+        - -distributor.ring.prefix=
+        - -distributor.ring.store=memberlist
+        - -ingest-storage.enabled=true
+        - -ingest-storage.ingestion-partition-tenant-shard-size=1
+        - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092
+        - -ingest-storage.kafka.auto-create-topic-default-partitions=1000
+        - -ingest-storage.kafka.topic=ingest
+        - -ingest-storage.migration.distributor-send-to-ingesters-enabled=true
+        - -ingester.partition-ring.prefix=
+        - -ingester.ring.heartbeat-timeout=10m
+        - -ingester.ring.prefix=
+        - -ingester.ring.replication-factor=3
+        - -ingester.ring.store=memberlist
+        - -ingester.ring.zone-awareness-enabled=true
+        - -mem-ballast-size-bytes=1073741824
+        - -memberlist.bind-port=7946
+        - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946
+        - -runtime-config.file=/etc/mimir/overrides.yaml
+        - -server.grpc.keepalive.max-connection-age=60s
+        - -server.grpc.keepalive.max-connection-age-grace=5m
+        - -server.grpc.keepalive.max-connection-idle=1m
+        - -server.grpc.keepalive.min-time-between-pings=10s
+        - -server.grpc.keepalive.ping-without-stream-allowed=true
+        - -server.http-listen-port=8080
+        - -shutdown-delay=90s
+        - -target=distributor
+        - -usage-stats.installation-mode=jsonnet
+        env:
+        - name: GOMAXPROCS
+          value: "8"
+        - name: JAEGER_REPORTER_MAX_QUEUE_SIZE
+          value: "1000"
+        image: grafana/mimir:2.13.0
+        imagePullPolicy: IfNotPresent
+        name: distributor
+        ports:
+        - containerPort: 8080
+          name: http-metrics
+        - containerPort: 9095
+          name: grpc
+        - containerPort: 7946
+          name: gossip-ring
+        readinessProbe:
+          httpGet:
+            path: /ready
+            port: 8080
+          initialDelaySeconds: 15
+          timeoutSeconds: 1
+        resources:
+          limits:
+            memory: 4Gi
+          requests:
+            cpu: "2"
+            memory: 2Gi
+        volumeMounts:
+        - mountPath: /etc/mimir
+          name: overrides
+      terminationGracePeriodSeconds: 100
+      tolerations:
+      - effect: NoSchedule
+        key: topology
+        operator: Equal
+        value: multi-az
+      topologySpreadConstraints:
+      - labelSelector:
+          matchLabels:
+            name: distributor-zone-a
+        maxSkew: 1
+        topologyKey: kubernetes.io/hostname
+        whenUnsatisfiable: ScheduleAnyway
+      volumes:
+      - configMap:
+          name: overrides
+        name: overrides
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: distributor-zone-b
+  namespace: default
+spec:
+  minReadySeconds: 10
+  revisionHistoryLimit: 10
+  selector:
+    matchLabels:
+      name: distributor-zone-b
+  strategy:
+    
rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -932,10 +1094,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3312,3 +3479,179 @@ spec: memory: 512Mi size: 3 version: 3.3.13 +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + 
policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-b + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_b_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_b_memory_hpa_default + type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-4-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-4-generated.yaml index fa331a58566..f0c4b1ccaf2 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-4-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-4-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -332,8 +345,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: clusterIP: None @@ -348,7 +361,29 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: 
distributor-http-metrics
+    port: 8080
+    targetPort: 8080
+  - name: distributor-grpc
+    port: 9095
+    targetPort: 9095
+  - name: distributor-gossip-ring
+    port: 7946
+    targetPort: 7946
+  selector:
+    name: distributor-zone-b
 ---
 apiVersion: v1
 kind: Service
@@ -841,15 +876,14 @@ spec:
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: distributor
+  name: distributor-zone-a
   namespace: default
 spec:
   minReadySeconds: 10
-  replicas: 3
   revisionHistoryLimit: 10
   selector:
     matchLabels:
-      name: distributor
+      name: distributor-zone-a
   strategy:
     rollingUpdate:
       maxSurge: 15%
@@ -858,8 +892,135 @@ spec:
     metadata:
       labels:
         gossip_ring_member: "true"
+        name: distributor-zone-a
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: topology.kubernetes.io/zone
+                operator: In
+                values:
+                - us-east-2a
+      containers:
+      - args:
+        - -distributor.ha-tracker.enable=true
+        - -distributor.ha-tracker.enable-for-all-users=true
+        - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379
+        - -distributor.ha-tracker.prefix=prom_ha/
+        - -distributor.ha-tracker.store=etcd
+        - -distributor.health-check-ingesters=true
+        - -distributor.ingestion-burst-size=200000
+        - -distributor.ingestion-rate-limit=10000
+        - -distributor.ingestion-tenant-shard-size=3
+        - -distributor.remote-timeout=5s
+        - -distributor.ring.heartbeat-period=1m
+        - -distributor.ring.heartbeat-timeout=4m
+        - -distributor.ring.prefix=
+        - -distributor.ring.store=memberlist
+        - -ingest-storage.enabled=true
+        - -ingest-storage.ingestion-partition-tenant-shard-size=1
+        - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092
+        - -ingest-storage.kafka.auto-create-topic-default-partitions=1000
+        - -ingest-storage.kafka.topic=ingest
+        - -ingester.partition-ring.prefix=
+        - -ingester.ring.heartbeat-timeout=10m
+        - -ingester.ring.prefix=
+        - -ingester.ring.replication-factor=3
+        - -ingester.ring.store=memberlist
+        - -ingester.ring.zone-awareness-enabled=true
+        - -mem-ballast-size-bytes=1073741824
+        - -memberlist.bind-port=7946
+        - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946
+        - -runtime-config.file=/etc/mimir/overrides.yaml
+        - -server.grpc.keepalive.max-connection-age=60s
+        - -server.grpc.keepalive.max-connection-age-grace=5m
+        - -server.grpc.keepalive.max-connection-idle=1m
+        - -server.grpc.keepalive.min-time-between-pings=10s
+        - -server.grpc.keepalive.ping-without-stream-allowed=true
+        - -server.http-listen-port=8080
+        - -shutdown-delay=90s
+        - -target=distributor
+        - -usage-stats.installation-mode=jsonnet
+        env:
+        - name: GOMAXPROCS
+          value: "8"
+        - name: JAEGER_REPORTER_MAX_QUEUE_SIZE
+          value: "1000"
+        image: grafana/mimir:2.13.0
+        imagePullPolicy: IfNotPresent
+        name: distributor
+        ports:
+        - containerPort: 8080
+          name: http-metrics
+        - containerPort: 9095
+          name: grpc
+        - containerPort: 7946
+          name: gossip-ring
+        readinessProbe:
+          httpGet:
+            path: /ready
+            port: 8080
+          initialDelaySeconds: 15
+          timeoutSeconds: 1
+        resources:
+          limits:
+            memory: 4Gi
+          requests:
+            cpu: "2"
+            memory: 2Gi
+        volumeMounts:
+        - mountPath: /etc/mimir
+          name: overrides
+      terminationGracePeriodSeconds: 100
+      tolerations:
+      - effect: NoSchedule
+        key: topology
+        operator: Equal
+        value: multi-az
+      topologySpreadConstraints:
+      - labelSelector:
+          matchLabels:
+            name: distributor-zone-a
+        maxSkew: 1
+        topologyKey: kubernetes.io/hostname
+        whenUnsatisfiable: ScheduleAnyway
+      volumes:
+      - configMap:
+          name: overrides
+        name: overrides
+---
+apiVersion: 
apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -931,10 +1092,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3310,3 +3476,179 @@ spec: memory: 512Mi size: 3 version: 3.3.13 +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b 
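# How the CPU trigger below scales: KEDA exposes the Prometheus result as an
# AverageValue metric, so desired replicas = ceil(query_result / threshold).
# The query reports the 95th percentile (over 15m) of summed per-pod CPU usage
# in millicores (hence the `* 1000`), counting only pods that were ready in
# the last minute. With threshold "2000" (2 cores, matching the container's
# `cpu: "2"` request), a sustained 16000 millicores across the zone yields
# ceil(16000 / 2000) = 8 replicas, bounded by minReplicaCount 3 and
# maxReplicaCount 30. The trailing `and count(... >= 15)` clause makes the
# whole expression return no data unless a full 15 one-minute samples exist;
# together with `ignoreNullValues: "false"` this appears intended to fail the
# scaler rather than scale on incomplete metrics.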
+ namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-b + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_b_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_b_memory_hpa_default + type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-5a-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-5a-generated.yaml index 9ff16c6f497..5d3caad09f5 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-5a-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-5a-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -332,8 +345,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: clusterIP: None @@ -348,7 +361,29 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: 
distributor-zone-a
+---
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    name: distributor-zone-b
+  name: distributor-zone-b
+  namespace: default
+spec:
+  clusterIP: None
+  ports:
+  - name: distributor-http-metrics
+    port: 8080
+    targetPort: 8080
+  - name: distributor-grpc
+    port: 9095
+    targetPort: 9095
+  - name: distributor-gossip-ring
+    port: 7946
+    targetPort: 7946
+  selector:
+    name: distributor-zone-b
 ---
 apiVersion: v1
 kind: Service
@@ -841,15 +876,14 @@ spec:
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: distributor
+  name: distributor-zone-a
   namespace: default
 spec:
   minReadySeconds: 10
-  replicas: 3
   revisionHistoryLimit: 10
   selector:
     matchLabels:
-      name: distributor
+      name: distributor-zone-a
   strategy:
     rollingUpdate:
       maxSurge: 15%
@@ -858,8 +892,135 @@ spec:
     metadata:
       labels:
         gossip_ring_member: "true"
+        name: distributor-zone-a
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: topology.kubernetes.io/zone
+                operator: In
+                values:
+                - us-east-2a
+      containers:
+      - args:
+        - -distributor.ha-tracker.enable=true
+        - -distributor.ha-tracker.enable-for-all-users=true
+        - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379
+        - -distributor.ha-tracker.prefix=prom_ha/
+        - -distributor.ha-tracker.store=etcd
+        - -distributor.health-check-ingesters=true
+        - -distributor.ingestion-burst-size=200000
+        - -distributor.ingestion-rate-limit=10000
+        - -distributor.ingestion-tenant-shard-size=3
+        - -distributor.remote-timeout=5s
+        - -distributor.ring.heartbeat-period=1m
+        - -distributor.ring.heartbeat-timeout=4m
+        - -distributor.ring.prefix=
+        - -distributor.ring.store=memberlist
+        - -ingest-storage.enabled=true
+        - -ingest-storage.ingestion-partition-tenant-shard-size=1
+        - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092
+        - -ingest-storage.kafka.auto-create-topic-default-partitions=1000
+        - -ingest-storage.kafka.topic=ingest
+        - -ingester.partition-ring.prefix=
+        - -ingester.ring.heartbeat-timeout=10m
+        - -ingester.ring.prefix=
+        - -ingester.ring.replication-factor=3
+        - -ingester.ring.store=memberlist
+        - -ingester.ring.zone-awareness-enabled=true
+        - -mem-ballast-size-bytes=1073741824
+        - -memberlist.bind-port=7946
+        - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946
+        - -runtime-config.file=/etc/mimir/overrides.yaml
+        - -server.grpc.keepalive.max-connection-age=60s
+        - -server.grpc.keepalive.max-connection-age-grace=5m
+        - -server.grpc.keepalive.max-connection-idle=1m
+        - -server.grpc.keepalive.min-time-between-pings=10s
+        - -server.grpc.keepalive.ping-without-stream-allowed=true
+        - -server.http-listen-port=8080
+        - -shutdown-delay=90s
+        - -target=distributor
+        - -usage-stats.installation-mode=jsonnet
+        env:
+        - name: GOMAXPROCS
+          value: "8"
+        - name: JAEGER_REPORTER_MAX_QUEUE_SIZE
+          value: "1000"
+        image: grafana/mimir:2.13.0
+        imagePullPolicy: IfNotPresent
+        name: distributor
+        ports:
+        - containerPort: 8080
+          name: http-metrics
+        - containerPort: 9095
+          name: grpc
+        - containerPort: 7946
+          name: gossip-ring
+        readinessProbe:
+          httpGet:
+            path: /ready
+            port: 8080
+          initialDelaySeconds: 15
+          timeoutSeconds: 1
+        resources:
+          limits:
+            memory: 4Gi
+          requests:
+            cpu: "2"
+            memory: 2Gi
+        volumeMounts:
+        - mountPath: /etc/mimir
+          name: overrides
+      terminationGracePeriodSeconds: 100
+      tolerations:
+      - effect: NoSchedule
+        key: topology
+        operator: Equal
+        value: multi-az
+      topologySpreadConstraints:
+      - labelSelector:
+          matchLabels:
+            
name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -931,10 +1092,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3310,3 +3476,179 @@ spec: memory: 512Mi size: 3 version: 3.3.13 +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus 
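# Memory counterpart of the CPU trigger: threshold "2147483648" is 2 GiB in
# bytes, matching the container's 2Gi memory request. The first sum() takes
# the 95th percentile of the zone's working-set bytes over 15m; the second
# sum() adds back the memory request of any pod that restarted with an
# OOMKilled termination in the window, so pods whose working set resets after
# an OOM kill still appear to push the metric toward scale-up. The
# `or vector(0)` arms keep the expression defined when either half has no
# samples, and the final `and count(... >= 15)` gate is the same
# data-completeness guard used by the CPU trigger.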
+ threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-b + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_b_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_b_memory_hpa_default + type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-5b-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-5b-generated.yaml index 71a1767b8ee..18a27160cfe 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-5b-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-5b-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -332,8 +345,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + 
name: distributor-zone-a
+  name: distributor-zone-a
   namespace: default
 spec:
   clusterIP: None
@@ -348,7 +361,29 @@ spec:
     port: 7946
     targetPort: 7946
   selector:
-    name: distributor
+    name: distributor-zone-a
+---
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    name: distributor-zone-b
+  name: distributor-zone-b
+  namespace: default
+spec:
+  clusterIP: None
+  ports:
+  - name: distributor-http-metrics
+    port: 8080
+    targetPort: 8080
+  - name: distributor-grpc
+    port: 9095
+    targetPort: 9095
+  - name: distributor-gossip-ring
+    port: 7946
+    targetPort: 7946
+  selector:
+    name: distributor-zone-b
 ---
 apiVersion: v1
 kind: Service
@@ -841,15 +876,14 @@ spec:
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: distributor
+  name: distributor-zone-a
   namespace: default
 spec:
   minReadySeconds: 10
-  replicas: 3
   revisionHistoryLimit: 10
   selector:
     matchLabels:
-      name: distributor
+      name: distributor-zone-a
   strategy:
     rollingUpdate:
       maxSurge: 15%
@@ -858,8 +892,135 @@ spec:
     metadata:
       labels:
         gossip_ring_member: "true"
+        name: distributor-zone-a
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: topology.kubernetes.io/zone
+                operator: In
+                values:
+                - us-east-2a
+      containers:
+      - args:
+        - -distributor.ha-tracker.enable=true
+        - -distributor.ha-tracker.enable-for-all-users=true
+        - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379
+        - -distributor.ha-tracker.prefix=prom_ha/
+        - -distributor.ha-tracker.store=etcd
+        - -distributor.health-check-ingesters=true
+        - -distributor.ingestion-burst-size=200000
+        - -distributor.ingestion-rate-limit=10000
+        - -distributor.ingestion-tenant-shard-size=3
+        - -distributor.remote-timeout=5s
+        - -distributor.ring.heartbeat-period=1m
+        - -distributor.ring.heartbeat-timeout=4m
+        - -distributor.ring.prefix=
+        - -distributor.ring.store=memberlist
+        - -ingest-storage.enabled=true
+        - -ingest-storage.ingestion-partition-tenant-shard-size=1
+        - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092
+        - -ingest-storage.kafka.auto-create-topic-default-partitions=1000
+        - -ingest-storage.kafka.topic=ingest
+        - -ingester.partition-ring.prefix=
+        - -ingester.ring.heartbeat-timeout=10m
+        - -ingester.ring.prefix=
+        - -ingester.ring.replication-factor=3
+        - -ingester.ring.store=memberlist
+        - -ingester.ring.zone-awareness-enabled=true
+        - -mem-ballast-size-bytes=1073741824
+        - -memberlist.bind-port=7946
+        - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946
+        - -runtime-config.file=/etc/mimir/overrides.yaml
+        - -server.grpc.keepalive.max-connection-age=60s
+        - -server.grpc.keepalive.max-connection-age-grace=5m
+        - -server.grpc.keepalive.max-connection-idle=1m
+        - -server.grpc.keepalive.min-time-between-pings=10s
+        - -server.grpc.keepalive.ping-without-stream-allowed=true
+        - -server.http-listen-port=8080
+        - -shutdown-delay=90s
+        - -target=distributor
+        - -usage-stats.installation-mode=jsonnet
+        env:
+        - name: GOMAXPROCS
+          value: "8"
+        - name: JAEGER_REPORTER_MAX_QUEUE_SIZE
+          value: "1000"
+        image: grafana/mimir:2.13.0
+        imagePullPolicy: IfNotPresent
+        name: distributor
+        ports:
+        - containerPort: 8080
+          name: http-metrics
+        - containerPort: 9095
+          name: grpc
+        - containerPort: 7946
+          name: gossip-ring
+        readinessProbe:
+          httpGet:
+            path: /ready
+            port: 8080
+          initialDelaySeconds: 15
+          timeoutSeconds: 1
+        resources:
+          limits:
+            memory: 4Gi
+          requests:
+            cpu: "2"
+            memory: 2Gi
+        volumeMounts:
+        - mountPath: /etc/mimir
+          name: overrides
+      
terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -931,10 +1092,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3310,3 +3476,179 @@ spec: memory: 512Mi size: 3 version: 3.3.13 +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + 
present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-b + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_b_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_b_memory_hpa_default + type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-6-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-6-generated.yaml index 533e044db21..de991d69511 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-6-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-6-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + 
name: distributor-zone-b
 ---
 apiVersion: policy/v1
 kind: PodDisruptionBudget
@@ -332,8 +345,30 @@ apiVersion: v1
 kind: Service
 metadata:
   labels:
-    name: distributor
-  name: distributor
+    name: distributor-zone-a
+  name: distributor-zone-a
+  namespace: default
+spec:
+  clusterIP: None
+  ports:
+  - name: distributor-http-metrics
+    port: 8080
+    targetPort: 8080
+  - name: distributor-grpc
+    port: 9095
+    targetPort: 9095
+  - name: distributor-gossip-ring
+    port: 7946
+    targetPort: 7946
+  selector:
+    name: distributor-zone-a
+---
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    name: distributor-zone-b
+  name: distributor-zone-b
   namespace: default
 spec:
   clusterIP: None
@@ -348,7 +383,7 @@ spec:
     port: 7946
     targetPort: 7946
   selector:
-    name: distributor
+    name: distributor-zone-b
 ---
 apiVersion: v1
 kind: Service
@@ -772,15 +807,14 @@ spec:
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: distributor
+  name: distributor-zone-a
   namespace: default
 spec:
   minReadySeconds: 10
-  replicas: 3
   revisionHistoryLimit: 10
   selector:
     matchLabels:
-      name: distributor
+      name: distributor-zone-a
   strategy:
     rollingUpdate:
       maxSurge: 15%
@@ -789,8 +823,135 @@ spec:
     metadata:
       labels:
        gossip_ring_member: "true"
+        name: distributor-zone-a
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: topology.kubernetes.io/zone
+                operator: In
+                values:
+                - us-east-2a
+      containers:
+      - args:
+        - -distributor.ha-tracker.enable=true
+        - -distributor.ha-tracker.enable-for-all-users=true
+        - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379
+        - -distributor.ha-tracker.prefix=prom_ha/
+        - -distributor.ha-tracker.store=etcd
+        - -distributor.health-check-ingesters=true
+        - -distributor.ingestion-burst-size=200000
+        - -distributor.ingestion-rate-limit=10000
+        - -distributor.ingestion-tenant-shard-size=3
+        - -distributor.remote-timeout=5s
+        - -distributor.ring.heartbeat-period=1m
+        - -distributor.ring.heartbeat-timeout=4m
+        - -distributor.ring.prefix=
+        - -distributor.ring.store=memberlist
+        - -ingest-storage.enabled=true
+        - -ingest-storage.ingestion-partition-tenant-shard-size=1
+        - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092
+        - -ingest-storage.kafka.auto-create-topic-default-partitions=1000
+        - -ingest-storage.kafka.topic=ingest
+        - -ingester.partition-ring.prefix=
+        - -ingester.ring.heartbeat-timeout=10m
+        - -ingester.ring.prefix=
+        - -ingester.ring.replication-factor=3
+        - -ingester.ring.store=memberlist
+        - -ingester.ring.zone-awareness-enabled=true
+        - -mem-ballast-size-bytes=1073741824
+        - -memberlist.bind-port=7946
+        - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946
+        - -runtime-config.file=/etc/mimir/overrides.yaml
+        - -server.grpc.keepalive.max-connection-age=60s
+        - -server.grpc.keepalive.max-connection-age-grace=5m
+        - -server.grpc.keepalive.max-connection-idle=1m
+        - -server.grpc.keepalive.min-time-between-pings=10s
+        - -server.grpc.keepalive.ping-without-stream-allowed=true
+        - -server.http-listen-port=8080
+        - -shutdown-delay=90s
+        - -target=distributor
+        - -usage-stats.installation-mode=jsonnet
+        env:
+        - name: GOMAXPROCS
+          value: "8"
+        - name: JAEGER_REPORTER_MAX_QUEUE_SIZE
+          value: "1000"
+        image: grafana/mimir:2.13.0
+        imagePullPolicy: IfNotPresent
+        name: distributor
+        ports:
+        - containerPort: 8080
+          name: http-metrics
+        - containerPort: 9095
+          name: grpc
+        - containerPort: 7946
+          name: gossip-ring
+        readinessProbe:
+          httpGet:
+            path: /ready
+            port: 8080
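# Rollout and shutdown settings shared by both zone Deployments:
# `maxUnavailable: 0` with `maxSurge: 15%` keeps full capacity during
# rollouts, and `terminationGracePeriodSeconds: 100` leaves headroom beyond
# the 90s `-shutdown-delay` that a stopping distributor waits before exiting.
# The `topology=multi-az:NoSchedule` toleration lets these pods land on nodes
# tainted for multi-AZ workloads, and the hostname topologySpreadConstraint
# (whenUnsatisfiable: ScheduleAnyway) spreads each zone's pods across nodes
# on a best-effort basis.
#
# If a single-zone environment needed to opt out, a sketch of the override
# (again with an assumed jsonnet option name, for illustration only):
#
#   mimir {
#     _config+:: {
#       multi_zone_distributor_enabled: false,  // assumed flag
#     },
#   }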
+ initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -862,10 +1023,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2839,3 +3005,179 @@ spec: memory: 512Mi size: 3 version: 3.3.13 +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max 
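+            # This second sum adds back the memory request of any pod whose last
+            # termination in the past 15m was OOMKilled, so the working-set reset
+            # caused by the restart does not hide memory pressure from the autoscaler.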
by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-b + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_b_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_b_memory_hpa_default + type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-7-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-7-generated.yaml index a56bf98d767..a41c63b5f97 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-7-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-7-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: 
policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -332,8 +345,30 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b namespace: default spec: clusterIP: None @@ -348,7 +383,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -772,15 +807,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 - replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -789,8 +823,135 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ingestion-tenant-shard-size=3 + - -distributor.remote-timeout=5s + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingest-storage.enabled=true + - -ingest-storage.ingestion-partition-tenant-shard-size=1 + - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 + - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 + - -ingest-storage.kafka.topic=ingest + - -ingester.partition-ring.prefix= + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix=partition-ingesters/ + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent 
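+        # The container keeps the name "distributor" in every zone, so selectors on
+        # container="distributor" (dashboards and the KEDA queries below) match pods
+        # from all zone deployments unchanged.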
name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -862,10 +1023,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2867,3 +3033,179 @@ spec: memory: 512Mi size: 3 version: 3.3.13 +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) 
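+            # The trigger threshold of 2147483648 bytes (2Gi) equals the per-pod
+            # memory request, so the HPA targets roughly
+            # p95(total working set) / request replicas per zone.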
(max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-b + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_b_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_b_memory_hpa_default + type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-8-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-8-generated.yaml index 3994bf7bb43..253675e25ab 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-8-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-8-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: 
PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -332,8 +345,30 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b namespace: default spec: clusterIP: None @@ -348,7 +383,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -772,15 +807,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 - replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -789,8 +823,135 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ingestion-tenant-shard-size=3 + - -distributor.remote-timeout=5s + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingest-storage.enabled=true + - -ingest-storage.ingestion-partition-tenant-shard-size=1 + - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 + - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 + - -ingest-storage.kafka.topic=ingest + - -ingester.partition-ring.prefix= + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix=partition-ingesters/ + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - 
-server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -862,10 +1023,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2867,3 +3033,179 @@ spec: memory: 512Mi size: 3 version: 3.3.13 +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) 
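+            # p95 over the trailing 15m of the working set summed across ready pods;
+            # the "or vector(0)" keeps this term defined while no pod is ready.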
(container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-b + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_b_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_b_memory_hpa_default + type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-9-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-9-generated.yaml index 
8028222f189..0272084002c 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-9-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-9-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -332,8 +345,30 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b namespace: default spec: clusterIP: None @@ -348,7 +383,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -749,15 +784,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 - replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -766,8 +800,135 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ingestion-tenant-shard-size=3 + - -distributor.remote-timeout=5s + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingest-storage.enabled=true + - -ingest-storage.ingestion-partition-tenant-shard-size=1 + - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 + - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 + - -ingest-storage.kafka.topic=ingest + - -ingester.partition-ring.prefix= + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix=partition-ingesters/ + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s 
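+        # Bounding gRPC connection age forces write clients to reconnect periodically,
+        # which helps spread load onto distributor pods added by the autoscaler.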
+ - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -839,10 +1000,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2704,3 +2870,179 @@ spec: memory: 512Mi size: 3 version: 3.3.13 +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: 
cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-b + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_b_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: 
cortex_distributor_zone_b_memory_hpa_default + type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-final-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-final-generated.yaml index c65472b3c02..0a398f89611 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-final-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-final-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -391,8 +404,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: clusterIP: None @@ -407,7 +420,29 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -831,15 +866,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 - replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -848,8 +882,135 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ingestion-tenant-shard-size=3 + - -distributor.remote-timeout=5s + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingest-storage.enabled=true + - -ingest-storage.ingestion-partition-tenant-shard-size=1 + - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 + - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 + - -ingest-storage.kafka.topic=ingest + - -ingester.partition-ring.prefix= + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix=partition-ingesters/ + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - 
-ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -921,10 +1082,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2941,6 +3107,182 @@ spec: --- apiVersion: keda.sh/v1alpha1 kind: ScaledObject +metadata: + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + 
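+            # The * 1000 above converts cores to millicores to match the 2000m
+            # (2 CPU, i.e. the per-pod request) threshold; the guard below only
+            # yields a value when the CPU series has existed for all 15 of the last
+            # 15 one-minute windows, and with ignoreNullValues: "false" an empty
+            # result fails the trigger instead of scaling on incomplete data.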
count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-b + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_b_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + 
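+            # Each ScaledObject carries both a CPU and a memory trigger; the HPA
+            # computes a desired replica count per trigger and acts on the maximum.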
present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_b_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject metadata: name: ingester-zone-a namespace: default diff --git a/operations/mimir-tests/test-multi-zone-distributor-generated.yaml b/operations/mimir-tests/test-multi-zone-distributor-generated.yaml deleted file mode 100644 index 2dbd47f7924..00000000000 --- a/operations/mimir-tests/test-multi-zone-distributor-generated.yaml +++ /dev/null @@ -1,1333 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: default ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: compactor - name: compactor - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: compactor ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor - name: distributor - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: ingester - name: ingester - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: ingester ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: memcached - name: memcached - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: memcached ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: memcached-frontend - name: memcached-frontend - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: memcached-frontend ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: memcached-index-queries - name: memcached-index-queries - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: memcached-index-queries ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: memcached-metadata - name: memcached-metadata - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: memcached-metadata ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: querier - name: querier - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: querier ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: query-frontend - name: query-frontend - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: query-frontend ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: query-scheduler - name: query-scheduler - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: query-scheduler ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: store-gateway - name: store-gateway - namespace: default -spec: - maxUnavailable: 2 - selector: - matchLabels: - name: store-gateway ---- -apiVersion: v1 -data: - overrides.yaml: | - overrides: {} -kind: ConfigMap -metadata: - name: overrides - namespace: default ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: compactor - name: compactor - namespace: default -spec: - clusterIP: None - ports: - - name: compactor-http-metrics - port: 8080 - targetPort: 8080 - - name: compactor-grpc - port: 9095 - targetPort: 
9095 - - name: compactor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: compactor ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor - name: distributor - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor ---- -apiVersion: v1 -kind: Service -metadata: - name: gossip-ring - namespace: default -spec: - clusterIP: None - ports: - - appProtocol: tcp - name: gossip-ring - port: 7946 - protocol: TCP - targetPort: 7946 - selector: - gossip_ring_member: "true" ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: ingester - name: ingester - namespace: default -spec: - ports: - - name: ingester-http-metrics - port: 8080 - targetPort: 8080 - - name: ingester-grpc - port: 9095 - targetPort: 9095 - - name: ingester-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: ingester ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: memcached - name: memcached - namespace: default -spec: - clusterIP: None - ports: - - name: memcached-client - port: 11211 - targetPort: 11211 - - name: exporter-http-metrics - port: 9150 - targetPort: 9150 - selector: - name: memcached ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: memcached-frontend - name: memcached-frontend - namespace: default -spec: - clusterIP: None - ports: - - name: memcached-client - port: 11211 - targetPort: 11211 - - name: exporter-http-metrics - port: 9150 - targetPort: 9150 - selector: - name: memcached-frontend ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: memcached-index-queries - name: memcached-index-queries - namespace: default -spec: - clusterIP: None - ports: - - name: memcached-client - port: 11211 - targetPort: 11211 - - name: exporter-http-metrics - port: 9150 - targetPort: 9150 - selector: - name: memcached-index-queries ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: memcached-metadata - name: memcached-metadata - namespace: default -spec: - clusterIP: None - ports: - - name: memcached-client - port: 11211 - targetPort: 11211 - - name: exporter-http-metrics - port: 9150 - targetPort: 9150 - selector: - name: memcached-metadata ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: querier - name: querier - namespace: default -spec: - ports: - - name: querier-http-metrics - port: 8080 - targetPort: 8080 - - name: querier-grpc - port: 9095 - targetPort: 9095 - - name: querier-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: querier ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: query-frontend - name: query-frontend - namespace: default -spec: - ports: - - name: query-frontend-http-metrics - port: 8080 - targetPort: 8080 - - name: query-frontend-grpc - port: 9095 - targetPort: 9095 - selector: - name: query-frontend ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: query-scheduler - name: query-scheduler - namespace: default -spec: - ports: - - name: query-scheduler-http-metrics - port: 8080 - targetPort: 8080 - - name: query-scheduler-grpc - port: 9095 - targetPort: 9095 - selector: - name: query-scheduler ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: query-scheduler - name: query-scheduler-discovery - namespace: default -spec: - clusterIP: None - ports: - - name: query-scheduler-http-metrics - port: 8080 - 
targetPort: 8080 - - name: query-scheduler-grpc - port: 9095 - targetPort: 9095 - publishNotReadyAddresses: true - selector: - name: query-scheduler ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: store-gateway - name: store-gateway - namespace: default -spec: - ports: - - name: store-gateway-http-metrics - port: 8080 - targetPort: 8080 - - name: store-gateway-grpc - port: 9095 - targetPort: 9095 - - name: store-gateway-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: store-gateway ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor - namespace: default -spec: - minReadySeconds: 10 - replicas: 3 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor - spec: - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix= - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent - name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: querier - namespace: default -spec: - minReadySeconds: 10 - replicas: 6 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: querier - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: querier - spec: - containers: - - args: - - -blocks-storage.bucket-store.metadata-cache.backend=memcached - - 
-blocks-storage.bucket-store.metadata-cache.memcached.addresses=dnssrvnoa+memcached-metadata.default.svc.cluster.local.:11211 - - -blocks-storage.bucket-store.metadata-cache.memcached.max-async-concurrency=50 - - -blocks-storage.bucket-store.metadata-cache.memcached.max-item-size=1048576 - - -blocks-storage.bucket-store.sync-dir=/data/tsdb - - -blocks-storage.bucket-store.sync-interval=15m - - -blocks-storage.gcs.bucket-name=blocks-bucket - - -common.storage.backend=gcs - - -distributor.health-check-ingesters=true - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix= - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -mem-ballast-size-bytes=268435456 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -querier.frontend-client.grpc-max-send-msg-size=104857600 - - -querier.max-concurrent=8 - - -querier.max-partial-query-length=768h - - -querier.scheduler-address=query-scheduler-discovery.default.svc.cluster.local.:9095 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -store-gateway.sharding-ring.heartbeat-timeout=4m - - -store-gateway.sharding-ring.prefix= - - -store-gateway.sharding-ring.replication-factor=3 - - -store-gateway.sharding-ring.store=memberlist - - -target=querier - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "5" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "5000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent - name: querier - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 24Gi - requests: - cpu: "1" - memory: 12Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 180 - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: querier - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: query-frontend - namespace: default -spec: - minReadySeconds: 10 - replicas: 2 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: query-frontend - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - name: query-frontend - spec: - containers: - - args: - - -query-frontend.cache-results=true - - -query-frontend.max-cache-freshness=10m - - -query-frontend.max-total-query-length=12000h - - -query-frontend.query-sharding-target-series-per-shard=2500 - - -query-frontend.results-cache.backend=memcached - - -query-frontend.results-cache.memcached.addresses=dnssrvnoa+memcached-frontend.default.svc.cluster.local.:11211 - - -query-frontend.results-cache.memcached.max-item-size=5242880 - - -query-frontend.results-cache.memcached.timeout=500ms - - -query-frontend.scheduler-address=query-scheduler-discovery.default.svc.cluster.local.:9095 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=30s - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - 
-target=query-frontend - - -usage-stats.installation-mode=jsonnet - env: - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "5000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent - name: query-frontend - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 1200Mi - requests: - cpu: "2" - memory: 600Mi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 390 - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: query-frontend - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: query-scheduler - namespace: default -spec: - minReadySeconds: 10 - replicas: 2 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: query-scheduler - strategy: - rollingUpdate: - maxSurge: 1 - maxUnavailable: 0 - template: - metadata: - labels: - name: query-scheduler - spec: - affinity: - podAntiAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchLabels: - name: query-scheduler - topologyKey: kubernetes.io/hostname - containers: - - args: - - -query-scheduler.max-outstanding-requests-per-tenant=100 - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -target=query-scheduler - - -usage-stats.installation-mode=jsonnet - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent - name: query-scheduler - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 2Gi - requests: - cpu: "2" - memory: 1Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 180 - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - labels: - name: compactor - name: compactor - namespace: default -spec: - podManagementPolicy: Parallel - replicas: 1 - selector: - matchLabels: - name: compactor - serviceName: compactor - template: - metadata: - labels: - gossip_ring_member: "true" - name: compactor - spec: - containers: - - args: - - -blocks-storage.gcs.bucket-name=blocks-bucket - - -common.storage.backend=gcs - - -compactor.block-ranges=2h,12h,24h - - -compactor.blocks-retention-period=0 - - -compactor.cleanup-interval=15m - - -compactor.compaction-concurrency=1 - - -compactor.compaction-interval=30m - - -compactor.compactor-tenant-shard-size=1 - - -compactor.data-dir=/data - - -compactor.deletion-delay=2h - - -compactor.first-level-compaction-wait-period=25m - - -compactor.max-closing-blocks-concurrency=2 - - -compactor.max-opening-blocks-concurrency=4 - - -compactor.ring.heartbeat-period=1m - - -compactor.ring.heartbeat-timeout=4m - - -compactor.ring.prefix= - - -compactor.ring.store=memberlist - - -compactor.ring.wait-stability-min-duration=1m - - -compactor.split-and-merge-shards=0 - - -compactor.split-groups=1 - - -compactor.symbols-flushers-concurrency=4 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - 
-server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -target=compactor - - -usage-stats.installation-mode=jsonnet - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent - name: compactor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 6Gi - requests: - cpu: 1 - memory: 6Gi - volumeMounts: - - mountPath: /data - name: compactor-data - - mountPath: /etc/mimir - name: overrides - securityContext: - runAsUser: 0 - terminationGracePeriodSeconds: 900 - volumes: - - configMap: - name: overrides - name: overrides - updateStrategy: - type: RollingUpdate - volumeClaimTemplates: - - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: compactor-data - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 250Gi - storageClassName: standard ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - labels: - name: ingester - name: ingester - namespace: default -spec: - podManagementPolicy: Parallel - replicas: 3 - selector: - matchLabels: - name: ingester - serviceName: ingester - template: - metadata: - labels: - gossip_ring_member: "true" - name: ingester - spec: - affinity: - podAntiAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchLabels: - name: ingester - topologyKey: kubernetes.io/hostname - containers: - - args: - - -blocks-storage.gcs.bucket-name=blocks-bucket - - -blocks-storage.tsdb.block-ranges-period=2h - - -blocks-storage.tsdb.dir=/data/tsdb - - -blocks-storage.tsdb.head-compaction-interval=15m - - -blocks-storage.tsdb.ship-interval=1m - - -blocks-storage.tsdb.wal-replay-concurrency=3 - - -common.storage.backend=gcs - - -distributor.health-check-ingesters=true - - -ingester.max-global-metadata-per-metric=10 - - -ingester.max-global-metadata-per-user=30000 - - -ingester.max-global-series-per-user=150000 - - -ingester.ring.heartbeat-period=2m - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.num-tokens=512 - - -ingester.ring.prefix= - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.tokens-file-path=/data/tokens - - -ingester.ring.unregister-on-shutdown=true - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc-max-concurrent-streams=500 - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -target=ingester - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "9" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent - name: ingester - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 25Gi - requests: - cpu: "4" - memory: 15Gi - volumeMounts: - - mountPath: /data - name: ingester-data - - mountPath: /etc/mimir - name: overrides - securityContext: - runAsUser: 0 - terminationGracePeriodSeconds: 1200 - volumes: - - configMap: - name: 
overrides - name: overrides - updateStrategy: - type: RollingUpdate - volumeClaimTemplates: - - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: ingester-data - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 100Gi - storageClassName: fast ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: memcached - namespace: default -spec: - replicas: 3 - selector: - matchLabels: - name: memcached - serviceName: memcached - template: - metadata: - labels: - name: memcached - spec: - affinity: - podAntiAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchLabels: - name: memcached - topologyKey: kubernetes.io/hostname - containers: - - args: - - -m 6144 - - -I 1m - - -c 16384 - - -v - - --extended=track_sizes - image: memcached:1.6.28-alpine - imagePullPolicy: IfNotPresent - name: memcached - ports: - - containerPort: 11211 - name: client - resources: - limits: - memory: 9Gi - requests: - cpu: 500m - memory: 6552Mi - - args: - - --memcached.address=localhost:11211 - - --web.listen-address=0.0.0.0:9150 - image: prom/memcached-exporter:v0.14.4 - imagePullPolicy: IfNotPresent - name: exporter - ports: - - containerPort: 9150 - name: http-metrics - updateStrategy: - type: RollingUpdate ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: memcached-frontend - namespace: default -spec: - replicas: 3 - selector: - matchLabels: - name: memcached-frontend - serviceName: memcached-frontend - template: - metadata: - labels: - name: memcached-frontend - spec: - affinity: - podAntiAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchLabels: - name: memcached-frontend - topologyKey: kubernetes.io/hostname - containers: - - args: - - -m 1024 - - -I 5m - - -c 16384 - - -v - - --extended=track_sizes - image: memcached:1.6.28-alpine - imagePullPolicy: IfNotPresent - name: memcached - ports: - - containerPort: 11211 - name: client - resources: - limits: - memory: 1536Mi - requests: - cpu: 500m - memory: 1176Mi - - args: - - --memcached.address=localhost:11211 - - --web.listen-address=0.0.0.0:9150 - image: prom/memcached-exporter:v0.14.4 - imagePullPolicy: IfNotPresent - name: exporter - ports: - - containerPort: 9150 - name: http-metrics - updateStrategy: - type: RollingUpdate ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: memcached-index-queries - namespace: default -spec: - replicas: 3 - selector: - matchLabels: - name: memcached-index-queries - serviceName: memcached-index-queries - template: - metadata: - labels: - name: memcached-index-queries - spec: - affinity: - podAntiAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchLabels: - name: memcached-index-queries - topologyKey: kubernetes.io/hostname - containers: - - args: - - -m 1024 - - -I 5m - - -c 16384 - - -v - - --extended=track_sizes - image: memcached:1.6.28-alpine - imagePullPolicy: IfNotPresent - name: memcached - ports: - - containerPort: 11211 - name: client - resources: - limits: - memory: 1536Mi - requests: - cpu: 500m - memory: 1176Mi - - args: - - --memcached.address=localhost:11211 - - --web.listen-address=0.0.0.0:9150 - image: prom/memcached-exporter:v0.14.4 - imagePullPolicy: IfNotPresent - name: exporter - ports: - - containerPort: 9150 - name: http-metrics - updateStrategy: - type: RollingUpdate ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: memcached-metadata - namespace: default -spec: - replicas: 1 - selector: - matchLabels: - name: 
memcached-metadata - serviceName: memcached-metadata - template: - metadata: - labels: - name: memcached-metadata - spec: - affinity: - podAntiAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchLabels: - name: memcached-metadata - topologyKey: kubernetes.io/hostname - containers: - - args: - - -m 512 - - -I 1m - - -c 16384 - - -v - - --extended=track_sizes - image: memcached:1.6.28-alpine - imagePullPolicy: IfNotPresent - name: memcached - ports: - - containerPort: 11211 - name: client - resources: - limits: - memory: 768Mi - requests: - cpu: 500m - memory: 638Mi - - args: - - --memcached.address=localhost:11211 - - --web.listen-address=0.0.0.0:9150 - image: prom/memcached-exporter:v0.14.4 - imagePullPolicy: IfNotPresent - name: exporter - ports: - - containerPort: 9150 - name: http-metrics - updateStrategy: - type: RollingUpdate ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - labels: - name: store-gateway - name: store-gateway - namespace: default -spec: - podManagementPolicy: Parallel - replicas: 3 - selector: - matchLabels: - name: store-gateway - serviceName: store-gateway - template: - metadata: - labels: - gossip_ring_member: "true" - name: store-gateway - spec: - affinity: - podAntiAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchLabels: - name: store-gateway - topologyKey: kubernetes.io/hostname - containers: - - args: - - -blocks-storage.bucket-store.chunks-cache.backend=memcached - - -blocks-storage.bucket-store.chunks-cache.memcached.addresses=dnssrvnoa+memcached.default.svc.cluster.local.:11211 - - -blocks-storage.bucket-store.chunks-cache.memcached.max-async-concurrency=50 - - -blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency=100 - - -blocks-storage.bucket-store.chunks-cache.memcached.max-idle-connections=150 - - -blocks-storage.bucket-store.chunks-cache.memcached.max-item-size=1048576 - - -blocks-storage.bucket-store.chunks-cache.memcached.timeout=750ms - - -blocks-storage.bucket-store.index-cache.backend=memcached - - -blocks-storage.bucket-store.index-cache.memcached.addresses=dnssrvnoa+memcached-index-queries.default.svc.cluster.local.:11211 - - -blocks-storage.bucket-store.index-cache.memcached.max-async-concurrency=50 - - -blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency=100 - - -blocks-storage.bucket-store.index-cache.memcached.max-idle-connections=150 - - -blocks-storage.bucket-store.index-cache.memcached.max-item-size=5242880 - - -blocks-storage.bucket-store.index-cache.memcached.timeout=750ms - - -blocks-storage.bucket-store.metadata-cache.backend=memcached - - -blocks-storage.bucket-store.metadata-cache.memcached.addresses=dnssrvnoa+memcached-metadata.default.svc.cluster.local.:11211 - - -blocks-storage.bucket-store.metadata-cache.memcached.max-async-concurrency=50 - - -blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency=100 - - -blocks-storage.bucket-store.metadata-cache.memcached.max-idle-connections=150 - - -blocks-storage.bucket-store.metadata-cache.memcached.max-item-size=1048576 - - -blocks-storage.bucket-store.sync-dir=/data/tsdb - - -blocks-storage.bucket-store.sync-interval=15m - - -blocks-storage.gcs.bucket-name=blocks-bucket - - -common.storage.backend=gcs - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.min-time-between-pings=10s - - 
-server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -store-gateway.sharding-ring.heartbeat-period=1m - - -store-gateway.sharding-ring.heartbeat-timeout=4m - - -store-gateway.sharding-ring.prefix= - - -store-gateway.sharding-ring.replication-factor=3 - - -store-gateway.sharding-ring.store=memberlist - - -store-gateway.sharding-ring.tokens-file-path=/data/tokens - - -store-gateway.sharding-ring.unregister-on-shutdown=false - - -store-gateway.sharding-ring.wait-stability-min-duration=1m - - -target=store-gateway - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "5" - - name: GOMEMLIMIT - value: "12884901888" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent - name: store-gateway - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 18Gi - requests: - cpu: "1" - memory: 12Gi - volumeMounts: - - mountPath: /data - name: store-gateway-data - - mountPath: /etc/mimir - name: overrides - securityContext: - runAsUser: 0 - terminationGracePeriodSeconds: 120 - volumes: - - configMap: - name: overrides - name: overrides - updateStrategy: - type: RollingUpdate - volumeClaimTemplates: - - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: store-gateway-data - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 50Gi - storageClassName: standard ---- -apiVersion: etcd.database.coreos.com/v1beta2 -kind: EtcdCluster -metadata: - annotations: - etcd.database.coreos.com/scope: clusterwide - name: etcd - namespace: default -spec: - pod: - affinity: - podAntiAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchLabels: - etcd_cluster: etcd - topologyKey: kubernetes.io/hostname - annotations: - prometheus.io/port: "2379" - prometheus.io/scrape: "true" - etcdEnv: - - name: ETCD_AUTO_COMPACTION_RETENTION - value: 1h - labels: - name: etcd - resources: - limits: - memory: 512Mi - requests: - cpu: 500m - memory: 512Mi - size: 3 - version: 3.3.13 diff --git a/operations/mimir-tests/test-multi-zone-distributor.jsonnet b/operations/mimir-tests/test-multi-zone-distributor.jsonnet deleted file mode 100644 index 26634758270..00000000000 --- a/operations/mimir-tests/test-multi-zone-distributor.jsonnet +++ /dev/null @@ -1,30 +0,0 @@ -local mimir = import 'mimir/mimir.libsonnet'; - -mimir { - local availabilityZones = ['us-east-2a', 'us-east-2b'], - - _config+:: { - namespace: 'default', - external_url: 'http://test', - - storage_backend: 'gcs', - blocks_storage_bucket_name: 'blocks-bucket', - - multi_zone_distributor_enabled: true, - multi_zone_distributor_availability_zones: availabilityZones, - }, - - distributor_zone_a_args+:: { - 'ingest-storage.kafka.address': 'warpstream-agent-write-zone-a.%(namespace)s.svc.cluster.local.:9092' % $._config, - 'ingest-storage.kafka.client-id': $.mimirKafkaClientID($.ingest_storage_distributor_kafka_client_id_settings { - warpstream_az: availabilityZones[0], - }), - }, - - distributor_zone_b_args+:: { - 'ingest-storage.kafka.address': 'warpstream-agent-write-zone-b.%(namespace)s.svc.cluster.local.:9092' % $._config, - 'ingest-storage.kafka.client-id': $.mimirKafkaClientID($.ingest_storage_distributor_kafka_client_id_settings { - warpstream_az: availabilityZones[1], - 
}), - }, -} diff --git a/operations/mimir-tests/test-multi-zone-generated.yaml b/operations/mimir-tests/test-multi-zone-generated.yaml index 40cba8a70a7..fcddfd953ab 100644 --- a/operations/mimir-tests/test-multi-zone-generated.yaml +++ b/operations/mimir-tests/test-multi-zone-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -293,8 +306,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: clusterIP: None @@ -309,7 +322,29 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -655,15 +690,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 - replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -672,8 +706,127 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - 
-usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -737,10 +890,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2382,3 +2540,179 @@ spec: memory: 512Mi size: 3 version: 3.3.13 +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by 
(pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-b + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_b_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_b_memory_hpa_default + type: prometheus diff --git a/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling-generated.yaml b/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling-generated.yaml index e621e14d280..9d3ea8fb881 100644 --- 
a/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling-generated.yaml +++ b/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -356,8 +369,30 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b namespace: default spec: clusterIP: None @@ -372,7 +407,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -732,15 +767,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 - replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -749,8 +783,127 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: 
JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -814,10 +967,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2512,6 +2670,182 @@ webhooks: sideEffects: NoneOnDryRun timeoutSeconds: 10 --- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) 
(min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-b + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_b_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_b_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_b_memory_hpa_default + type: prometheus +--- apiVersion: admissionregistration.k8s.io/v1 kind: ValidatingWebhookConfiguration metadata: diff --git a/operations/mimir-tests/test-multi-zone.jsonnet b/operations/mimir-tests/test-multi-zone.jsonnet index 0eb300d8c53..55332c87197 100644 --- 
a/operations/mimir-tests/test-multi-zone.jsonnet
+++ b/operations/mimir-tests/test-multi-zone.jsonnet
@@ -19,6 +19,14 @@ mimir {
 
     multi_zone_store_gateway_enabled: true,
     multi_zone_store_gateway_replicas: 3,
+
+    local availabilityZones = ['us-east-2a', 'us-east-2b'],
+    multi_zone_distributor_enabled: true,
+    multi_zone_distributor_availability_zones: availabilityZones,
+
+    autoscaling_distributor_enabled: true,
+    autoscaling_distributor_min_replicas: 3,
+    autoscaling_distributor_max_replicas: 30,
   },
 
   ingester_env_map+:: {
diff --git a/operations/mimir/autoscaling.libsonnet b/operations/mimir/autoscaling.libsonnet
index 7db49edfef0..22193d1083b 100644
--- a/operations/mimir/autoscaling.libsonnet
+++ b/operations/mimir/autoscaling.libsonnet
@@ -569,9 +569,10 @@
       {}
     ),
 
-  distributor_scaled_object: if !$._config.autoscaling_distributor_enabled then null else
+  newDistributorScaledObject(name, pod_regex='')::
     $.newResourceScaledObject(
-      name='distributor',
+      name=name,
+      container_name='distributor',
       cpu_requests=$.distributor_container.resources.requests.cpu,
       memory_requests=$.distributor_container.resources.requests.memory,
       min_replicas=$._config.autoscaling_distributor_min_replicas,
@@ -634,9 +635,43 @@
       }
     ),
 
+  local isMultiZoneEnabled = $._config.multi_zone_distributor_enabled,
+  local isAutoscalingEnabled = $._config.autoscaling_distributor_enabled,
+  local isAutoscalingSingleZoneEnabled = !isMultiZoneEnabled && isAutoscalingEnabled,
+  local isAutoscalingZoneAEnabled = isMultiZoneEnabled && isAutoscalingEnabled && std.length($._config.multi_zone_distributor_availability_zones) >= 1,
+  local isAutoscalingZoneBEnabled = isMultiZoneEnabled && isAutoscalingEnabled && std.length($._config.multi_zone_distributor_availability_zones) >= 2,
+  local isAutoscalingZoneCEnabled = isMultiZoneEnabled && isAutoscalingEnabled && std.length($._config.multi_zone_distributor_availability_zones) >= 3,
+
+  distributor_scaled_object: if !isAutoscalingSingleZoneEnabled then null else
+    $.newDistributorScaledObject('distributor'),
+
   distributor_deployment: overrideSuperIfExists(
     'distributor_deployment',
-    if !$._config.autoscaling_distributor_enabled then {} else $.removeReplicasFromSpec
+    if !isAutoscalingSingleZoneEnabled then {} else $.removeReplicasFromSpec
+  ),
+
+  distributor_zone_a_scaled_object: if !isAutoscalingZoneAEnabled then null else
+    $.newDistributorScaledObject('distributor-zone-a', 'distributor-zone-a.*'),
+
+  distributor_zone_a_deployment: overrideSuperIfExists(
+    'distributor_zone_a_deployment',
+    if !isAutoscalingZoneAEnabled then {} else $.removeReplicasFromSpec
+  ),
+
+  distributor_zone_b_scaled_object: if !isAutoscalingZoneBEnabled then null else
+    $.newDistributorScaledObject('distributor-zone-b', 'distributor-zone-b.*'),
+
+  distributor_zone_b_deployment: overrideSuperIfExists(
+    'distributor_zone_b_deployment',
+    if !isAutoscalingZoneBEnabled then {} else $.removeReplicasFromSpec
+  ),
+
+  distributor_zone_c_scaled_object: if !isAutoscalingZoneCEnabled then null else
+    $.newDistributorScaledObject('distributor-zone-c', 'distributor-zone-c.*'),
+
+  distributor_zone_c_deployment: overrideSuperIfExists(
+    'distributor_zone_c_deployment',
+    if !isAutoscalingZoneCEnabled then {} else $.removeReplicasFromSpec
   ),
 
   ruler_scaled_object: if !$._config.autoscaling_ruler_enabled then null else
     $.newResourceScaledObject(
diff --git a/operations/mimir/common.libsonnet b/operations/mimir/common.libsonnet
index b1c66faa3fd..44c9378155c 100644
--- a/operations/mimir/common.libsonnet
+++ b/operations/mimir/common.libsonnet
@@ -149,6 +149,12 @@
     )
   ),
 
+  newMimirNodeAffinityMatcherAZ(az):: {
+    key: 'topology.kubernetes.io/zone',
+    operator: 'In',
+    values: [az],
+  },
+
   mimirVolumeMounts::
     $.util.volumeMounts(
       [$.util.volumeMountItem(name, $._config.configmaps[name]) for name in std.objectFieldsAll($._config.configmaps)]
diff --git a/operations/mimir/memberlist.libsonnet b/operations/mimir/memberlist.libsonnet
index 3eb9ad306ed..0df440897a2 100644
--- a/operations/mimir/memberlist.libsonnet
+++ b/operations/mimir/memberlist.libsonnet
@@ -111,6 +111,21 @@
     if !$._config.memberlist_ring_enabled then {} else gossipLabel
   ),
 
+  distributor_zone_a_deployment: overrideSuperIfExists(
+    'distributor_zone_a_deployment',
+    if !$._config.memberlist_ring_enabled then {} else gossipLabel
+  ),
+
+  distributor_zone_b_deployment: overrideSuperIfExists(
+    'distributor_zone_b_deployment',
+    if !$._config.memberlist_ring_enabled then {} else gossipLabel
+  ),
+
+  distributor_zone_c_deployment: overrideSuperIfExists(
+    'distributor_zone_c_deployment',
+    if !$._config.memberlist_ring_enabled then {} else gossipLabel
+  ),
+
   ingester_statefulset: overrideSuperIfExists(
     'ingester_statefulset',
     if !$._config.memberlist_ring_enabled then {} else gossipLabel
diff --git a/operations/mimir/mimir.libsonnet b/operations/mimir/mimir.libsonnet
index 9042f40fea9..dd54c4a1549 100644
--- a/operations/mimir/mimir.libsonnet
+++ b/operations/mimir/mimir.libsonnet
@@ -25,6 +25,7 @@
 (import 'shuffle-sharding.libsonnet') +
 (import 'query-sharding.libsonnet') +
 (import 'multi-zone.libsonnet') +
+(import 'multi-zone-distributor.libsonnet') +
 (import 'rollout-operator.libsonnet') +
 (import 'ruler-remote-evaluation.libsonnet') +
 (import 'continuous-test.libsonnet') +
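Taken together, the library changes above wire zone-aware distributors into the top-level mixin. A minimal sketch of how a deployment could enable them, modeled on the test files in this series; the namespace, bucket, zone names, and the per-zone remote-timeout override are illustrative assumptions, not part of the library:

  local mimir = import 'mimir/mimir.libsonnet';

  mimir {
    _config+:: {
      namespace: 'default',
      storage_backend: 'gcs',
      blocks_storage_bucket_name: 'blocks-bucket',

      multi_zone_distributor_enabled: true,
      multi_zone_distributor_availability_zones: ['us-east-2a', 'us-east-2b'],

      // Optional: let KEDA size each zone instead of a fixed `replicas` field.
      autoscaling_distributor_enabled: true,
      autoscaling_distributor_min_replicas: 3,
      autoscaling_distributor_max_replicas: 30,
    },

    // Per-zone flag overrides merge on top of the shared distributor_args,
    // e.g. to lower the write timeout only in zone a (illustrative value).
    distributor_zone_a_args+:: {
      'distributor.remote-timeout': '5s',
    },
  }

With autoscaling enabled, the per-zone Deployments drop their `replicas` field and the generated KEDA ScaledObjects take over sizing, matching the generated manifests shown earlier in this patch.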
diff --git a/operations/mimir/multi-zone-distributor.libsonnet b/operations/mimir/multi-zone-distributor.libsonnet
index 00aeabb59ce..014e0fc767c 100644
--- a/operations/mimir/multi-zone-distributor.libsonnet
+++ b/operations/mimir/multi-zone-distributor.libsonnet
@@ -1,4 +1,3 @@
-// This file contains the experimental configuration to deploy distributors in multi-AZ.
 {
   _config+:: {
     multi_zone_distributor_enabled: false,
@@ -69,15 +68,6 @@
   distributor_zone_c_pdb: if !isZoneCEnabled then null else
     $.newMimirPdb('distributor-zone-c'),
 
-  distributor_zone_a_scaled_object: if !isZoneAEnabled || !$._config.autoscaling_distributor_enabled then null else
-    $.newDistributorScaledObject('distributor-zone-a', 'distributor-zone-a.*'),
-
-  distributor_zone_b_scaled_object: if !isZoneBEnabled || !$._config.autoscaling_distributor_enabled then null else
-    $.newDistributorScaledObject('distributor-zone-b', 'distributor-zone-b.*'),
-
-  distributor_zone_c_scaled_object: if !isZoneCEnabled || !$._config.autoscaling_distributor_enabled then null else
-    $.newDistributorScaledObject('distributor-zone-c', 'distributor-zone-c.*'),
-
   newDistributorZoneContainer(zone, args, extraEnvVarMap={})::
     $.distributor_container +
     container.withArgs($.util.mapToFlags(args)) +
@@ -93,76 +83,7 @@
       $.core.v1.toleration.withOperator('Equal') +
       $.core.v1.toleration.withValue('multi-az') +
       $.core.v1.toleration.withEffect('NoSchedule'),
-    ]) +
-    gossipLabel
-    + (if !$._config.autoscaling_distributor_enabled then {} else $.removeReplicasFromSpec),
-
-  newDistributorScaledObject(name, pod_regex)::
-    $.newResourceScaledObject(
-      name=name,
-      container_name='distributor',
-      cpu_requests=$.distributor_container.resources.requests.cpu,
-      memory_requests=$.distributor_container.resources.requests.memory,
-      min_replicas=$._config.autoscaling_distributor_min_replicas,
-      max_replicas=$._config.autoscaling_distributor_max_replicas,
-      cpu_target_utilization=$._config.autoscaling_distributor_cpu_target_utilization,
-      memory_target_utilization=$._config.autoscaling_distributor_memory_target_utilization,
-      with_cortex_prefix=true,
-      with_ready_trigger=true,
-      pod_regex=pod_regex,
-    ) + (
-      {
-        spec+: {
-          advanced: {
-            horizontalPodAutoscalerConfig: {
-              behavior: {
-                scaleUp: {
-                  // When multiple policies are specified the policy which allows the highest amount of change is the
-                  // policy which is selected by default.
-                  policies: [
-                    {
-                      // Allow to scale up at most 50% of pods every 2m. Every 2min is chosen as enough time for new
-                      // pods to be handling load and counted in the 15min lookback window.
-                      //
-                      // This policy covers the case we already have a high number of pods running and adding +50%
-                      // in the span of 2m means adding a significative number of pods.
-                      type: 'Percent',
-                      value: 50,
-                      periodSeconds: 60 * 2,
-                    },
-                    {
-                      // Allow to scale up at most 50% of pods every 2m. Every 2min is chosen as enough time for new
-                      // pods to be handling load and counted in the 15min lookback window.
-                      //
-                      // This policy covers the case we currently have a small number of pods (e.g. < 10) and limiting
-                      // the scaling by percentage may be too slow when scaling up.
-                      type: 'Pods',
-                      value: 15,
-                      periodSeconds: 60 * 2,
-                    },
-                  ],
-                  // After a scaleup we should wait at least 2 minutes to observe the effect.
-                  stabilizationWindowSeconds: 60 * 2,
-                },
-                scaleDown: {
-                  policies: [{
-                    // Allow to scale down up to 10% of pods every 2m.
-                    type: 'Percent',
-                    value: 10,
-                    periodSeconds: 120,
-                  }],
-                  // Reduce the likelihood of flapping replicas. When the metrics indicate that the target should be scaled
-                  // down, HPA looks into previously computed desired states, and uses the highest value from the last 30m.
-                  // This is particularly high for distributors due to their reasonably stable load and their long
-                  // shutdown-delay + grace period.
-                  stabilizationWindowSeconds: 60 * 30,
-                },
-              },
-            },
-          },
-        },
-      }
-    ),
+    ]),
 
   // Remove single-zone deployment when multi-zone is enabled.
   distributor_deployment: if isMultiZoneEnabled then null else super.distributor_deployment,

From a3e340a4186ec295b4bef55ea84228bf045c18f2 Mon Sep 17 00:00:00 2001
From: Yuri Nikolic
Date: Tue, 8 Oct 2024 12:55:28 +0200
Subject: [PATCH 3/8] Fixing review findings

Signed-off-by: Yuri Nikolic
---
 operations/mimir/multi-zone-distributor.libsonnet | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/operations/mimir/multi-zone-distributor.libsonnet b/operations/mimir/multi-zone-distributor.libsonnet
index 014e0fc767c..903a3af8ec4 100644
--- a/operations/mimir/multi-zone-distributor.libsonnet
+++ b/operations/mimir/multi-zone-distributor.libsonnet
@@ -14,9 +14,6 @@
   local isZoneBEnabled = isMultiZoneEnabled && std.length($._config.multi_zone_distributor_availability_zones) >= 2,
   local isZoneCEnabled = isMultiZoneEnabled && std.length($._config.multi_zone_distributor_availability_zones) >= 3,
 
-  local gossipLabel = if !$._config.memberlist_ring_enabled then {} else
-    $.apps.v1.statefulSet.spec.template.metadata.withLabelsMixin({ [$._config.gossip_member_label]: 'true' }),
-
   distributor_zone_a_args:: $.distributor_args,
   distributor_zone_b_args:: $.distributor_args,
   distributor_zone_c_args:: $.distributor_args,
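For reference, the per-zone gating consolidated into autoscaling.libsonnet reduces to a simple predicate over `_config`. A simplified, self-contained sketch (the helper names here are mine, not the library's; the field names match the patch):

  local zoneCount(config) = std.length(config.multi_zone_distributor_availability_zones);
  local zoneAutoscalingEnabled(config, minZones) =
    config.multi_zone_distributor_enabled &&
    config.autoscaling_distributor_enabled &&
    zoneCount(config) >= minZones;

  {
    local config = {
      multi_zone_distributor_enabled: true,
      autoscaling_distributor_enabled: true,
      multi_zone_distributor_availability_zones: ['us-east-2a', 'us-east-2b'],
    },

    zone_a_autoscaled: zoneAutoscalingEnabled(config, 1),  // true
    zone_b_autoscaled: zoneAutoscalingEnabled(config, 2),  // true
    zone_c_autoscaled: zoneAutoscalingEnabled(config, 3),  // false: only two zones configured
  }

This is why the two-zone test output in this series renders ScaledObjects and Deployments for zones a and b only; zone c objects appear once a third availability zone is configured. The next patch, below, reverts the test default so multi-zone distributors stay opt-in.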
From 4c7590beb2e7f930004976316e4daf0e34df43dd Mon Sep 17 00:00:00 2001
From: Yuri Nikolic
Date: Tue, 8 Oct 2024 12:57:48 +0200
Subject: [PATCH 4/8] Setting multi_zone_distributor_enabled to false

Signed-off-by: Yuri Nikolic
---
 ...custom-stabilization-window-generated.yaml | 285 +----------------
 ...toscaling-multiple-triggers-generated.yaml | 285 +----------------
 ...age-autoscaling-one-trigger-generated.yaml | 285 +----------------
 ...st-storage-migration-step-0-generated.yaml | 278 +----------------
 ...st-storage-migration-step-1-generated.yaml | 279 +----------------
 ...t-storage-migration-step-10-generated.yaml | 285 +----------------
 ...t-storage-migration-step-11-generated.yaml | 285 +----------------
 ...st-storage-migration-step-2-generated.yaml | 286 +-----------------
 ...st-storage-migration-step-3-generated.yaml | 286 +-----------------
 ...st-storage-migration-step-4-generated.yaml | 285 +----------------
 ...t-storage-migration-step-5a-generated.yaml | 285 +----------------
 ...t-storage-migration-step-5b-generated.yaml | 285 +----------------
 ...st-storage-migration-step-6-generated.yaml | 285 +----------------
 ...st-storage-migration-step-7-generated.yaml | 285 +----------------
 ...st-storage-migration-step-8-generated.yaml | 285 +----------------
 ...st-storage-migration-step-9-generated.yaml | 285 +----------------
 ...torage-migration-step-final-generated.yaml | 285 +----------------
 .../test-multi-zone-generated.yaml            | 277 +----------------
 ...teway-automated-downscaling-generated.yaml | 277 +----------------
 .../mimir-tests/test-multi-zone.jsonnet       |   2 +-
 20 files changed, 286 insertions(+), 5104 deletions(-)

diff --git a/operations/mimir-tests/test-ingest-storage-autoscaling-custom-stabilization-window-generated.yaml b/operations/mimir-tests/test-ingest-storage-autoscaling-custom-stabilization-window-generated.yaml
index 618cf612aa1..5b263f4b53a 100644
--- a/operations/mimir-tests/test-ingest-storage-autoscaling-custom-stabilization-window-generated.yaml
+++ b/operations/mimir-tests/test-ingest-storage-autoscaling-custom-stabilization-window-generated.yaml
@@ -33,27 +33,14 @@ apiVersion: policy/v1
 kind: PodDisruptionBudget
 metadata:
   labels:
-
name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -404,8 +391,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -420,29 +407,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -866,14 +831,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -882,135 +847,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix=partition-ingesters/ - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - 
-target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1082,15 +920,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3108,95 +2941,7 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) - and - max by (pod) 
(min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b + name: distributor namespace: default spec: advanced: @@ -3221,11 +2966,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor-zone-b + name: distributor triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default + metricName: cortex_distributor_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -3244,11 +2989,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default + name: cortex_distributor_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default + metricName: cortex_distributor_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -3278,7 +3023,7 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default + name: cortex_distributor_memory_hpa_default type: prometheus --- apiVersion: keda.sh/v1alpha1 diff --git a/operations/mimir-tests/test-ingest-storage-autoscaling-multiple-triggers-generated.yaml b/operations/mimir-tests/test-ingest-storage-autoscaling-multiple-triggers-generated.yaml index fd9b480bd20..96d34a7691e 100644 --- a/operations/mimir-tests/test-ingest-storage-autoscaling-multiple-triggers-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-autoscaling-multiple-triggers-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -404,8 +391,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -420,29 +407,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: 
distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -866,14 +831,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -882,135 +847,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix=partition-ingesters/ - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: 
distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1082,15 +920,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3108,95 +2941,7 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b + name: distributor namespace: default spec: advanced: @@ -3221,11 +2966,11 @@ spec: 
minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor-zone-b + name: distributor triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default + metricName: cortex_distributor_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -3244,11 +2989,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default + name: cortex_distributor_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default + metricName: cortex_distributor_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -3278,7 +3023,7 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default + name: cortex_distributor_memory_hpa_default type: prometheus --- apiVersion: keda.sh/v1alpha1 diff --git a/operations/mimir-tests/test-ingest-storage-autoscaling-one-trigger-generated.yaml b/operations/mimir-tests/test-ingest-storage-autoscaling-one-trigger-generated.yaml index 68e194b235f..66fde4046b9 100644 --- a/operations/mimir-tests/test-ingest-storage-autoscaling-one-trigger-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-autoscaling-one-trigger-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -404,8 +391,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -420,29 +407,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -866,14 +831,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -882,135 +847,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - 
-distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix=partition-ingesters/ - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1082,15 +920,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3108,95 +2941,7 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor-zone-a - 
namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b + name: distributor namespace: default spec: advanced: @@ -3221,11 +2966,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor-zone-b + name: distributor triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default + metricName: cortex_distributor_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -3244,11 +2989,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default + name: cortex_distributor_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default + metricName: cortex_distributor_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -3278,7 +3023,7 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default + name: cortex_distributor_memory_hpa_default type: prometheus --- apiVersion: keda.sh/v1alpha1 diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-0-generated.yaml 
b/operations/mimir-tests/test-ingest-storage-migration-step-0-generated.yaml index 503937febc2..5f50d850772 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-0-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-0-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -345,8 +332,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -361,29 +348,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -807,14 +772,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -823,128 +788,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix= - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - 
-target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1009,15 +854,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2938,95 +2778,7 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) - and - max by (pod) 
(min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b + name: distributor namespace: default spec: advanced: @@ -3051,11 +2803,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor-zone-b + name: distributor triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default + metricName: cortex_distributor_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -3074,11 +2826,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default + name: cortex_distributor_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default + metricName: cortex_distributor_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -3108,5 +2860,5 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default + name: cortex_distributor_memory_hpa_default type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-1-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-1-generated.yaml index b0f5b79d4ad..eab256f0b6e 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-1-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-1-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -345,8 +332,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -361,29 +348,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 
- targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -876,14 +841,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -892,129 +857,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix= - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - 
nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1080,15 +924,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3441,95 +3280,7 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b + name: distributor namespace: default spec: advanced: @@ -3554,11 +3305,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor-zone-b + name: distributor triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default + metricName: cortex_distributor_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -3577,11 +3328,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: 
"2000" - name: cortex_distributor_zone_b_cpu_hpa_default + name: cortex_distributor_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default + metricName: cortex_distributor_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -3611,5 +3362,5 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default + name: cortex_distributor_memory_hpa_default type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-10-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-10-generated.yaml index df2305fc88d..112d98bb1e0 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-10-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-10-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -404,8 +391,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -420,29 +407,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -843,14 +808,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -859,135 +824,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - 
-ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix=partition-ingesters/ - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1059,15 +897,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2941,95 +2774,7 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: 
distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b + name: distributor namespace: default spec: advanced: @@ -3054,11 +2799,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor-zone-b + name: distributor triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default + metricName: cortex_distributor_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -3077,11 +2822,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default + name: cortex_distributor_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default + metricName: cortex_distributor_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -3111,7 +2856,7 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default + name: cortex_distributor_memory_hpa_default type: prometheus --- apiVersion: keda.sh/v1alpha1 diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-11-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-11-generated.yaml index c8d2a21c763..b47b52f9f23 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-11-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-11-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor 
namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -404,8 +391,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -420,29 +407,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -843,14 +808,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -859,135 +824,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix=partition-ingesters/ - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - 
name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1059,15 +897,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2945,95 +2778,7 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( 
- sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b + name: distributor namespace: default spec: advanced: @@ -3058,11 +2803,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor-zone-b + name: distributor triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default + metricName: cortex_distributor_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -3081,11 +2826,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default + name: cortex_distributor_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default + metricName: cortex_distributor_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -3115,7 +2860,7 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default + name: cortex_distributor_memory_hpa_default type: prometheus --- apiVersion: keda.sh/v1alpha1 diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-2-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-2-generated.yaml index 1006420691d..f0f2b60ae2e 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-2-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-2-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -345,8 +332,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -361,29 +348,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: 
v1 kind: Service @@ -876,14 +841,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -892,136 +857,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingest-storage.migration.distributor-send-to-ingesters-enabled=true - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix= - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: 
distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1094,15 +931,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3461,95 +3293,7 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b + name: distributor namespace: default spec: advanced: @@ -3574,11 +3318,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor-zone-b + name: distributor triggers: - metadata: 
ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default + metricName: cortex_distributor_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -3597,11 +3341,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default + name: cortex_distributor_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default + metricName: cortex_distributor_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -3631,5 +3375,5 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default + name: cortex_distributor_memory_hpa_default type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-3-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-3-generated.yaml index f789b04d721..d58513ca677 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-3-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-3-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -345,8 +332,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -361,29 +348,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -876,14 +841,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -892,136 +857,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - 
-distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingest-storage.migration.distributor-send-to-ingesters-enabled=true - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix= - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1094,15 +931,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3483,95 +3315,7 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - 
periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b + name: distributor namespace: default spec: advanced: @@ -3596,11 +3340,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor-zone-b + name: distributor triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default + metricName: cortex_distributor_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -3619,11 +3363,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default + name: cortex_distributor_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default + metricName: cortex_distributor_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -3653,5 +3397,5 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default + name: cortex_distributor_memory_hpa_default type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-4-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-4-generated.yaml index f0c4b1ccaf2..11b1f1d9123 100644 --- 
a/operations/mimir-tests/test-ingest-storage-migration-step-4-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-4-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -345,8 +332,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -361,29 +348,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -876,14 +841,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -892,135 +857,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix= - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - 
-server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1092,15 +930,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3480,95 +3313,7 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: 
cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b + name: distributor namespace: default spec: advanced: @@ -3593,11 +3338,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor-zone-b + name: distributor triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default + metricName: cortex_distributor_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -3616,11 +3361,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default + name: cortex_distributor_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default + metricName: cortex_distributor_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -3650,5 +3395,5 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default + name: cortex_distributor_memory_hpa_default type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-5a-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-5a-generated.yaml index 5d3caad09f5..54b7a90c838 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-5a-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-5a-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -345,8 +332,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -361,29 +348,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - 
namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -876,14 +841,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -892,135 +857,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix= - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - 
configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1092,15 +930,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3480,95 +3313,7 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: 
distributor-zone-b + name: distributor namespace: default spec: advanced: @@ -3593,11 +3338,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor-zone-b + name: distributor triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default + metricName: cortex_distributor_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -3616,11 +3361,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default + name: cortex_distributor_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default + metricName: cortex_distributor_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -3650,5 +3395,5 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default + name: cortex_distributor_memory_hpa_default type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-5b-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-5b-generated.yaml index 18a27160cfe..f4302d9a64c 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-5b-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-5b-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -345,8 +332,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -361,29 +348,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -876,14 +841,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -892,135 +857,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - 
-distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix= - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1092,15 +930,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3480,95 +3313,7 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: 
distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b + name: distributor namespace: default spec: advanced: @@ -3593,11 +3338,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor-zone-b + name: distributor triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default + metricName: cortex_distributor_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -3616,11 +3361,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default + name: cortex_distributor_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default + metricName: cortex_distributor_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -3650,5 +3395,5 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default + name: cortex_distributor_memory_hpa_default type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-6-generated.yaml 
b/operations/mimir-tests/test-ingest-storage-migration-step-6-generated.yaml index de991d69511..5c30c4029d1 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-6-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-6-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -345,8 +332,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -361,29 +348,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -807,14 +772,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -823,135 +788,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix= - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - 
- -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1023,15 +861,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3009,95 +2842,7 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: 
cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b + name: distributor namespace: default spec: advanced: @@ -3122,11 +2867,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor-zone-b + name: distributor triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default + metricName: cortex_distributor_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -3145,11 +2890,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default + name: cortex_distributor_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default + metricName: cortex_distributor_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -3179,5 +2924,5 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default + name: cortex_distributor_memory_hpa_default type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-7-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-7-generated.yaml index a41c63b5f97..9b6813c3d61 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-7-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-7-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -345,8 +332,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -361,29 +348,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- 
-apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -807,14 +772,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -823,135 +788,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix=partition-ingesters/ - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: 
- name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1023,15 +861,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3037,95 +2870,7 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: 
cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b + name: distributor namespace: default spec: advanced: @@ -3150,11 +2895,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor-zone-b + name: distributor triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default + metricName: cortex_distributor_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -3173,11 +2918,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default + name: cortex_distributor_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default + metricName: cortex_distributor_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -3207,5 +2952,5 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default + name: cortex_distributor_memory_hpa_default type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-8-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-8-generated.yaml index 253675e25ab..ba118d1d7fe 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-8-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-8-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -345,8 +332,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -361,29 +348,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -807,14 +772,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -823,135 +788,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - 
-distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix=partition-ingesters/ - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1023,15 +861,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 
topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3037,95 +2870,7 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b + name: distributor namespace: default spec: advanced: @@ -3150,11 +2895,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor-zone-b + name: distributor triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default + metricName: cortex_distributor_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -3173,11 +2918,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default + name: cortex_distributor_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default + metricName: cortex_distributor_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -3207,5 +2952,5 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default + name: 
cortex_distributor_memory_hpa_default type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-9-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-9-generated.yaml index 0272084002c..c57421b59a8 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-9-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-9-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -345,8 +332,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -361,29 +348,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -784,14 +749,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -800,135 +765,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix=partition-ingesters/ - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - 
-mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1000,15 +838,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2874,95 +2707,7 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - 
container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b + name: distributor namespace: default spec: advanced: @@ -2987,11 +2732,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor-zone-b + name: distributor triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default + metricName: cortex_distributor_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -3010,11 +2755,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default + name: cortex_distributor_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default + metricName: cortex_distributor_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -3044,5 +2789,5 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default + name: cortex_distributor_memory_hpa_default type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-final-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-final-generated.yaml index 0a398f89611..622c0eed712 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-final-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-final-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -404,8 +391,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: 
distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -420,29 +407,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -866,14 +831,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -882,135 +847,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix=partition-ingesters/ - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - 
mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1082,15 +920,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3108,95 +2941,7 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) - or vector(0) - ) - and - count ( - count_over_time( - 
present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b + name: distributor namespace: default spec: advanced: @@ -3221,11 +2966,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor-zone-b + name: distributor triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default + metricName: cortex_distributor_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -3244,11 +2989,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default + name: cortex_distributor_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default + metricName: cortex_distributor_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -3278,7 +3023,7 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default + name: cortex_distributor_memory_hpa_default type: prometheus --- apiVersion: keda.sh/v1alpha1 diff --git a/operations/mimir-tests/test-multi-zone-generated.yaml b/operations/mimir-tests/test-multi-zone-generated.yaml index fcddfd953ab..b0538a2a48c 100644 --- a/operations/mimir-tests/test-multi-zone-generated.yaml +++ b/operations/mimir-tests/test-multi-zone-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -306,8 +293,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -322,29 +309,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -690,14 +655,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -706,127 +671,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - 
matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix= - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -890,15 +736,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2544,95 +2385,7 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - 
horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b + name: distributor namespace: default spec: advanced: @@ -2657,11 +2410,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor-zone-b + name: distributor triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default + metricName: cortex_distributor_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -2680,11 +2433,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default + name: cortex_distributor_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default + metricName: cortex_distributor_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -2714,5 +2467,5 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default + name: cortex_distributor_memory_hpa_default type: prometheus diff --git a/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling-generated.yaml 
b/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling-generated.yaml index 9d3ea8fb881..ce74b0731ee 100644 --- a/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling-generated.yaml +++ b/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -369,8 +356,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -385,29 +372,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -767,14 +732,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -783,127 +748,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix= - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - 
-target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -967,15 +813,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2673,95 +2514,7 @@ webhooks: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) - and - max by (pod) 
(min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0
-            ) or vector(0)
-          )[15m:]
-        )
-        +
-        sum(
-          sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m]))
-          and
-          max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0)
-          and
-          max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"})
-          or vector(0)
-        )
-        and
-        count (
-          count_over_time(
-            present_over_time(
-              container_memory_working_set_bytes{container="distributor",namespace="default"}[1m]
-            )[15m:1m]
-          ) >= 15
-        )
-      serverAddress: http://prometheus.default:9090/prometheus
-      threshold: "2147483648"
-    name: cortex_distributor_zone_a_memory_hpa_default
-    type: prometheus
----
-apiVersion: keda.sh/v1alpha1
-kind: ScaledObject
-metadata:
-  name: distributor-zone-b
+  name: distributor
   namespace: default
 spec:
   advanced:
@@ -2786,11 +2539,11 @@ spec:
   minReplicaCount: 3
   pollingInterval: 10
   scaleTargetRef:
-    name: distributor-zone-b
+    name: distributor
   triggers:
   - metadata:
       ignoreNullValues: "false"
-      metricName: cortex_distributor_zone_b_cpu_hpa_default
+      metricName: cortex_distributor_cpu_hpa_default
       query: |
         quantile_over_time(0.95,
           sum(
@@ -2809,11 +2562,11 @@ spec:
         )
       serverAddress: http://prometheus.default:9090/prometheus
       threshold: "2000"
-    name: cortex_distributor_zone_b_cpu_hpa_default
+    name: cortex_distributor_cpu_hpa_default
     type: prometheus
   - metadata:
       ignoreNullValues: "false"
-      metricName: cortex_distributor_zone_b_memory_hpa_default
+      metricName: cortex_distributor_memory_hpa_default
       query: |
         quantile_over_time(0.95,
           sum(
@@ -2843,7 +2596,7 @@ spec:
         )
       serverAddress: http://prometheus.default:9090/prometheus
       threshold: "2147483648"
-    name: cortex_distributor_zone_b_memory_hpa_default
+    name: cortex_distributor_memory_hpa_default
     type: prometheus
 ---
 apiVersion: admissionregistration.k8s.io/v1
diff --git a/operations/mimir-tests/test-multi-zone.jsonnet b/operations/mimir-tests/test-multi-zone.jsonnet
index 55332c87197..371c24020ee 100644
--- a/operations/mimir-tests/test-multi-zone.jsonnet
+++ b/operations/mimir-tests/test-multi-zone.jsonnet
@@ -21,7 +21,7 @@ mimir {
   multi_zone_store_gateway_replicas: 3,
 
   local availabilityZones = ['us-east-2a', 'us-east-2b'],
-  multi_zone_distributor_enabled: true,
+  multi_zone_distributor_enabled: false,
   multi_zone_distributor_availability_zones: availabilityZones,
 
   autoscaling_distributor_enabled: true,

From d52fd2c9b7aeab15bff88b12d1ff4799241d1caa Mon Sep 17 00:00:00 2001
From: Yuri Nikolic
Date: Tue, 8 Oct 2024 13:04:10 +0200
Subject: [PATCH 5/8] Set multi_zone_distributor_enabled to true

Signed-off-by: Yuri Nikolic
---
 operations/mimir-tests/test-multi-zone.jsonnet | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/operations/mimir-tests/test-multi-zone.jsonnet b/operations/mimir-tests/test-multi-zone.jsonnet
index 371c24020ee..55332c87197 100644
--- a/operations/mimir-tests/test-multi-zone.jsonnet
+++ b/operations/mimir-tests/test-multi-zone.jsonnet
@@ -21,7 +21,7 @@ mimir {
   multi_zone_store_gateway_replicas: 3,
 
   local availabilityZones = ['us-east-2a', 'us-east-2b'],
-  multi_zone_distributor_enabled: false,
+  multi_zone_distributor_enabled: true,
   multi_zone_distributor_availability_zones: availabilityZones,
 
   autoscaling_distributor_enabled: true,

From 18cfdfac63530e9624b44db1d840649d146c92c0 Mon Sep 17 00:00:00 2001
From: Yuri Nikolic
From 18cfdfac63530e9624b44db1d840649d146c92c0 Mon Sep 17 00:00:00 2001
From: Yuri Nikolic
Date: Tue, 8 Oct 2024 13:21:27 +0200
Subject: [PATCH 6/8] Making lint happy

Signed-off-by: Yuri Nikolic
---
 ...custom-stabilization-window-generated.yaml | 285 ++++++++++++++++-
 ...toscaling-multiple-triggers-generated.yaml | 285 ++++++++++++++++-
 ...age-autoscaling-one-trigger-generated.yaml | 285 ++++++++++++++++-
 ...st-storage-migration-step-0-generated.yaml | 278 ++++++++++++++++-
 ...st-storage-migration-step-1-generated.yaml | 279 ++++++++++++++++-
 ...t-storage-migration-step-10-generated.yaml | 285 ++++++++++++++++-
 ...t-storage-migration-step-11-generated.yaml | 285 ++++++++++++++++-
 ...st-storage-migration-step-2-generated.yaml | 286 +++++++++++++++++-
 ...st-storage-migration-step-3-generated.yaml | 286 +++++++++++++++++-
 ...st-storage-migration-step-4-generated.yaml | 285 ++++++++++++++++-
 ...t-storage-migration-step-5a-generated.yaml | 285 ++++++++++++++++-
 ...t-storage-migration-step-5b-generated.yaml | 285 ++++++++++++++++-
 ...st-storage-migration-step-6-generated.yaml | 285 ++++++++++++++++-
 ...st-storage-migration-step-7-generated.yaml | 285 ++++++++++++++++-
 ...st-storage-migration-step-8-generated.yaml | 285 ++++++++++++++++-
 ...st-storage-migration-step-9-generated.yaml | 285 ++++++++++++++++-
 ...torage-migration-step-final-generated.yaml | 285 ++++++++++++++++-
 .../test-multi-zone-generated.yaml            | 277 ++++++++++++++++-
 ...teway-automated-downscaling-generated.yaml | 277 ++++++++++++++++-
 19 files changed, 5103 insertions(+), 285 deletions(-)

diff --git a/operations/mimir-tests/test-ingest-storage-autoscaling-custom-stabilization-window-generated.yaml b/operations/mimir-tests/test-ingest-storage-autoscaling-custom-stabilization-window-generated.yaml
index 5b263f4b53a..618cf612aa1 100644
--- a/operations/mimir-tests/test-ingest-storage-autoscaling-custom-stabilization-window-generated.yaml
+++ b/operations/mimir-tests/test-ingest-storage-autoscaling-custom-stabilization-window-generated.yaml
@@ -33,14 +33,27 @@ apiVersion: policy/v1
 kind: PodDisruptionBudget
 metadata:
   labels:
-    name: distributor
-  name: distributor
+    name: distributor-zone-a
+  name: distributor-zone-a
   namespace: default
 spec:
   maxUnavailable: 1
   selector:
     matchLabels:
-      name: distributor
+      name: distributor-zone-a
+---
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  labels:
+    name: distributor-zone-b
+  name: distributor-zone-b
+  namespace: default
+spec:
+  maxUnavailable: 1
+  selector:
+    matchLabels:
+      name: distributor-zone-b
 ---
 apiVersion: policy/v1
 kind: PodDisruptionBudget
@@ -391,8 +404,8 @@ apiVersion: v1
 kind: Service
 metadata:
   labels:
-    name: distributor
-  name: distributor
+    name: distributor-zone-a
+  name: distributor-zone-a
   namespace: default
 spec:
   clusterIP: None
@@ -407,7 +420,29 @@ spec:
     port: 7946
     targetPort: 7946
   selector:
-    name: distributor
+    name: distributor-zone-a
+---
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    name: distributor-zone-b
+  name: distributor-zone-b
+  namespace: default
+spec:
+  clusterIP: None
+  ports:
+  - name: distributor-http-metrics
+    port: 8080
+    targetPort: 8080
+  - name: distributor-grpc
+    port: 9095
+    targetPort: 9095
+  - name: distributor-gossip-ring
+    port: 7946
+    targetPort: 7946
+  selector:
+    name: distributor-zone-b
 ---
 apiVersion: v1
 kind: Service
@@ -831,14 +866,14 @@ spec:
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: distributor
+  name: distributor-zone-a
   namespace: default
 spec:
   minReadySeconds: 10
   revisionHistoryLimit: 10
   selector:
     matchLabels:
-      name: distributor
+      name: distributor-zone-a
   strategy:
     rollingUpdate:
       maxSurge: 15%
@@ -847,8 +882,135 @@ spec:
     metadata:
       labels:
         gossip_ring_member: "true"
+        name: distributor-zone-a
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: topology.kubernetes.io/zone
+                operator: In
+                values:
+                - us-east-2a
+      containers:
+      - args:
+        - -distributor.ha-tracker.enable=true
+        - -distributor.ha-tracker.enable-for-all-users=true
+        - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379
+        - -distributor.ha-tracker.prefix=prom_ha/
+        - -distributor.ha-tracker.store=etcd
+        - -distributor.health-check-ingesters=true
+        - -distributor.ingestion-burst-size=200000
+        - -distributor.ingestion-rate-limit=10000
+        - -distributor.ingestion-tenant-shard-size=3
+        - -distributor.remote-timeout=5s
+        - -distributor.ring.heartbeat-period=1m
+        - -distributor.ring.heartbeat-timeout=4m
+        - -distributor.ring.prefix=
+        - -distributor.ring.store=memberlist
+        - -ingest-storage.enabled=true
+        - -ingest-storage.ingestion-partition-tenant-shard-size=1
+        - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092
+        - -ingest-storage.kafka.auto-create-topic-default-partitions=1000
+        - -ingest-storage.kafka.topic=ingest
+        - -ingester.partition-ring.prefix=
+        - -ingester.ring.heartbeat-timeout=10m
+        - -ingester.ring.prefix=partition-ingesters/
+        - -ingester.ring.replication-factor=3
+        - -ingester.ring.store=memberlist
+        - -ingester.ring.zone-awareness-enabled=true
+        - -mem-ballast-size-bytes=1073741824
+        - -memberlist.bind-port=7946
+        - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946
+        - -runtime-config.file=/etc/mimir/overrides.yaml
+        - -server.grpc.keepalive.max-connection-age=60s
+        - -server.grpc.keepalive.max-connection-age-grace=5m
+        - -server.grpc.keepalive.max-connection-idle=1m
+        - -server.grpc.keepalive.min-time-between-pings=10s
+        - -server.grpc.keepalive.ping-without-stream-allowed=true
+        - -server.http-listen-port=8080
+        - -shutdown-delay=90s
+        - -target=distributor
+        - -usage-stats.installation-mode=jsonnet
+        env:
+        - name: GOMAXPROCS
+          value: "8"
+        - name: JAEGER_REPORTER_MAX_QUEUE_SIZE
+          value: "1000"
+        image: grafana/mimir:2.13.0
+        imagePullPolicy: IfNotPresent
         name: distributor
+        ports:
+        - containerPort: 8080
+          name: http-metrics
+        - containerPort: 9095
+          name: grpc
+        - containerPort: 7946
+          name: gossip-ring
+        readinessProbe:
+          httpGet:
+            path: /ready
+            port: 8080
+          initialDelaySeconds: 15
+          timeoutSeconds: 1
+        resources:
+          limits:
+            memory: 4Gi
+          requests:
+            cpu: "2"
+            memory: 2Gi
+        volumeMounts:
+        - mountPath: /etc/mimir
+          name: overrides
+      terminationGracePeriodSeconds: 100
+      tolerations:
+      - effect: NoSchedule
+        key: topology
+        operator: Equal
+        value: multi-az
+      topologySpreadConstraints:
+      - labelSelector:
+          matchLabels:
+            name: distributor-zone-a
+        maxSkew: 1
+        topologyKey: kubernetes.io/hostname
+        whenUnsatisfiable: ScheduleAnyway
+      volumes:
+      - configMap:
+          name: overrides
+        name: overrides
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: distributor-zone-b
+  namespace: default
+spec:
+  minReadySeconds: 10
+  revisionHistoryLimit: 10
+  selector:
+    matchLabels:
+      name: distributor-zone-b
+  strategy:
+    rollingUpdate:
+      maxSurge: 15%
+      maxUnavailable: 0
+  template:
+    metadata:
+      labels:
+        gossip_ring_member: "true"
+        name: distributor-zone-b
     spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: topology.kubernetes.io/zone
+                operator: In
+                values:
+                - us-east-2b
       containers:
       - args:
         - -distributor.ha-tracker.enable=true
@@ -920,10 +1082,15 @@ spec:
         - mountPath: /etc/mimir
           name: overrides
       terminationGracePeriodSeconds: 100
+      tolerations:
+      - effect: NoSchedule
+        key: topology
+        operator: Equal
+        value: multi-az
       topologySpreadConstraints:
       - labelSelector:
           matchLabels:
-            name: distributor
+            name: distributor-zone-b
         maxSkew: 1
         topologyKey: kubernetes.io/hostname
         whenUnsatisfiable: ScheduleAnyway
@@ -2941,7 +3108,95 @@ spec:
 apiVersion: keda.sh/v1alpha1
 kind: ScaledObject
 metadata:
-  name: distributor
+  name: distributor-zone-a
+  namespace: default
+spec:
+  advanced:
+    horizontalPodAutoscalerConfig:
+      behavior:
+        scaleDown:
+          policies:
+          - periodSeconds: 120
+            type: Percent
+            value: 10
+          stabilizationWindowSeconds: 1800
+        scaleUp:
+          policies:
+          - periodSeconds: 120
+            type: Percent
+            value: 50
+          - periodSeconds: 120
+            type: Pods
+            value: 15
+          stabilizationWindowSeconds: 120
+  maxReplicaCount: 30
+  minReplicaCount: 3
+  pollingInterval: 10
+  scaleTargetRef:
+    name: distributor-zone-a
+  triggers:
+  - metadata:
+      ignoreNullValues: "false"
+      metricName: cortex_distributor_zone_a_cpu_hpa_default
+      query: |
+        quantile_over_time(0.95,
+          sum(
+            sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m]))
+            and
+            max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0
+          )[15m:]
+        ) * 1000
+        and
+        count (
+          count_over_time(
+            present_over_time(
+              container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m]
+            )[15m:1m]
+          ) >= 15
+        )
+      serverAddress: http://prometheus.default:9090/prometheus
+      threshold: "2000"
+    name: cortex_distributor_zone_a_cpu_hpa_default
+    type: prometheus
+  - metadata:
+      ignoreNullValues: "false"
+      metricName: cortex_distributor_zone_a_memory_hpa_default
+      query: |
+        quantile_over_time(0.95,
+          sum(
+            (
+              sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"})
+              and
+              max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0
+            ) or vector(0)
+          )[15m:]
+        )
+        +
+        sum(
+          sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m]))
+          and
+          max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0)
+          and
+          max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"})
+          or vector(0)
+        )
+        and
+        count (
+          count_over_time(
+            present_over_time(
+              container_memory_working_set_bytes{container="distributor",namespace="default"}[1m]
+            )[15m:1m]
+          ) >= 15
+        )
+      serverAddress: http://prometheus.default:9090/prometheus
+      threshold: "2147483648"
+    name: cortex_distributor_zone_a_memory_hpa_default
+    type: prometheus
+---
+apiVersion: keda.sh/v1alpha1
+kind: ScaledObject
+metadata:
+  name: distributor-zone-b
   namespace: default
 spec:
   advanced:
@@ -2966,11 +3221,11 @@ spec:
   minReplicaCount: 3
   pollingInterval: 10
   scaleTargetRef:
-    name: distributor
+    name: distributor-zone-b
   triggers:
   - metadata:
       ignoreNullValues: "false"
-      metricName: cortex_distributor_cpu_hpa_default
+      metricName: cortex_distributor_zone_b_cpu_hpa_default
       query: |
         quantile_over_time(0.95,
           sum(
@@ -2989,11 +3244,11 @@ spec:
         )
       serverAddress: http://prometheus.default:9090/prometheus
       threshold: "2000"
-    name: cortex_distributor_cpu_hpa_default
+    name: cortex_distributor_zone_b_cpu_hpa_default
     type: prometheus
   - metadata:
       ignoreNullValues: "false"
-      metricName: cortex_distributor_memory_hpa_default
+      metricName: cortex_distributor_zone_b_memory_hpa_default
       query: |
         quantile_over_time(0.95,
           sum(
@@ -3023,7 +3278,7 @@ spec:
         )
       serverAddress: http://prometheus.default:9090/prometheus
       threshold: "2147483648"
-    name: cortex_distributor_memory_hpa_default
+    name: cortex_distributor_zone_b_memory_hpa_default
     type: prometheus
 ---
 apiVersion: keda.sh/v1alpha1
diff --git a/operations/mimir-tests/test-ingest-storage-autoscaling-multiple-triggers-generated.yaml b/operations/mimir-tests/test-ingest-storage-autoscaling-multiple-triggers-generated.yaml
index 96d34a7691e..fd9b480bd20 100644
--- a/operations/mimir-tests/test-ingest-storage-autoscaling-multiple-triggers-generated.yaml
+++ b/operations/mimir-tests/test-ingest-storage-autoscaling-multiple-triggers-generated.yaml
@@ -33,14 +33,27 @@ apiVersion: policy/v1
 kind: PodDisruptionBudget
 metadata:
   labels:
-    name: distributor
-  name: distributor
+    name: distributor-zone-a
+  name: distributor-zone-a
   namespace: default
 spec:
   maxUnavailable: 1
   selector:
     matchLabels:
-      name: distributor
+      name: distributor-zone-a
+---
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  labels:
+    name: distributor-zone-b
+  name: distributor-zone-b
+  namespace: default
+spec:
+  maxUnavailable: 1
+  selector:
+    matchLabels:
+      name: distributor-zone-b
 ---
 apiVersion: policy/v1
 kind: PodDisruptionBudget
@@ -391,8 +404,8 @@ apiVersion: v1
 kind: Service
 metadata:
   labels:
-    name: distributor
-  name: distributor
+    name: distributor-zone-a
+  name: distributor-zone-a
   namespace: default
 spec:
   clusterIP: None
@@ -407,7 +420,29 @@ spec:
     port: 7946
     targetPort: 7946
   selector:
-    name: distributor
+    name: distributor-zone-a
+---
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    name: distributor-zone-b
+  name: distributor-zone-b
+  namespace: default
+spec:
+  clusterIP: None
+  ports:
+  - name: distributor-http-metrics
+    port: 8080
+    targetPort: 8080
+  - name: distributor-grpc
+    port: 9095
+    targetPort: 9095
+  - name: distributor-gossip-ring
+    port: 7946
+    targetPort: 7946
+  selector:
+    name: distributor-zone-b
 ---
 apiVersion: v1
 kind: Service
@@ -831,14 +866,14 @@ spec:
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: distributor
+  name: distributor-zone-a
   namespace: default
 spec:
   minReadySeconds: 10
   revisionHistoryLimit: 10
   selector:
     matchLabels:
-      name: distributor
+      name: distributor-zone-a
   strategy:
     rollingUpdate:
       maxSurge: 15%
@@ -847,8 +882,135 @@ spec:
     metadata:
       labels:
         gossip_ring_member: "true"
+        name: distributor-zone-a
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: topology.kubernetes.io/zone
+                operator: In
+                values:
+                - us-east-2a
+      containers:
+      - args:
+        - -distributor.ha-tracker.enable=true
+        - -distributor.ha-tracker.enable-for-all-users=true
+        - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379
+        - -distributor.ha-tracker.prefix=prom_ha/
+        - -distributor.ha-tracker.store=etcd
+        - -distributor.health-check-ingesters=true
+        - -distributor.ingestion-burst-size=200000
+        - -distributor.ingestion-rate-limit=10000
+        - -distributor.ingestion-tenant-shard-size=3
+        - -distributor.remote-timeout=5s
+        - -distributor.ring.heartbeat-period=1m
+        - -distributor.ring.heartbeat-timeout=4m
+        - -distributor.ring.prefix=
+        - -distributor.ring.store=memberlist
+        - -ingest-storage.enabled=true
+        - -ingest-storage.ingestion-partition-tenant-shard-size=1
+        - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092
+        - -ingest-storage.kafka.auto-create-topic-default-partitions=1000
+        - -ingest-storage.kafka.topic=ingest
+        - -ingester.partition-ring.prefix=
+        - -ingester.ring.heartbeat-timeout=10m
+        - -ingester.ring.prefix=partition-ingesters/
+        - -ingester.ring.replication-factor=3
+        - -ingester.ring.store=memberlist
+        - -ingester.ring.zone-awareness-enabled=true
+        - -mem-ballast-size-bytes=1073741824
+        - -memberlist.bind-port=7946
+        - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946
+        - -runtime-config.file=/etc/mimir/overrides.yaml
+        - -server.grpc.keepalive.max-connection-age=60s
+        - -server.grpc.keepalive.max-connection-age-grace=5m
+        - -server.grpc.keepalive.max-connection-idle=1m
+        - -server.grpc.keepalive.min-time-between-pings=10s
+        - -server.grpc.keepalive.ping-without-stream-allowed=true
+        - -server.http-listen-port=8080
+        - -shutdown-delay=90s
+        - -target=distributor
+        - -usage-stats.installation-mode=jsonnet
+        env:
+        - name: GOMAXPROCS
+          value: "8"
+        - name: JAEGER_REPORTER_MAX_QUEUE_SIZE
+          value: "1000"
+        image: grafana/mimir:2.13.0
+        imagePullPolicy: IfNotPresent
         name: distributor
+        ports:
+        - containerPort: 8080
+          name: http-metrics
+        - containerPort: 9095
+          name: grpc
+        - containerPort: 7946
+          name: gossip-ring
+        readinessProbe:
+          httpGet:
+            path: /ready
+            port: 8080
+          initialDelaySeconds: 15
+          timeoutSeconds: 1
+        resources:
+          limits:
+            memory: 4Gi
+          requests:
+            cpu: "2"
+            memory: 2Gi
+        volumeMounts:
+        - mountPath: /etc/mimir
+          name: overrides
+      terminationGracePeriodSeconds: 100
+      tolerations:
+      - effect: NoSchedule
+        key: topology
+        operator: Equal
+        value: multi-az
+      topologySpreadConstraints:
+      - labelSelector:
+          matchLabels:
+            name: distributor-zone-a
+        maxSkew: 1
+        topologyKey: kubernetes.io/hostname
+        whenUnsatisfiable: ScheduleAnyway
+      volumes:
+      - configMap:
+          name: overrides
+        name: overrides
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: distributor-zone-b
+  namespace: default
+spec:
+  minReadySeconds: 10
+  revisionHistoryLimit: 10
+  selector:
+    matchLabels:
+      name: distributor-zone-b
+  strategy:
+    rollingUpdate:
+      maxSurge: 15%
+      maxUnavailable: 0
+  template:
+    metadata:
+      labels:
+        gossip_ring_member: "true"
+        name: distributor-zone-b
     spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: topology.kubernetes.io/zone
+                operator: In
+                values:
+                - us-east-2b
       containers:
       - args:
         - -distributor.ha-tracker.enable=true
@@ -920,10 +1082,15 @@ spec:
         - mountPath: /etc/mimir
           name: overrides
       terminationGracePeriodSeconds: 100
+      tolerations:
+      - effect: NoSchedule
+        key: topology
+        operator: Equal
+        value: multi-az
       topologySpreadConstraints:
       - labelSelector:
           matchLabels:
-            name: distributor
+            name: distributor-zone-b
         maxSkew: 1
         topologyKey: kubernetes.io/hostname
         whenUnsatisfiable: ScheduleAnyway
@@ -2941,7 +3108,95 @@ spec:
 apiVersion: keda.sh/v1alpha1
 kind: ScaledObject
 metadata:
-  name: distributor
+  name: distributor-zone-a
+  namespace: default
+spec:
+  advanced:
+    horizontalPodAutoscalerConfig:
+      behavior:
+        scaleDown:
+          policies:
+          - periodSeconds: 120
+            type: Percent
+            value: 10
+          stabilizationWindowSeconds: 1800
+        scaleUp:
+          policies:
+          - periodSeconds: 120
+            type: Percent
+            value: 50
+          - periodSeconds: 120
+            type: Pods
+            value: 15
+          stabilizationWindowSeconds: 120
+  maxReplicaCount: 30
+  minReplicaCount: 3
+  pollingInterval: 10
+  scaleTargetRef:
+    name: distributor-zone-a
+  triggers:
+  - metadata:
+      ignoreNullValues: "false"
+      metricName: cortex_distributor_zone_a_cpu_hpa_default
+      query: |
+        quantile_over_time(0.95,
+          sum(
+            sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m]))
+            and
+            max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0
+          )[15m:]
+        ) * 1000
+        and
+        count (
+          count_over_time(
+            present_over_time(
+              container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m]
+            )[15m:1m]
+          ) >= 15
+        )
+      serverAddress: http://prometheus.default:9090/prometheus
+      threshold: "2000"
+    name: cortex_distributor_zone_a_cpu_hpa_default
+    type: prometheus
+  - metadata:
+      ignoreNullValues: "false"
+      metricName: cortex_distributor_zone_a_memory_hpa_default
+      query: |
+        quantile_over_time(0.95,
+          sum(
+            (
+              sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"})
+              and
+              max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0
+            ) or vector(0)
+          )[15m:]
+        )
+        +
+        sum(
+          sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m]))
+          and
+          max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0)
+          and
+          max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"})
+          or vector(0)
+        )
+        and
+        count (
+          count_over_time(
+            present_over_time(
+              container_memory_working_set_bytes{container="distributor",namespace="default"}[1m]
+            )[15m:1m]
+          ) >= 15
+        )
+      serverAddress: http://prometheus.default:9090/prometheus
+      threshold: "2147483648"
+    name: cortex_distributor_zone_a_memory_hpa_default
+    type: prometheus
+---
+apiVersion: keda.sh/v1alpha1
+kind: ScaledObject
+metadata:
+  name: distributor-zone-b
   namespace: default
 spec:
   advanced:
@@ -2966,11 +3221,11 @@ spec:
   minReplicaCount: 3
   pollingInterval: 10
   scaleTargetRef:
-    name: distributor
+    name: distributor-zone-b
   triggers:
   - metadata:
       ignoreNullValues: "false"
-      metricName: cortex_distributor_cpu_hpa_default
+      metricName: cortex_distributor_zone_b_cpu_hpa_default
       query: |
         quantile_over_time(0.95,
           sum(
@@ -2989,11 +3244,11 @@ spec:
         )
       serverAddress: http://prometheus.default:9090/prometheus
       threshold: "2000"
-    name: cortex_distributor_cpu_hpa_default
+    name: cortex_distributor_zone_b_cpu_hpa_default
     type: prometheus
   - metadata:
       ignoreNullValues: "false"
-      metricName: cortex_distributor_memory_hpa_default
+      metricName: cortex_distributor_zone_b_memory_hpa_default
       query: |
         quantile_over_time(0.95,
           sum(
@@ -3023,7 +3278,7 @@ spec:
         )
       serverAddress: http://prometheus.default:9090/prometheus
       threshold: "2147483648"
-    name: cortex_distributor_memory_hpa_default
+    name: cortex_distributor_zone_b_memory_hpa_default
     type: prometheus
 ---
 apiVersion: keda.sh/v1alpha1
diff --git a/operations/mimir-tests/test-ingest-storage-autoscaling-one-trigger-generated.yaml b/operations/mimir-tests/test-ingest-storage-autoscaling-one-trigger-generated.yaml
index 66fde4046b9..68e194b235f 100644
--- a/operations/mimir-tests/test-ingest-storage-autoscaling-one-trigger-generated.yaml
+++ b/operations/mimir-tests/test-ingest-storage-autoscaling-one-trigger-generated.yaml
@@ -33,14 +33,27 @@ apiVersion: policy/v1
 kind: PodDisruptionBudget
 metadata:
   labels:
-    name: distributor
-  name: distributor
+    name: distributor-zone-a
+  name: distributor-zone-a
   namespace: default
 spec:
   maxUnavailable: 1
   selector:
     matchLabels:
-      name: distributor
+      name: distributor-zone-a
+---
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  labels:
+    name: distributor-zone-b
+  name: distributor-zone-b
+  namespace: default
+spec:
+  maxUnavailable: 1
+  selector:
+    matchLabels:
+      name: distributor-zone-b
 ---
 apiVersion: policy/v1
 kind: PodDisruptionBudget
@@ -391,8 +404,8 @@ apiVersion: v1
 kind: Service
 metadata:
   labels:
-    name: distributor
-  name: distributor
+    name: distributor-zone-a
+  name: distributor-zone-a
   namespace: default
 spec:
   clusterIP: None
@@ -407,7 +420,29 @@ spec:
     port: 7946
     targetPort: 7946
   selector:
-    name: distributor
+    name: distributor-zone-a
+---
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    name: distributor-zone-b
+  name: distributor-zone-b
+  namespace: default
+spec:
+  clusterIP: None
+  ports:
+  - name: distributor-http-metrics
+    port: 8080
+    targetPort: 8080
+  - name: distributor-grpc
+    port: 9095
+    targetPort: 9095
+  - name: distributor-gossip-ring
+    port: 7946
+    targetPort: 7946
+  selector:
+    name: distributor-zone-b
 ---
 apiVersion: v1
 kind: Service
@@ -831,14 +866,14 @@ spec:
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: distributor
+  name: distributor-zone-a
   namespace: default
 spec:
   minReadySeconds: 10
   revisionHistoryLimit: 10
   selector:
     matchLabels:
-      name: distributor
+      name: distributor-zone-a
   strategy:
     rollingUpdate:
       maxSurge: 15%
@@ -847,8 +882,135 @@ spec:
     metadata:
       labels:
         gossip_ring_member: "true"
+        name: distributor-zone-a
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: topology.kubernetes.io/zone
+                operator: In
+                values:
+                - us-east-2a
+      containers:
+      - args:
+        - -distributor.ha-tracker.enable=true
+        - -distributor.ha-tracker.enable-for-all-users=true
+        - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379
+        - -distributor.ha-tracker.prefix=prom_ha/
+        - -distributor.ha-tracker.store=etcd
+        - -distributor.health-check-ingesters=true
+        - -distributor.ingestion-burst-size=200000
+        - -distributor.ingestion-rate-limit=10000
+        - -distributor.ingestion-tenant-shard-size=3
+        - -distributor.remote-timeout=5s
+        - -distributor.ring.heartbeat-period=1m
+        - -distributor.ring.heartbeat-timeout=4m
+        - -distributor.ring.prefix=
+        - -distributor.ring.store=memberlist
+        - -ingest-storage.enabled=true
+        - -ingest-storage.ingestion-partition-tenant-shard-size=1
+        - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092
+        - -ingest-storage.kafka.auto-create-topic-default-partitions=1000
+        - -ingest-storage.kafka.topic=ingest
+        - -ingester.partition-ring.prefix=
+        - -ingester.ring.heartbeat-timeout=10m
+        - -ingester.ring.prefix=partition-ingesters/
+        - -ingester.ring.replication-factor=3
+        - -ingester.ring.store=memberlist
+        - -ingester.ring.zone-awareness-enabled=true
+        - -mem-ballast-size-bytes=1073741824
+        - -memberlist.bind-port=7946
+        - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946
+        - -runtime-config.file=/etc/mimir/overrides.yaml
+        - -server.grpc.keepalive.max-connection-age=60s
+        - -server.grpc.keepalive.max-connection-age-grace=5m
+        - -server.grpc.keepalive.max-connection-idle=1m
+        - -server.grpc.keepalive.min-time-between-pings=10s
+        - -server.grpc.keepalive.ping-without-stream-allowed=true
+        - -server.http-listen-port=8080
+        - -shutdown-delay=90s
+        - -target=distributor
+        - -usage-stats.installation-mode=jsonnet
+        env:
+        - name: GOMAXPROCS
+          value: "8"
+        - name: JAEGER_REPORTER_MAX_QUEUE_SIZE
+          value: "1000"
+        image: grafana/mimir:2.13.0
+        imagePullPolicy: IfNotPresent
         name: distributor
+        ports:
+        - containerPort: 8080
+          name: http-metrics
+        - containerPort: 9095
+          name: grpc
+        - containerPort: 7946
+          name: gossip-ring
+        readinessProbe:
+          httpGet:
+            path: /ready
+            port: 8080
+          initialDelaySeconds: 15
+          timeoutSeconds: 1
+        resources:
+          limits:
+            memory: 4Gi
+          requests:
+            cpu: "2"
+            memory: 2Gi
+        volumeMounts:
+        - mountPath: /etc/mimir
+          name: overrides
+      terminationGracePeriodSeconds: 100
+      tolerations:
+      - effect: NoSchedule
+        key: topology
+        operator: Equal
+        value: multi-az
+      topologySpreadConstraints:
+      - labelSelector:
+          matchLabels:
+            name: distributor-zone-a
+        maxSkew: 1
+        topologyKey: kubernetes.io/hostname
+        whenUnsatisfiable: ScheduleAnyway
+      volumes:
+      - configMap:
+          name: overrides
+        name: overrides
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: distributor-zone-b
+  namespace: default
+spec:
+  minReadySeconds: 10
+  revisionHistoryLimit: 10
+  selector:
+    matchLabels:
+      name: distributor-zone-b
+  strategy:
+    rollingUpdate:
+      maxSurge: 15%
+      maxUnavailable: 0
+  template:
+    metadata:
+      labels:
+        gossip_ring_member: "true"
+        name: distributor-zone-b
     spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: topology.kubernetes.io/zone
+                operator: In
+                values:
+                - us-east-2b
       containers:
       - args:
         - -distributor.ha-tracker.enable=true
@@ -920,10 +1082,15 @@ spec:
         - mountPath: /etc/mimir
           name: overrides
       terminationGracePeriodSeconds: 100
+      tolerations:
+      - effect: NoSchedule
+        key: topology
+        operator: Equal
+        value: multi-az
       topologySpreadConstraints:
       - labelSelector:
           matchLabels:
-            name: distributor
+            name: distributor-zone-b
         maxSkew: 1
         topologyKey: kubernetes.io/hostname
         whenUnsatisfiable: ScheduleAnyway
@@ -2941,7 +3108,95 @@ spec:
 apiVersion: keda.sh/v1alpha1
 kind: ScaledObject
 metadata:
-  name: distributor
+  name: distributor-zone-a
+  namespace: default
+spec:
+  advanced:
+    horizontalPodAutoscalerConfig:
+      behavior:
+        scaleDown:
+          policies:
+          - periodSeconds: 120
+            type: Percent
+            value: 10
+          stabilizationWindowSeconds: 1800
+        scaleUp:
+          policies:
+          - periodSeconds: 120
+            type: Percent
+            value: 50
+          - periodSeconds: 120
+            type: Pods
+            value: 15
+          stabilizationWindowSeconds: 120
+  maxReplicaCount: 30
+  minReplicaCount: 3
+  pollingInterval: 10
+  scaleTargetRef:
+    name: distributor-zone-a
+  triggers:
+  - metadata:
+      ignoreNullValues: "false"
+      metricName: cortex_distributor_zone_a_cpu_hpa_default
+      query: |
+        quantile_over_time(0.95,
+          sum(
+            sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m]))
+            and
+            max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0
+          )[15m:]
+        ) * 1000
+        and
+        count (
+          count_over_time(
+            present_over_time(
+              container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m]
+            )[15m:1m]
+          ) >= 15
+        )
+      serverAddress: http://prometheus.default:9090/prometheus
+      threshold: "2000"
+    name: cortex_distributor_zone_a_cpu_hpa_default
+    type: prometheus
+  - metadata:
+      ignoreNullValues: "false"
+      metricName: cortex_distributor_zone_a_memory_hpa_default
+      query: |
+        quantile_over_time(0.95,
+          sum(
+            (
+              sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"})
+              and
+              max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0
+            ) or vector(0)
+          )[15m:]
+        )
+        +
+        sum(
+          sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m]))
+          and
+          max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0)
+          and
+          max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"})
+          or vector(0)
+        )
+        and
+        count (
+          count_over_time(
+            present_over_time(
+              container_memory_working_set_bytes{container="distributor",namespace="default"}[1m]
+            )[15m:1m]
+          ) >= 15
+        )
+      serverAddress: http://prometheus.default:9090/prometheus
+      threshold: "2147483648"
+    name: cortex_distributor_zone_a_memory_hpa_default
+    type: prometheus
+---
+apiVersion: keda.sh/v1alpha1
+kind: ScaledObject
+metadata:
+  name: distributor-zone-b
   namespace: default
 spec:
   advanced:
@@ -2966,11 +3221,11 @@ spec:
   minReplicaCount: 3
   pollingInterval: 10
   scaleTargetRef:
-    name: distributor
+    name: distributor-zone-b
   triggers:
   - metadata:
       ignoreNullValues: "false"
-      metricName: cortex_distributor_cpu_hpa_default
+      metricName: cortex_distributor_zone_b_cpu_hpa_default
       query: |
         quantile_over_time(0.95,
           sum(
@@ -2989,11 +3244,11 @@ spec:
         )
       serverAddress: http://prometheus.default:9090/prometheus
       threshold: "2000"
-    name: cortex_distributor_cpu_hpa_default
+    name: cortex_distributor_zone_b_cpu_hpa_default
     type: prometheus
   - metadata:
       ignoreNullValues: "false"
-      metricName: cortex_distributor_memory_hpa_default
+      metricName: cortex_distributor_zone_b_memory_hpa_default
       query: |
         quantile_over_time(0.95,
           sum(
@@ -3023,7 +3278,7 @@ spec:
         )
       serverAddress: http://prometheus.default:9090/prometheus
       threshold: "2147483648"
-    name: cortex_distributor_memory_hpa_default
+    name: cortex_distributor_zone_b_memory_hpa_default
     type: prometheus
 ---
 apiVersion: keda.sh/v1alpha1
diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-0-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-0-generated.yaml
index 5f50d850772..503937febc2 100644
--- a/operations/mimir-tests/test-ingest-storage-migration-step-0-generated.yaml
+++ b/operations/mimir-tests/test-ingest-storage-migration-step-0-generated.yaml
@@ -33,14 +33,27 @@ apiVersion: policy/v1
 kind: PodDisruptionBudget
 metadata:
   labels:
-    name: distributor
-  name: distributor
+    name: distributor-zone-a
+  name: distributor-zone-a
   namespace: default
 spec:
   maxUnavailable: 1
   selector:
     matchLabels:
-      name: distributor
+      name: distributor-zone-a
+---
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  labels:
+    name: distributor-zone-b
+  name: distributor-zone-b
+  namespace: default
+spec:
+  maxUnavailable: 1
+  selector:
+    matchLabels:
+      name: distributor-zone-b
 ---
 apiVersion: policy/v1
 kind: PodDisruptionBudget
@@ -332,8 +345,8 @@ apiVersion: v1
 kind: Service
 metadata:
   labels:
-    name: distributor
-  name: distributor
+    name: distributor-zone-a
+  name: distributor-zone-a
   namespace: default
 spec:
   clusterIP: None
@@ -348,7 +361,29 @@ spec:
     port: 7946
     targetPort: 7946
   selector:
-    name: distributor
+    name: distributor-zone-a
+---
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    name: distributor-zone-b
+  name: distributor-zone-b
+  namespace: default
+spec:
+  clusterIP: None
+  ports:
+  - name: distributor-http-metrics
+    port: 8080
+    targetPort: 8080
+  - name: distributor-grpc
+    port: 9095
+    targetPort: 9095
+  - name: distributor-gossip-ring
+    port: 7946
+    targetPort: 7946
+  selector:
+    name: distributor-zone-b
 ---
 apiVersion: v1
kind: Service @@ -772,14 +807,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -788,8 +823,128 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ingestion-tenant-shard-size=3 + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - 
-distributor.ha-tracker.enable=true @@ -854,10 +1009,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2778,7 +2938,95 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b namespace: default spec: advanced: @@ -2803,11 +3051,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor + name: distributor-zone-b triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_cpu_hpa_default + metricName: cortex_distributor_zone_b_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -2826,11 +3074,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_cpu_hpa_default + name: cortex_distributor_zone_b_cpu_hpa_default type: prometheus - metadata: 
ignoreNullValues: "false" - metricName: cortex_distributor_memory_hpa_default + metricName: cortex_distributor_zone_b_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -2860,5 +3108,5 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_memory_hpa_default + name: cortex_distributor_zone_b_memory_hpa_default type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-1-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-1-generated.yaml index eab256f0b6e..b0f5b79d4ad 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-1-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-1-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -332,8 +345,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: clusterIP: None @@ -348,7 +361,29 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -841,14 +876,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -857,8 +892,129 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ingestion-tenant-shard-size=3 + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingest-storage.ingestion-partition-tenant-shard-size=1 + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - 
-ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -924,10 +1080,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3280,7 +3441,95 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + 
count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b namespace: default spec: advanced: @@ -3305,11 +3554,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor + name: distributor-zone-b triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_cpu_hpa_default + metricName: cortex_distributor_zone_b_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -3328,11 +3577,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_cpu_hpa_default + name: cortex_distributor_zone_b_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_memory_hpa_default + metricName: cortex_distributor_zone_b_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -3362,5 +3611,5 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_memory_hpa_default + name: cortex_distributor_zone_b_memory_hpa_default type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-10-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-10-generated.yaml index 112d98bb1e0..df2305fc88d 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-10-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-10-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -391,8 +404,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: 
distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: clusterIP: None @@ -407,7 +420,29 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -808,14 +843,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -824,8 +859,135 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ingestion-tenant-shard-size=3 + - -distributor.remote-timeout=5s + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingest-storage.enabled=true + - -ingest-storage.ingestion-partition-tenant-shard-size=1 + - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 + - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 + - -ingest-storage.kafka.topic=ingest + - -ingester.partition-ring.prefix= + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix=partition-ingesters/ + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - 
mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -897,10 +1059,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2774,7 +2941,95 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + 
count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b namespace: default spec: advanced: @@ -2799,11 +3054,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor + name: distributor-zone-b triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_cpu_hpa_default + metricName: cortex_distributor_zone_b_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -2822,11 +3077,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_cpu_hpa_default + name: cortex_distributor_zone_b_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_memory_hpa_default + metricName: cortex_distributor_zone_b_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -2856,7 +3111,7 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_memory_hpa_default + name: cortex_distributor_zone_b_memory_hpa_default type: prometheus --- apiVersion: keda.sh/v1alpha1 diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-11-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-11-generated.yaml index b47b52f9f23..c8d2a21c763 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-11-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-11-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -391,8 +404,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: clusterIP: None @@ -407,7 +420,29 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -808,14 +843,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -824,8 +859,135 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + 
requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ingestion-tenant-shard-size=3 + - -distributor.remote-timeout=5s + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingest-storage.enabled=true + - -ingest-storage.ingestion-partition-tenant-shard-size=1 + - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 + - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 + - -ingest-storage.kafka.topic=ingest + - -ingester.partition-ring.prefix= + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix=partition-ingesters/ + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -897,10 +1059,15 @@ spec: - mountPath: /etc/mimir name: overrides 
       terminationGracePeriodSeconds: 100
+      tolerations:
+      - effect: NoSchedule
+        key: topology
+        operator: Equal
+        value: multi-az
       topologySpreadConstraints:
       - labelSelector:
           matchLabels:
-            name: distributor
+            name: distributor-zone-b
         maxSkew: 1
         topologyKey: kubernetes.io/hostname
         whenUnsatisfiable: ScheduleAnyway
@@ -2778,7 +2945,95 @@ spec:
 apiVersion: keda.sh/v1alpha1
 kind: ScaledObject
 metadata:
-  name: distributor
+  name: distributor-zone-a
+  namespace: default
+spec:
+  advanced:
+    horizontalPodAutoscalerConfig:
+      behavior:
+        scaleDown:
+          policies:
+          - periodSeconds: 120
+            type: Percent
+            value: 10
+          stabilizationWindowSeconds: 1800
+        scaleUp:
+          policies:
+          - periodSeconds: 120
+            type: Percent
+            value: 50
+          - periodSeconds: 120
+            type: Pods
+            value: 15
+          stabilizationWindowSeconds: 120
+  maxReplicaCount: 30
+  minReplicaCount: 3
+  pollingInterval: 10
+  scaleTargetRef:
+    name: distributor-zone-a
+  triggers:
+  - metadata:
+      ignoreNullValues: "false"
+      metricName: cortex_distributor_zone_a_cpu_hpa_default
+      query: |
+        quantile_over_time(0.95,
+          sum(
+            sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m]))
+            and
+            max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0
+          )[15m:]
+        ) * 1000
+        and
+        count (
+          count_over_time(
+            present_over_time(
+              container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m]
+            )[15m:1m]
+          ) >= 15
+        )
+      serverAddress: http://prometheus.default:9090/prometheus
+      threshold: "2000"
+    name: cortex_distributor_zone_a_cpu_hpa_default
+    type: prometheus
+  - metadata:
+      ignoreNullValues: "false"
+      metricName: cortex_distributor_zone_a_memory_hpa_default
+      query: |
+        quantile_over_time(0.95,
+          sum(
+            (
+              sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"})
+              and
+              max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0
+            ) or vector(0)
+          )[15m:]
+        )
+        +
+        sum(
+          sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m]))
+          and
+          max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0)
+          and
+          max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"})
+          or vector(0)
+        )
+        and
+        count (
+          count_over_time(
+            present_over_time(
+              container_memory_working_set_bytes{container="distributor",namespace="default"}[1m]
+            )[15m:1m]
+          ) >= 15
+        )
+      serverAddress: http://prometheus.default:9090/prometheus
+      threshold: "2147483648"
+    name: cortex_distributor_zone_a_memory_hpa_default
+    type: prometheus
+---
+apiVersion: keda.sh/v1alpha1
+kind: ScaledObject
+metadata:
+  name: distributor-zone-b
   namespace: default
 spec:
   advanced:
@@ -2803,11 +3058,11 @@ spec:
   minReplicaCount: 3
   pollingInterval: 10
   scaleTargetRef:
-    name: distributor
+    name: distributor-zone-b
   triggers:
   - metadata:
       ignoreNullValues: "false"
-      metricName: cortex_distributor_cpu_hpa_default
+      metricName: cortex_distributor_zone_b_cpu_hpa_default
       query: |
         quantile_over_time(0.95,
           sum(
@@ -2826,11 +3081,11 @@ spec:
         )
       serverAddress: http://prometheus.default:9090/prometheus
       threshold: "2000"
-    name: cortex_distributor_cpu_hpa_default
+    name: cortex_distributor_zone_b_cpu_hpa_default
     type: prometheus
   - metadata:
       ignoreNullValues: "false"
-      metricName: cortex_distributor_memory_hpa_default
+      metricName: cortex_distributor_zone_b_memory_hpa_default
       query: |
         quantile_over_time(0.95,
           sum(
@@ -2860,7 +3115,7 @@ spec:
         )
       serverAddress: http://prometheus.default:9090/prometheus
       threshold: "2147483648"
-    name: cortex_distributor_memory_hpa_default
+    name: cortex_distributor_zone_b_memory_hpa_default
     type: prometheus
 ---
 apiVersion: keda.sh/v1alpha1
diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-2-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-2-generated.yaml
index f0f2b60ae2e..1006420691d 100644
--- a/operations/mimir-tests/test-ingest-storage-migration-step-2-generated.yaml
+++ b/operations/mimir-tests/test-ingest-storage-migration-step-2-generated.yaml
@@ -33,14 +33,27 @@ apiVersion: policy/v1
 kind: PodDisruptionBudget
 metadata:
   labels:
-    name: distributor
-  name: distributor
+    name: distributor-zone-a
+  name: distributor-zone-a
   namespace: default
 spec:
   maxUnavailable: 1
   selector:
     matchLabels:
-      name: distributor
+      name: distributor-zone-a
+---
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  labels:
+    name: distributor-zone-b
+  name: distributor-zone-b
+  namespace: default
+spec:
+  maxUnavailable: 1
+  selector:
+    matchLabels:
+      name: distributor-zone-b
 ---
 apiVersion: policy/v1
 kind: PodDisruptionBudget
@@ -332,8 +345,8 @@ apiVersion: v1
 kind: Service
 metadata:
   labels:
-    name: distributor
-  name: distributor
+    name: distributor-zone-a
+  name: distributor-zone-a
   namespace: default
 spec:
   clusterIP: None
@@ -348,7 +361,29 @@ spec:
     port: 7946
     targetPort: 7946
   selector:
-    name: distributor
+    name: distributor-zone-a
+---
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    name: distributor-zone-b
+  name: distributor-zone-b
+  namespace: default
+spec:
+  clusterIP: None
+  ports:
+  - name: distributor-http-metrics
+    port: 8080
+    targetPort: 8080
+  - name: distributor-grpc
+    port: 9095
+    targetPort: 9095
+  - name: distributor-gossip-ring
+    port: 7946
+    targetPort: 7946
+  selector:
+    name: distributor-zone-b
 ---
 apiVersion: v1
 kind: Service
@@ -841,14 +876,14 @@ spec:
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: distributor
+  name: distributor-zone-a
   namespace: default
 spec:
   minReadySeconds: 10
   revisionHistoryLimit: 10
   selector:
     matchLabels:
-      name: distributor
+      name: distributor-zone-a
   strategy:
     rollingUpdate:
       maxSurge: 15%
@@ -857,8 +892,136 @@ spec:
     metadata:
      labels:
        gossip_ring_member: "true"
+        name: distributor-zone-a
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: topology.kubernetes.io/zone
+                operator: In
+                values:
+                - us-east-2a
+      containers:
+      - args:
+        - -distributor.ha-tracker.enable=true
+        - -distributor.ha-tracker.enable-for-all-users=true
+        - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379
+        - -distributor.ha-tracker.prefix=prom_ha/
+        - -distributor.ha-tracker.store=etcd
+        - -distributor.health-check-ingesters=true
+        - -distributor.ingestion-burst-size=200000
+        - -distributor.ingestion-rate-limit=10000
+        - -distributor.ingestion-tenant-shard-size=3
+        - -distributor.remote-timeout=5s
+        - -distributor.ring.heartbeat-period=1m
+        - -distributor.ring.heartbeat-timeout=4m
+        - -distributor.ring.prefix=
+        - -distributor.ring.store=memberlist
+        - -ingest-storage.enabled=true
+        - -ingest-storage.ingestion-partition-tenant-shard-size=1
+        - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092
+        - -ingest-storage.kafka.auto-create-topic-default-partitions=1000
+        - -ingest-storage.kafka.topic=ingest
+        - -ingest-storage.migration.distributor-send-to-ingesters-enabled=true
+        - -ingester.partition-ring.prefix=
+        - -ingester.ring.heartbeat-timeout=10m
+        - -ingester.ring.prefix=
+        - -ingester.ring.replication-factor=3
+        - -ingester.ring.store=memberlist
+        - -ingester.ring.zone-awareness-enabled=true
+        - -mem-ballast-size-bytes=1073741824
+        - -memberlist.bind-port=7946
+        - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946
+        - -runtime-config.file=/etc/mimir/overrides.yaml
+        - -server.grpc.keepalive.max-connection-age=60s
+        - -server.grpc.keepalive.max-connection-age-grace=5m
+        - -server.grpc.keepalive.max-connection-idle=1m
+        - -server.grpc.keepalive.min-time-between-pings=10s
+        - -server.grpc.keepalive.ping-without-stream-allowed=true
+        - -server.http-listen-port=8080
+        - -shutdown-delay=90s
+        - -target=distributor
+        - -usage-stats.installation-mode=jsonnet
+        env:
+        - name: GOMAXPROCS
+          value: "8"
+        - name: JAEGER_REPORTER_MAX_QUEUE_SIZE
+          value: "1000"
+        image: grafana/mimir:2.13.0
+        imagePullPolicy: IfNotPresent
         name: distributor
+        ports:
+        - containerPort: 8080
+          name: http-metrics
+        - containerPort: 9095
+          name: grpc
+        - containerPort: 7946
+          name: gossip-ring
+        readinessProbe:
+          httpGet:
+            path: /ready
+            port: 8080
+          initialDelaySeconds: 15
+          timeoutSeconds: 1
+        resources:
+          limits:
+            memory: 4Gi
+          requests:
+            cpu: "2"
+            memory: 2Gi
+        volumeMounts:
+        - mountPath: /etc/mimir
+          name: overrides
+      terminationGracePeriodSeconds: 100
+      tolerations:
+      - effect: NoSchedule
+        key: topology
+        operator: Equal
+        value: multi-az
+      topologySpreadConstraints:
+      - labelSelector:
+          matchLabels:
+            name: distributor-zone-a
+        maxSkew: 1
+        topologyKey: kubernetes.io/hostname
+        whenUnsatisfiable: ScheduleAnyway
+      volumes:
+      - configMap:
+          name: overrides
+        name: overrides
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: distributor-zone-b
+  namespace: default
+spec:
+  minReadySeconds: 10
+  revisionHistoryLimit: 10
+  selector:
+    matchLabels:
+      name: distributor-zone-b
+  strategy:
+    rollingUpdate:
+      maxSurge: 15%
+      maxUnavailable: 0
+  template:
+    metadata:
+      labels:
+        gossip_ring_member: "true"
+        name: distributor-zone-b
     spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: topology.kubernetes.io/zone
+                operator: In
+                values:
+                - us-east-2b
       containers:
       - args:
         - -distributor.ha-tracker.enable=true
@@ -931,10 +1094,15 @@ spec:
         - mountPath: /etc/mimir
           name: overrides
       terminationGracePeriodSeconds: 100
+      tolerations:
+      - effect: NoSchedule
+        key: topology
+        operator: Equal
+        value: multi-az
       topologySpreadConstraints:
       - labelSelector:
           matchLabels:
-            name: distributor
+            name: distributor-zone-b
         maxSkew: 1
         topologyKey: kubernetes.io/hostname
         whenUnsatisfiable: ScheduleAnyway
@@ -3293,7 +3461,95 @@ spec:
 apiVersion: keda.sh/v1alpha1
 kind: ScaledObject
 metadata:
-  name: distributor
+  name: distributor-zone-a
+  namespace: default
+spec:
+  advanced:
+    horizontalPodAutoscalerConfig:
+      behavior:
+        scaleDown:
+          policies:
+          - periodSeconds: 120
+            type: Percent
+            value: 10
+          stabilizationWindowSeconds: 1800
+        scaleUp:
+          policies:
+          - periodSeconds: 120
+            type: Percent
+            value: 50
+          - periodSeconds: 120
+            type: Pods
+            value: 15
+          stabilizationWindowSeconds: 120
+  maxReplicaCount: 30
+  minReplicaCount: 3
+  pollingInterval: 10
+  scaleTargetRef:
+    name: distributor-zone-a
+  triggers:
+  - metadata:
+      ignoreNullValues: "false"
+      metricName: cortex_distributor_zone_a_cpu_hpa_default
+      query: |
+        quantile_over_time(0.95,
+          sum(
+            sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m]))
+            and
+            max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0
+          )[15m:]
+        ) * 1000
+        and
+        count (
+          count_over_time(
+            present_over_time(
+              container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m]
+            )[15m:1m]
+          ) >= 15
+        )
+      serverAddress: http://prometheus.default:9090/prometheus
+      threshold: "2000"
+    name: cortex_distributor_zone_a_cpu_hpa_default
+    type: prometheus
+  - metadata:
+      ignoreNullValues: "false"
+      metricName: cortex_distributor_zone_a_memory_hpa_default
+      query: |
+        quantile_over_time(0.95,
+          sum(
+            (
+              sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"})
+              and
+              max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0
+            ) or vector(0)
+          )[15m:]
+        )
+        +
+        sum(
+          sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m]))
+          and
+          max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0)
+          and
+          max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"})
+          or vector(0)
+        )
+        and
+        count (
+          count_over_time(
+            present_over_time(
+              container_memory_working_set_bytes{container="distributor",namespace="default"}[1m]
+            )[15m:1m]
+          ) >= 15
+        )
+      serverAddress: http://prometheus.default:9090/prometheus
+      threshold: "2147483648"
+    name: cortex_distributor_zone_a_memory_hpa_default
+    type: prometheus
+---
+apiVersion: keda.sh/v1alpha1
+kind: ScaledObject
+metadata:
+  name: distributor-zone-b
   namespace: default
 spec:
   advanced:
@@ -3318,11 +3574,11 @@ spec:
   minReplicaCount: 3
   pollingInterval: 10
   scaleTargetRef:
-    name: distributor
+    name: distributor-zone-b
   triggers:
   - metadata:
       ignoreNullValues: "false"
-      metricName: cortex_distributor_cpu_hpa_default
+      metricName: cortex_distributor_zone_b_cpu_hpa_default
       query: |
         quantile_over_time(0.95,
           sum(
@@ -3341,11 +3597,11 @@ spec:
         )
       serverAddress: http://prometheus.default:9090/prometheus
       threshold: "2000"
-    name: cortex_distributor_cpu_hpa_default
+    name: cortex_distributor_zone_b_cpu_hpa_default
     type: prometheus
   - metadata:
       ignoreNullValues: "false"
-      metricName: cortex_distributor_memory_hpa_default
+      metricName: cortex_distributor_zone_b_memory_hpa_default
       query: |
         quantile_over_time(0.95,
           sum(
@@ -3375,5 +3631,5 @@ spec:
         )
       serverAddress: http://prometheus.default:9090/prometheus
       threshold: "2147483648"
-    name: cortex_distributor_memory_hpa_default
+    name: cortex_distributor_zone_b_memory_hpa_default
     type: prometheus
diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-3-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-3-generated.yaml
index d58513ca677..f789b04d721 100644
--- a/operations/mimir-tests/test-ingest-storage-migration-step-3-generated.yaml
+++ b/operations/mimir-tests/test-ingest-storage-migration-step-3-generated.yaml
@@ -33,14 +33,27 @@ apiVersion: policy/v1
 kind: PodDisruptionBudget
 metadata:
   labels:
-    name: distributor
-  name: distributor
+    name: distributor-zone-a
+  name: distributor-zone-a
   namespace: default
 spec:
   maxUnavailable: 1
   selector:
     matchLabels:
-      name: distributor
+      name: distributor-zone-a
+---
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  labels:
name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -332,8 +345,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: clusterIP: None @@ -348,7 +361,29 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -841,14 +876,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -857,8 +892,136 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ingestion-tenant-shard-size=3 + - -distributor.remote-timeout=5s + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingest-storage.enabled=true + - -ingest-storage.ingestion-partition-tenant-shard-size=1 + - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 + - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 + - -ingest-storage.kafka.topic=ingest + - -ingest-storage.migration.distributor-send-to-ingesters-enabled=true + - -ingester.partition-ring.prefix= + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + 
ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -931,10 +1094,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3315,7 +3483,95 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", 
resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b namespace: default spec: advanced: @@ -3340,11 +3596,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor + name: distributor-zone-b triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_cpu_hpa_default + metricName: cortex_distributor_zone_b_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -3363,11 +3619,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_cpu_hpa_default + name: cortex_distributor_zone_b_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_memory_hpa_default + metricName: cortex_distributor_zone_b_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -3397,5 +3653,5 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_memory_hpa_default + name: cortex_distributor_zone_b_memory_hpa_default type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-4-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-4-generated.yaml index 11b1f1d9123..f0c4b1ccaf2 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-4-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-4-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -332,8 +345,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: clusterIP: None @@ -348,7 +361,29 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -841,14 +876,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: 
minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -857,8 +892,135 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ingestion-tenant-shard-size=3 + - -distributor.remote-timeout=5s + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingest-storage.enabled=true + - -ingest-storage.ingestion-partition-tenant-shard-size=1 + - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 + - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 + - -ingest-storage.kafka.topic=ingest + - -ingester.partition-ring.prefix= + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + 
requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -930,10 +1092,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3313,7 +3480,95 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b namespace: default spec: advanced: @@ -3338,11 +3593,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor + name: distributor-zone-b triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_cpu_hpa_default + metricName: cortex_distributor_zone_b_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -3361,11 +3616,11 @@ spec: ) serverAddress: 
http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_cpu_hpa_default + name: cortex_distributor_zone_b_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_memory_hpa_default + metricName: cortex_distributor_zone_b_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -3395,5 +3650,5 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_memory_hpa_default + name: cortex_distributor_zone_b_memory_hpa_default type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-5a-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-5a-generated.yaml index 54b7a90c838..5d3caad09f5 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-5a-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-5a-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -332,8 +345,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: clusterIP: None @@ -348,7 +361,29 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -841,14 +876,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -857,8 +892,135 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ingestion-tenant-shard-size=3 + - -distributor.remote-timeout=5s + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - 
-ingest-storage.enabled=true + - -ingest-storage.ingestion-partition-tenant-shard-size=1 + - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 + - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 + - -ingest-storage.kafka.topic=ingest + - -ingester.partition-ring.prefix= + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -930,10 +1092,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3313,7 +3480,95 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 
10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b namespace: default spec: advanced: @@ -3338,11 +3593,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor + name: distributor-zone-b triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_cpu_hpa_default + metricName: cortex_distributor_zone_b_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -3361,11 +3616,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_cpu_hpa_default + name: cortex_distributor_zone_b_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_memory_hpa_default + metricName: cortex_distributor_zone_b_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -3395,5 +3650,5 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_memory_hpa_default + name: cortex_distributor_zone_b_memory_hpa_default type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-5b-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-5b-generated.yaml index f4302d9a64c..18a27160cfe 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-5b-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-5b-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: 
maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -332,8 +345,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: clusterIP: None @@ -348,7 +361,29 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -841,14 +876,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -857,8 +892,135 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ingestion-tenant-shard-size=3 + - -distributor.remote-timeout=5s + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingest-storage.enabled=true + - -ingest-storage.ingestion-partition-tenant-shard-size=1 + - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 + - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 + - -ingest-storage.kafka.topic=ingest + - -ingester.partition-ring.prefix= + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + 
value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -930,10 +1092,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3313,7 +3480,95 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) 
(max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b namespace: default spec: advanced: @@ -3338,11 +3593,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor + name: distributor-zone-b triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_cpu_hpa_default + metricName: cortex_distributor_zone_b_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -3361,11 +3616,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_cpu_hpa_default + name: cortex_distributor_zone_b_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_memory_hpa_default + metricName: cortex_distributor_zone_b_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -3395,5 +3650,5 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_memory_hpa_default + name: cortex_distributor_zone_b_memory_hpa_default type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-6-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-6-generated.yaml index 5c30c4029d1..de991d69511 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-6-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-6-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -332,8 +345,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: clusterIP: None @@ -348,7 +361,29 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -772,14 +807,14 @@ spec: apiVersion: apps/v1 
kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -788,8 +823,135 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ingestion-tenant-shard-size=3 + - -distributor.remote-timeout=5s + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingest-storage.enabled=true + - -ingest-storage.ingestion-partition-tenant-shard-size=1 + - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 + - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 + - -ingest-storage.kafka.topic=ingest + - -ingester.partition-ring.prefix= + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: 
"true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -861,10 +1023,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2842,7 +3009,95 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b namespace: default spec: advanced: @@ -2867,11 +3122,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor + name: distributor-zone-b triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_cpu_hpa_default + metricName: cortex_distributor_zone_b_cpu_hpa_default query: | 
quantile_over_time(0.95, sum( @@ -2890,11 +3145,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_cpu_hpa_default + name: cortex_distributor_zone_b_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_memory_hpa_default + metricName: cortex_distributor_zone_b_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -2924,5 +3179,5 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_memory_hpa_default + name: cortex_distributor_zone_b_memory_hpa_default type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-7-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-7-generated.yaml index 9b6813c3d61..a41c63b5f97 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-7-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-7-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -332,8 +345,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: clusterIP: None @@ -348,7 +361,29 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -772,14 +807,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -788,8 +823,135 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ingestion-tenant-shard-size=3 + - -distributor.remote-timeout=5s + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - 
-distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingest-storage.enabled=true + - -ingest-storage.ingestion-partition-tenant-shard-size=1 + - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 + - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 + - -ingest-storage.kafka.topic=ingest + - -ingester.partition-ring.prefix= + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix=partition-ingesters/ + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -861,10 +1023,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2870,7 +3037,95 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + 
stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b namespace: default spec: advanced: @@ -2895,11 +3150,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor + name: distributor-zone-b triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_cpu_hpa_default + metricName: cortex_distributor_zone_b_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -2918,11 +3173,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_cpu_hpa_default + name: cortex_distributor_zone_b_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_memory_hpa_default + metricName: cortex_distributor_zone_b_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -2952,5 +3207,5 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_memory_hpa_default + name: cortex_distributor_zone_b_memory_hpa_default type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-8-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-8-generated.yaml index ba118d1d7fe..253675e25ab 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-8-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-8-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: 
distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -332,8 +345,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: clusterIP: None @@ -348,7 +361,29 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -772,14 +807,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -788,8 +823,135 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ingestion-tenant-shard-size=3 + - -distributor.remote-timeout=5s + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingest-storage.enabled=true + - -ingest-storage.ingestion-partition-tenant-shard-size=1 + - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 + - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 + - -ingest-storage.kafka.topic=ingest + - -ingester.partition-ring.prefix= + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix=partition-ingesters/ + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - 
-usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -861,10 +1023,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2870,7 +3037,95 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) 
(min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b namespace: default spec: advanced: @@ -2895,11 +3150,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor + name: distributor-zone-b triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_cpu_hpa_default + metricName: cortex_distributor_zone_b_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -2918,11 +3173,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_cpu_hpa_default + name: cortex_distributor_zone_b_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_memory_hpa_default + metricName: cortex_distributor_zone_b_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -2952,5 +3207,5 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_memory_hpa_default + name: cortex_distributor_zone_b_memory_hpa_default type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-9-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-9-generated.yaml index c57421b59a8..0272084002c 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-9-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-9-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -332,8 +345,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: clusterIP: None @@ -348,7 +361,29 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 
+ targetPort: 7946 + selector: + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -749,14 +784,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -765,8 +800,135 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ingestion-tenant-shard-size=3 + - -distributor.remote-timeout=5s + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingest-storage.enabled=true + - -ingest-storage.ingestion-partition-tenant-shard-size=1 + - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 + - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 + - -ingest-storage.kafka.topic=ingest + - -ingester.partition-ring.prefix= + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix=partition-ingesters/ + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + 
matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -838,10 +1000,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2707,7 +2874,95 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b namespace: default spec: advanced: @@ -2732,11 +2987,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor + name: distributor-zone-b triggers: - 
metadata: ignoreNullValues: "false" - metricName: cortex_distributor_cpu_hpa_default + metricName: cortex_distributor_zone_b_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -2755,11 +3010,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_cpu_hpa_default + name: cortex_distributor_zone_b_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_memory_hpa_default + metricName: cortex_distributor_zone_b_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -2789,5 +3044,5 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_memory_hpa_default + name: cortex_distributor_zone_b_memory_hpa_default type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-final-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-final-generated.yaml index 622c0eed712..0a398f89611 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-final-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-final-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -391,8 +404,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: clusterIP: None @@ -407,7 +420,29 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -831,14 +866,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -847,8 +882,135 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - 
-distributor.ingestion-tenant-shard-size=3 + - -distributor.remote-timeout=5s + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingest-storage.enabled=true + - -ingest-storage.ingestion-partition-tenant-shard-size=1 + - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 + - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 + - -ingest-storage.kafka.topic=ingest + - -ingester.partition-ring.prefix= + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix=partition-ingesters/ + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -920,10 +1082,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2941,7 +3108,95 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 
10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b namespace: default spec: advanced: @@ -2966,11 +3221,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor + name: distributor-zone-b triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_cpu_hpa_default + metricName: cortex_distributor_zone_b_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -2989,11 +3244,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_cpu_hpa_default + name: cortex_distributor_zone_b_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_memory_hpa_default + metricName: cortex_distributor_zone_b_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -3023,7 +3278,7 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_memory_hpa_default + name: cortex_distributor_zone_b_memory_hpa_default type: prometheus --- apiVersion: keda.sh/v1alpha1 diff --git a/operations/mimir-tests/test-multi-zone-generated.yaml b/operations/mimir-tests/test-multi-zone-generated.yaml index b0538a2a48c..fcddfd953ab 100644 --- a/operations/mimir-tests/test-multi-zone-generated.yaml +++ b/operations/mimir-tests/test-multi-zone-generated.yaml @@ 
-33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -293,8 +306,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: clusterIP: None @@ -309,7 +322,29 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -655,14 +690,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: distributor-zone-a namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -671,8 +706,127 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - 
containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -736,10 +890,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2385,7 +2544,95 @@ spec: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) 
(changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b namespace: default spec: advanced: @@ -2410,11 +2657,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor + name: distributor-zone-b triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_cpu_hpa_default + metricName: cortex_distributor_zone_b_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -2433,11 +2680,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_cpu_hpa_default + name: cortex_distributor_zone_b_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_memory_hpa_default + metricName: cortex_distributor_zone_b_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -2467,5 +2714,5 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_memory_hpa_default + name: cortex_distributor_zone_b_memory_hpa_default type: prometheus diff --git a/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling-generated.yaml b/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling-generated.yaml index ce74b0731ee..9d3ea8fb881 100644 --- a/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling-generated.yaml +++ b/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling-generated.yaml @@ -33,14 +33,27 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor + name: distributor-zone-a +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor-zone-b --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -356,8 +369,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor - name: distributor + name: distributor-zone-a + name: distributor-zone-a namespace: default spec: clusterIP: None @@ -372,7 +385,29 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-b --- apiVersion: v1 kind: Service @@ -732,14 +767,14 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor + name: 
distributor-zone-a namespace: default spec: minReadySeconds: 10 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor + name: distributor-zone-a strategy: rollingUpdate: maxSurge: 15% @@ -748,8 +783,127 @@ spec: metadata: labels: gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -813,10 +967,15 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: 
topology + operator: Equal + value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor + name: distributor-zone-b maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2514,7 +2673,95 @@ webhooks: apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: distributor + name: distributor-zone-a + namespace: default +spec: + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + policies: + - periodSeconds: 120 + type: Percent + value: 10 + stabilizationWindowSeconds: 1800 + scaleUp: + policies: + - periodSeconds: 120 + type: Percent + value: 50 + - periodSeconds: 120 + type: Pods + value: 15 + stabilizationWindowSeconds: 120 + maxReplicaCount: 30 + minReplicaCount: 3 + pollingInterval: 10 + scaleTargetRef: + name: distributor-zone-a + triggers: + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_cpu_hpa_default + query: | + quantile_over_time(0.95, + sum( + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + )[15m:] + ) * 1000 + and + count ( + count_over_time( + present_over_time( + container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2000" + name: cortex_distributor_zone_a_cpu_hpa_default + type: prometheus + - metadata: + ignoreNullValues: "false" + metricName: cortex_distributor_zone_a_memory_hpa_default + query: | + quantile_over_time(0.95, + sum( + ( + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + and + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + ) or vector(0) + )[15m:] + ) + + + sum( + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + and + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + and + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + or vector(0) + ) + and + count ( + count_over_time( + present_over_time( + container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + )[15m:1m] + ) >= 15 + ) + serverAddress: http://prometheus.default:9090/prometheus + threshold: "2147483648" + name: cortex_distributor_zone_a_memory_hpa_default + type: prometheus +--- +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: distributor-zone-b namespace: default spec: advanced: @@ -2539,11 +2786,11 @@ spec: minReplicaCount: 3 pollingInterval: 10 scaleTargetRef: - name: distributor + name: distributor-zone-b triggers: - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_cpu_hpa_default + metricName: cortex_distributor_zone_b_cpu_hpa_default query: | quantile_over_time(0.95, sum( @@ -2562,11 +2809,11 @@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2000" - name: cortex_distributor_cpu_hpa_default + name: cortex_distributor_zone_b_cpu_hpa_default type: prometheus - metadata: ignoreNullValues: "false" - metricName: cortex_distributor_memory_hpa_default + metricName: cortex_distributor_zone_b_memory_hpa_default query: | quantile_over_time(0.95, sum( @@ -2596,7 +2843,7 
@@ spec: ) serverAddress: http://prometheus.default:9090/prometheus threshold: "2147483648" - name: cortex_distributor_memory_hpa_default + name: cortex_distributor_zone_b_memory_hpa_default type: prometheus --- apiVersion: admissionregistration.k8s.io/v1 From daf8d5d4615f664c7aa1920ddf8f5b1ab201dffd Mon Sep 17 00:00:00 2001 From: Yuri Nikolic Date: Tue, 8 Oct 2024 14:50:57 +0200 Subject: [PATCH 7/8] Fixing review findings Signed-off-by: Yuri Nikolic --- ...custom-stabilization-window-generated.yaml | 36 +++++++++---------- ...toscaling-multiple-triggers-generated.yaml | 36 +++++++++---------- ...age-autoscaling-one-trigger-generated.yaml | 36 +++++++++---------- ...st-storage-migration-step-0-generated.yaml | 36 +++++++++---------- ...st-storage-migration-step-1-generated.yaml | 36 +++++++++---------- ...t-storage-migration-step-10-generated.yaml | 36 +++++++++---------- ...t-storage-migration-step-11-generated.yaml | 36 +++++++++---------- ...st-storage-migration-step-2-generated.yaml | 36 +++++++++---------- ...st-storage-migration-step-3-generated.yaml | 36 +++++++++---------- ...st-storage-migration-step-4-generated.yaml | 36 +++++++++---------- ...t-storage-migration-step-5a-generated.yaml | 36 +++++++++---------- ...t-storage-migration-step-5b-generated.yaml | 36 +++++++++---------- ...st-storage-migration-step-6-generated.yaml | 36 +++++++++---------- ...st-storage-migration-step-7-generated.yaml | 36 +++++++++---------- ...st-storage-migration-step-8-generated.yaml | 36 +++++++++---------- ...st-storage-migration-step-9-generated.yaml | 36 +++++++++---------- ...torage-migration-step-final-generated.yaml | 36 +++++++++---------- .../test-multi-zone-generated.yaml | 36 +++++++++---------- ...teway-automated-downscaling-generated.yaml | 36 +++++++++---------- operations/mimir/autoscaling.libsonnet | 29 +++++++-------- 20 files changed, 357 insertions(+), 356 deletions(-) diff --git a/operations/mimir-tests/test-ingest-storage-autoscaling-custom-stabilization-window-generated.yaml b/operations/mimir-tests/test-ingest-storage-autoscaling-custom-stabilization-window-generated.yaml index 618cf612aa1..acc37dcbb8d 100644 --- a/operations/mimir-tests/test-ingest-storage-autoscaling-custom-stabilization-window-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-autoscaling-custom-stabilization-window-generated.yaml @@ -3141,16 +3141,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3165,26 +3165,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) and - max by (pod) 
-              max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0
+              max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0
             ) or vector(0)
           )[15m:]
         )
         +
         sum(
-          sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m]))
+          sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m]))
           and
-          max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0)
+          max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0)
           and
-          max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"})
+          max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"})
           or vector(0)
         )
         and
         count (
           count_over_time(
             present_over_time(
-              container_memory_working_set_bytes{container="distributor",namespace="default"}[1m]
+              container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m]
             )[15m:1m]
           ) >= 15
         )
@@ -3229,16 +3229,16 @@ spec:
       query: |
         quantile_over_time(0.95,
           sum(
-            sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m]))
+            sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m]))
             and
-            max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0
+            max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0
           )[15m:]
         ) * 1000
         and
         count (
           count_over_time(
             present_over_time(
-              container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m]
+              container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m]
             )[15m:1m]
           ) >= 15
         )
@@ -3253,26 +3253,26 @@ spec:
         quantile_over_time(0.95,
           sum(
             (
-              sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"})
+              sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"})
              and
-              max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0
+              max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0
             ) or vector(0)
           )[15m:]
         )
         +
         sum(
-          sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m]))
+          sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m]))
           and
-          max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0)
+          max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0)
           and
-          max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"})
+          max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"})
           or vector(0)
         )
         and
         count (
           count_over_time(
             present_over_time(
-              container_memory_working_set_bytes{container="distributor",namespace="default"}[1m]
+              container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m]
             )[15m:1m]
           ) >= 15
         )
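Every hunk in this patch makes the same correction: both zones' pods run a container literally named "distributor" in the same namespace, so without a pod-level matcher each zone's trigger measured the combined CPU and memory of both zones, and each zone's HPA reacted to the other zone's load. Appending pod=~"distributor-zone-a.*" (or the zone-b equivalent) to every selector scopes each ScaledObject's queries to the pods it actually scales. A sketch of how the shared query builder in operations/mimir/autoscaling.libsonnet could thread the matcher through; the function and field names here are illustrative, not taken from the patch:

    // Render the extra label matcher appended to every metric selector of a
    // ScaledObject's queries; empty when the scaled Deployment is the plain
    // single-zone 'distributor', so those outputs stay unchanged.
    local podMatcher(deployment_name) =
      if deployment_name == 'distributor'
      then ''
      else ',pod=~"%s.*"' % deployment_name;

    // podMatcher('distributor-zone-a') yields ',pod=~"distributor-zone-a.*"',
    // which the query templates splice into each selector, e.g.
    // container_cpu_usage_seconds_total{container="distributor",namespace="default"%s}.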
diff --git a/operations/mimir-tests/test-ingest-storage-autoscaling-multiple-triggers-generated.yaml b/operations/mimir-tests/test-ingest-storage-autoscaling-multiple-triggers-generated.yaml
index fd9b480bd20..e323b82f7f9 100644
--- a/operations/mimir-tests/test-ingest-storage-autoscaling-multiple-triggers-generated.yaml
+++ b/operations/mimir-tests/test-ingest-storage-autoscaling-multiple-triggers-generated.yaml
@@ -3141,16 +3141,16 @@ spec:
       query: |
         quantile_over_time(0.95,
           sum(
-            sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m]))
+            sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m]))
             and
-            max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0
+            max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0
           )[15m:]
         ) * 1000
         and
         count (
           count_over_time(
             present_over_time(
-              container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m]
+              container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m]
             )[15m:1m]
           ) >= 15
         )
@@ -3165,26 +3165,26 @@ spec:
         quantile_over_time(0.95,
           sum(
             (
-              sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"})
+              sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"})
               and
-              max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0
+              max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0
             ) or vector(0)
           )[15m:]
         )
         +
         sum(
-          sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m]))
+          sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m]))
           and
-          max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0)
+          max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0)
           and
-          max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"})
+          max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"})
           or vector(0)
         )
         and
         count (
           count_over_time(
             present_over_time(
-              container_memory_working_set_bytes{container="distributor",namespace="default"}[1m]
+              container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m]
             )[15m:1m]
           ) >= 15
         )
@@ -3229,16 +3229,16 @@ spec:
       query: |
         quantile_over_time(0.95,
           sum(
-            sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m]))
+            sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m]))
             and
-            max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0
+            max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0
           )[15m:]
         ) * 1000
         and
         count (
           count_over_time(
             present_over_time(
-              container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m]
+              container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m]
             )[15m:1m]
           ) >= 15
         )
@@ -3253,26 +3253,26 @@ spec:
         quantile_over_time(0.95,
           sum(
             (
-              sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"})
+              sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"})
               and
-              max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0
+              max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0
             ) or vector(0)
           )[15m:]
         )
         +
         sum(
-          sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m]))
+          sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m]))
           and
-          max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0)
+          max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0)
           and
-          max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"})
+          max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"})
           or vector(0)
         )
         and
         count (
           count_over_time(
             present_over_time(
-              container_memory_working_set_bytes{container="distributor",namespace="default"}[1m]
+              container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m]
             )[15m:1m]
           ) >= 15
         )
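Two details of these generated queries are worth noting. First, each trigger ends with the guard

    and
    count (
      count_over_time(
        present_over_time(<metric selector>[1m])[15m:1m]
      ) >= 15
    )

which only returns a value once the underlying series has existed in all fifteen one-minute windows of the 15-minute range; together with ignoreNullValues: "false" (KEDA's Prometheus scaler then treats an empty query result as an error rather than as 0), this keeps a freshly created or renamed deployment from autoscaling on partial data. Second, the thresholds are per-pod averages: the CPU query multiplies cores by 1000, so threshold "2000" means 2 CPU cores in millicores, and "2147483648" is 2 GiB of working-set memory.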
diff --git a/operations/mimir-tests/test-ingest-storage-autoscaling-one-trigger-generated.yaml b/operations/mimir-tests/test-ingest-storage-autoscaling-one-trigger-generated.yaml
index 68e194b235f..cd3b23f8823 100644
--- a/operations/mimir-tests/test-ingest-storage-autoscaling-one-trigger-generated.yaml
+++ b/operations/mimir-tests/test-ingest-storage-autoscaling-one-trigger-generated.yaml
@@ -3141,16 +3141,16 @@ spec:
       query: |
         quantile_over_time(0.95,
           sum(
-            sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m]))
+            sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m]))
             and
-            max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0
+            max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0
           )[15m:]
         ) * 1000
         and
         count (
           count_over_time(
             present_over_time(
-              container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m]
+              container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m]
             )[15m:1m]
           ) >= 15
         )
@@ -3165,26 +3165,26 @@ spec:
         quantile_over_time(0.95,
           sum(
             (
-              sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"})
+              sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"})
               and
-              max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0
+              max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0
             ) or vector(0)
           )[15m:]
         )
         +
         sum(
-          sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m]))
+          sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m]))
           and
-          max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0)
+          max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0)
           and
-          max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"})
+          max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"})
           or vector(0)
         )
         and
         count (
           count_over_time(
             present_over_time(
-              container_memory_working_set_bytes{container="distributor",namespace="default"}[1m]
+              container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m]
             )[15m:1m]
           ) >= 15
         )
@@ -3229,16 +3229,16 @@ spec:
       query: |
         quantile_over_time(0.95,
           sum(
-            sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m]))
+            sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m]))
             and
-            max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0
+            max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0
           )[15m:]
         ) * 1000
         and
         count (
           count_over_time(
             present_over_time(
-              container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m]
+              container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m]
             )[15m:1m]
           ) >= 15
         )
@@ -3253,26 +3253,26 @@ spec:
         quantile_over_time(0.95,
           sum(
             (
-              sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"})
+              sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"})
               and
-              max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0
+              max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0
             ) or vector(0)
           )[15m:]
         )
         +
         sum(
-          sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m]))
+          sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m]))
           and
-          max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0)
+          max by (pod)
(changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-0-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-0-generated.yaml index 503937febc2..277b97dbc0f 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-0-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-0-generated.yaml @@ -2971,16 +2971,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -2995,26 +2995,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + 
container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3059,16 +3059,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3083,26 +3083,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-1-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-1-generated.yaml index b0f5b79d4ad..2ad072737c7 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-1-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-1-generated.yaml @@ -3474,16 +3474,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) 
(min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3498,26 +3498,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3562,16 +3562,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3586,26 +3586,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) 
(max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-10-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-10-generated.yaml index df2305fc88d..81ed79856d6 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-10-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-10-generated.yaml @@ -2974,16 +2974,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -2998,26 +2998,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) and - max by (pod) 
(kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3062,16 +3062,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3086,26 +3086,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-11-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-11-generated.yaml index c8d2a21c763..0000c478769 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-11-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-11-generated.yaml @@ -2978,16 +2978,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) 
(rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3002,26 +3002,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3066,16 +3066,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3090,26 +3090,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) 
(container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-2-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-2-generated.yaml index 1006420691d..f6417ab7d97 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-2-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-2-generated.yaml @@ -3494,16 +3494,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3518,26 +3518,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", 
resource="memory",pod=~"distributor-zone-a.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3582,16 +3582,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3606,26 +3606,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-3-generated.yaml 
b/operations/mimir-tests/test-ingest-storage-migration-step-3-generated.yaml index f789b04d721..6242839f973 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-3-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-3-generated.yaml @@ -3516,16 +3516,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3540,26 +3540,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3604,16 +3604,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + 
container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3628,26 +3628,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-4-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-4-generated.yaml index f0c4b1ccaf2..dd6a09b9b69 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-4-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-4-generated.yaml @@ -3513,16 +3513,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3537,26 +3537,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 ) or 
vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3601,16 +3601,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3625,26 +3625,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - 
container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-5a-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-5a-generated.yaml index 5d3caad09f5..971823dcf0c 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-5a-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-5a-generated.yaml @@ -3513,16 +3513,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3537,26 +3537,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3601,16 +3601,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) 
(min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3625,26 +3625,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-5b-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-5b-generated.yaml index 18a27160cfe..edf18f75652 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-5b-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-5b-generated.yaml @@ -3513,16 +3513,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3537,26 +3537,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) 
(container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3601,16 +3601,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3625,26 +3625,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) and - max by (pod) 
(kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-6-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-6-generated.yaml index de991d69511..a3d03e8ed06 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-6-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-6-generated.yaml @@ -3042,16 +3042,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3066,26 +3066,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3130,16 +3130,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) 
(rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3154,26 +3154,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-7-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-7-generated.yaml index a41c63b5f97..7cbd02874f8 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-7-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-7-generated.yaml @@ -3070,16 +3070,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + 
container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3094,26 +3094,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3158,16 +3158,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3182,26 +3182,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) and - max by (pod) 
(changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-8-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-8-generated.yaml index 253675e25ab..1a6d4979b2f 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-8-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-8-generated.yaml @@ -3070,16 +3070,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3094,26 +3094,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - 
container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3158,16 +3158,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3182,26 +3182,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-9-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-9-generated.yaml index 0272084002c..8e40cb7b9ea 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-9-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-9-generated.yaml @@ -2907,16 +2907,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) 
(min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -2931,26 +2931,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -2995,16 +2995,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3019,26 +3019,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) 
(max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-final-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-final-generated.yaml index 0a398f89611..f735026f40f 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-final-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-final-generated.yaml @@ -3141,16 +3141,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3165,26 +3165,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) and - max by (pod) 
(kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3229,16 +3229,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) @@ -3253,26 +3253,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) diff --git a/operations/mimir-tests/test-multi-zone-generated.yaml b/operations/mimir-tests/test-multi-zone-generated.yaml index fcddfd953ab..8fb4577c280 100644 --- a/operations/mimir-tests/test-multi-zone-generated.yaml +++ b/operations/mimir-tests/test-multi-zone-generated.yaml @@ -2577,16 +2577,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum 
by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -2601,26 +2601,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -2665,16 +2665,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) @@ -2689,26 +2689,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) and - max by (pod) 
(min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) diff --git a/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling-generated.yaml b/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling-generated.yaml index 9d3ea8fb881..8b24a075e45 100644 --- a/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling-generated.yaml +++ b/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling-generated.yaml @@ -2706,16 +2706,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -2730,26 +2730,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) and - max by (pod) 
(changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] )[15m:1m] ) >= 15 ) @@ -2794,16 +2794,16 @@ spec: query: | quantile_over_time(0.95, sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) + sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 )[15m:] ) * 1000 and count ( count_over_time( present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m] + container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) @@ -2818,26 +2818,26 @@ spec: quantile_over_time(0.95, sum( ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default"}) + sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0 + max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 ) or vector(0) )[15m:] ) + sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory"}[15m])) + sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default"}[15m]) > 0) + max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"}) + max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) or vector(0) ) and count ( count_over_time( present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default"}[1m] + container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] )[15m:1m] ) >= 15 ) diff --git a/operations/mimir/autoscaling.libsonnet b/operations/mimir/autoscaling.libsonnet index 22193d1083b..f4c0c1df596 100644 --- a/operations/mimir/autoscaling.libsonnet +++ 
b/operations/mimir/autoscaling.libsonnet
@@ -581,6 +581,7 @@
       memory_target_utilization=$._config.autoscaling_distributor_memory_target_utilization,
       with_cortex_prefix=true,
       with_ready_trigger=true,
+      pod_regex=pod_regex,
     ) + (
       {
         spec+: {
@@ -635,43 +636,43 @@
         }
       ),

-  local isMultiZoneEnabled = $._config.multi_zone_distributor_enabled,
-  local isAutoscalingEnabled = $._config.autoscaling_distributor_enabled,
-  local isAutoscalingSingleZoneEnabled = !isMultiZoneEnabled && isAutoscalingEnabled,
-  local isAutoscalingZoneAEnabled = isMultiZoneEnabled && isAutoscalingEnabled && std.length($._config.multi_zone_distributor_availability_zones) >= 1,
-  local isAutoscalingZoneBEnabled = isMultiZoneEnabled && isAutoscalingEnabled && std.length($._config.multi_zone_distributor_availability_zones) >= 2,
-  local isAutoscalingZoneCEnabled = isMultiZoneEnabled && isAutoscalingEnabled && std.length($._config.multi_zone_distributor_availability_zones) >= 3,
+  local isDistributorMultiZoneEnabled = $._config.multi_zone_distributor_enabled,
+  local isDistributorAutoscalingEnabled = $._config.autoscaling_distributor_enabled,
+  local isDistributorAutoscalingSingleZoneEnabled = !isDistributorMultiZoneEnabled && isDistributorAutoscalingEnabled,
+  local isDistributorAutoscalingZoneAEnabled = isDistributorMultiZoneEnabled && isDistributorAutoscalingEnabled && std.length($._config.multi_zone_distributor_availability_zones) >= 1,
+  local isDistributorAutoscalingZoneBEnabled = isDistributorMultiZoneEnabled && isDistributorAutoscalingEnabled && std.length($._config.multi_zone_distributor_availability_zones) >= 2,
+  local isDistributorAutoscalingZoneCEnabled = isDistributorMultiZoneEnabled && isDistributorAutoscalingEnabled && std.length($._config.multi_zone_distributor_availability_zones) >= 3,

-  distributor_scaled_object: if !isAutoscalingSingleZoneEnabled then null else
+  distributor_scaled_object: if !isDistributorAutoscalingSingleZoneEnabled then null else
     $.newDistributorScaledObject('distributor'),

   distributor_deployment: overrideSuperIfExists(
     'distributor_deployment',
-    if !isAutoscalingSingleZoneEnabled then {} else $.removeReplicasFromSpec
+    if !isDistributorAutoscalingSingleZoneEnabled then {} else $.removeReplicasFromSpec
   ),

-  distributor_zone_a_scaled_object: if !isAutoscalingZoneAEnabled then null else
+  distributor_zone_a_scaled_object: if !isDistributorAutoscalingZoneAEnabled then null else
     $.newDistributorScaledObject('distributor-zone-a', 'distributor-zone-a.*'),

   distributor_zone_a_deployment: overrideSuperIfExists(
     'distributor_zone_a_deployment',
-    if !isAutoscalingZoneAEnabled then {} else $.removeReplicasFromSpec
+    if !isDistributorAutoscalingZoneAEnabled then {} else $.removeReplicasFromSpec
   ),

-  distributor_zone_b_scaled_object: if !isAutoscalingZoneBEnabled then null else
+  distributor_zone_b_scaled_object: if !isDistributorAutoscalingZoneBEnabled then null else
     $.newDistributorScaledObject('distributor-zone-b', 'distributor-zone-b.*'),

   distributor_zone_b_deployment: overrideSuperIfExists(
     'distributor_zone_b_deployment',
-    if !isAutoscalingZoneBEnabled then {} else $.removeReplicasFromSpec
+    if !isDistributorAutoscalingZoneBEnabled then {} else $.removeReplicasFromSpec
   ),

-  distributor_zone_c_scaled_object: if !isAutoscalingZoneCEnabled then null else
+  distributor_zone_c_scaled_object: if !isDistributorAutoscalingZoneCEnabled then null else
     $.newDistributorScaledObject('distributor-zone-c', 'distributor-zone-c.*'),

   distributor_zone_c_deployment: overrideSuperIfExists(
     'distributor_zone_c_deployment',
-    if !isAutoscalingZoneCEnabled then {} else $.removeReplicasFromSpec
+    if !isDistributorAutoscalingZoneCEnabled then {} else $.removeReplicasFromSpec
   ),

   ruler_scaled_object: if !$._config.autoscaling_ruler_enabled then null else
     $.newResourceScaledObject(

From 50c597eb4da0c5f0d3a030f4075e89a0d6c055cd Mon Sep 17 00:00:00 2001
From: Yuri Nikolic
Date: Tue, 8 Oct 2024 17:19:47 +0200
Subject: [PATCH 8/8] Move distributor-related tests into test-multi-zone-distributor.jsonnet

Signed-off-by: Yuri Nikolic

---
 ...custom-stabilization-window-generated.yaml |  362 +--
 ...toscaling-multiple-triggers-generated.yaml |  362 +--
 ...age-autoscaling-one-trigger-generated.yaml |  362 +--
 ...st-storage-migration-step-0-generated.yaml |  355 +--
 ...st-storage-migration-step-1-generated.yaml |  356 +--
 ...t-storage-migration-step-10-generated.yaml |  362 +--
 ...t-storage-migration-step-11-generated.yaml |  362 +--
 ...st-storage-migration-step-2-generated.yaml |  363 +--
 ...st-storage-migration-step-3-generated.yaml |  363 +--
 ...st-storage-migration-step-4-generated.yaml |  362 +--
 ...t-storage-migration-step-5a-generated.yaml |  362 +--
 ...t-storage-migration-step-5b-generated.yaml |  362 +--
 ...st-storage-migration-step-6-generated.yaml |  362 +--
 ...st-storage-migration-step-7-generated.yaml |  362 +--
 ...st-storage-migration-step-8-generated.yaml |  362 +--
 ...st-storage-migration-step-9-generated.yaml |  362 +--
 ...torage-migration-step-final-generated.yaml |  362 +--
 ...test-multi-zone-distributor-generated.yaml | 2718 +++++++++++++++++
 .../test-multi-zone-distributor.jsonnet       |   12 +
 .../test-multi-zone-generated.yaml            |  354 +--
 ...teway-automated-downscaling-generated.yaml |  354 +--
 .../mimir-tests/test-multi-zone.jsonnet       |    8 -
 22 files changed, 2920 insertions(+), 6669 deletions(-)
 create mode 100644 operations/mimir-tests/test-multi-zone-distributor-generated.yaml
 create mode 100644 operations/mimir-tests/test-multi-zone-distributor.jsonnet

diff --git a/operations/mimir-tests/test-ingest-storage-autoscaling-custom-stabilization-window-generated.yaml b/operations/mimir-tests/test-ingest-storage-autoscaling-custom-stabilization-window-generated.yaml
index acc37dcbb8d..e3562950763 100644
--- a/operations/mimir-tests/test-ingest-storage-autoscaling-custom-stabilization-window-generated.yaml
+++ b/operations/mimir-tests/test-ingest-storage-autoscaling-custom-stabilization-window-generated.yaml
@@ -33,27 +33,14 @@ apiVersion: policy/v1
 kind: PodDisruptionBudget
 metadata:
   labels:
-    name: distributor-zone-a
-  name: distributor-zone-a
+    name: distributor
+  name: distributor
   namespace: default
 spec:
   maxUnavailable: 1
   selector:
     matchLabels:
-      name: distributor-zone-a
----
-apiVersion: policy/v1
-kind: PodDisruptionBudget
-metadata:
-  labels:
-    name: distributor-zone-b
-  name: distributor-zone-b
-  namespace: default
-spec:
-  maxUnavailable: 1
-  selector:
-    matchLabels:
-      name: distributor-zone-b
+      name: distributor
 ---
 apiVersion: policy/v1
 kind: PodDisruptionBudget
@@ -404,8 +391,8 @@ apiVersion: v1
 kind: Service
 metadata:
   labels:
-    name: distributor-zone-a
-  name: distributor-zone-a
+    name: distributor
+  name: distributor
   namespace: default
 spec:
   clusterIP: None
@@ -420,29 +407,7 @@ spec:
     port: 7946
     targetPort: 7946
   selector:
-    name: distributor-zone-a
----
-apiVersion: v1
-kind: Service
-metadata:
-  labels:
-    name: distributor-zone-b
-  name: distributor-zone-b
-  namespace: default
-spec:
-  clusterIP: None
-  ports:
-  - name: distributor-http-metrics
-    port: 8080
-    targetPort: 8080
-  - name:
distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -866,14 +831,15 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 + replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -882,135 +848,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix=partition-ingesters/ - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment 
-metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1082,15 +921,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3107,182 +2941,6 @@ spec: --- apiVersion: keda.sh/v1alpha1 kind: ScaledObject -metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - 
threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-b - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject metadata: name: ingester-zone-a namespace: default diff --git a/operations/mimir-tests/test-ingest-storage-autoscaling-multiple-triggers-generated.yaml b/operations/mimir-tests/test-ingest-storage-autoscaling-multiple-triggers-generated.yaml index e323b82f7f9..3d71ac653bf 100644 --- a/operations/mimir-tests/test-ingest-storage-autoscaling-multiple-triggers-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-autoscaling-multiple-triggers-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- 
-apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -404,8 +391,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -420,29 +407,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -866,14 +831,15 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 + replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -882,135 +848,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix=partition-ingesters/ - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 
- imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1082,15 +921,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3107,182 +2941,6 @@ spec: --- apiVersion: keda.sh/v1alpha1 kind: ScaledObject -metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) - and - max by (pod) 
(min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-b - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: 
http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject metadata: name: ingester-zone-a namespace: default diff --git a/operations/mimir-tests/test-ingest-storage-autoscaling-one-trigger-generated.yaml b/operations/mimir-tests/test-ingest-storage-autoscaling-one-trigger-generated.yaml index cd3b23f8823..bb11bd645a1 100644 --- a/operations/mimir-tests/test-ingest-storage-autoscaling-one-trigger-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-autoscaling-one-trigger-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -404,8 +391,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -420,29 +407,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -866,14 +831,15 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 + replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -882,135 +848,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingester.partition-ring.prefix= - - 
-ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix=partition-ingesters/ - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1082,15 +921,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3107,182 +2941,6 @@ spec: --- apiVersion: keda.sh/v1alpha1 kind: ScaledObject -metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) 
(rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-b - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) - and - 
max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject metadata: name: ingester-zone-a namespace: default diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-0-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-0-generated.yaml index 277b97dbc0f..b86c1fd5032 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-0-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-0-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -345,30 +332,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -383,7 +348,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -807,14 +772,15 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 + replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -823,128 +789,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - 
-distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix= - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1009,15 +855,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2934,179 +2775,3 @@ spec: memory: 512Mi size: 3 version: 3.3.13 ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - 
type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-b - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - 
container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default - type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-1-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-1-generated.yaml index 2ad072737c7..705760da7a2 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-1-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-1-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -345,8 +332,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -361,29 +348,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -876,14 +841,15 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 + replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: 
distributor strategy: rollingUpdate: maxSurge: 15% @@ -892,129 +858,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix= - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1080,15 +925,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az 
topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3437,179 +3277,3 @@ spec: memory: 512Mi size: 3 version: 3.3.13 ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-b - triggers: - - metadata: - ignoreNullValues: "false" - metricName: 
cortex_distributor_zone_b_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default - type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-10-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-10-generated.yaml index 81ed79856d6..5493a610ba9 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-10-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-10-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -404,8 +391,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -420,29 +407,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - 
targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -843,14 +808,15 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 + replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -859,135 +825,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix=partition-ingesters/ - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: 
distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1059,15 +898,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2940,182 +2774,6 @@ spec: --- apiVersion: keda.sh/v1alpha1 kind: ScaledObject -metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" 
- name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-b - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject metadata: name: ingester-zone-a namespace: default diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-11-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-11-generated.yaml index 0000c478769..04b7c5145cf 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-11-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-11-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - 
name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -404,8 +391,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -420,29 +407,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -843,14 +808,15 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 + replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -859,135 +825,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix=partition-ingesters/ - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - 
containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1059,15 +898,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2944,182 +2778,6 @@ spec: --- apiVersion: keda.sh/v1alpha1 kind: ScaledObject -metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( 
- sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-b - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 
-kind: ScaledObject metadata: name: ingester-zone-a namespace: default diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-2-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-2-generated.yaml index f6417ab7d97..f1dd8905fe4 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-2-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-2-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -345,8 +332,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -361,29 +348,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -876,14 +841,15 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 + replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -892,136 +858,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingest-storage.migration.distributor-send-to-ingesters-enabled=true - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix= - - -ingester.ring.replication-factor=3 - - 
-ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1094,15 +932,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3457,179 +3290,3 @@ spec: memory: 512Mi size: 3 version: 3.3.13 ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) - and - max by (pod) 
(min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-b - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - ) or 
vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default - type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-3-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-3-generated.yaml index 6242839f973..8c66ea46ecc 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-3-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-3-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -345,8 +332,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -361,29 +348,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -876,14 +841,15 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 + replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -892,136 +858,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - 
-distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingest-storage.migration.distributor-send-to-ingesters-enabled=true - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix= - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1094,15 +932,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3479,179 +3312,3 @@ spec: memory: 512Mi size: 3 version: 3.3.13 ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: 
distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-b - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) 
> 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default - type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-4-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-4-generated.yaml index dd6a09b9b69..fa331a58566 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-4-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-4-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -345,8 +332,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -361,29 +348,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -876,14 +841,15 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 + replicas: 3 
revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -892,135 +858,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix= - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - 
nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1092,15 +931,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3476,179 +3310,3 @@ spec: memory: 512Mi size: 3 version: 3.3.13 ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - 
stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-b - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default - type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-5a-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-5a-generated.yaml index 971823dcf0c..9ff16c6f497 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-5a-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-5a-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -345,8 +332,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ 
-361,29 +348,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -876,14 +841,15 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 + replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -892,135 +858,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix= - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: 
Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1092,15 +931,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3476,179 +3310,3 @@ spec: memory: 512Mi size: 3 version: 3.3.13 ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", 
namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-b - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default - type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-5b-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-5b-generated.yaml index edf18f75652..71a1767b8ee 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-5b-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-5b-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - 
name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -345,8 +332,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -361,29 +348,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -876,14 +841,15 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 + replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -892,135 +858,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix= - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - 
- -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1092,15 +931,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3476,179 +3310,3 @@ spec: memory: 512Mi size: 3 version: 3.3.13 ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) 
(container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-b - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - 
container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default - type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-6-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-6-generated.yaml index a3d03e8ed06..533e044db21 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-6-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-6-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -345,30 +332,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -383,7 +348,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -807,14 +772,15 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 + replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -823,135 +789,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - 
-ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix= - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1023,15 +862,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3005,179 +2839,3 @@ spec: memory: 512Mi size: 3 version: 3.3.13 ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) 
(rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-b - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) - and - 
max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default - type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-7-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-7-generated.yaml index 7cbd02874f8..a56bf98d767 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-7-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-7-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -345,30 +332,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -383,7 +348,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -807,14 +772,15 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 + replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -823,135 +789,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - 
-distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix=partition-ingesters/ - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1023,15 +862,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3033,179 +2867,3 @@ spec: memory: 512Mi size: 3 version: 3.3.13 ---- 
-apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-b - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) - and - max by (pod) 
(min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default - type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-8-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-8-generated.yaml index 1a6d4979b2f..3994bf7bb43 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-8-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-8-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -345,30 +332,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -383,7 +348,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -807,14 +772,15 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - 
name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 + replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -823,135 +789,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix=partition-ingesters/ - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - 
gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1023,15 +862,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3033,179 +2867,3 @@ spec: memory: 512Mi size: 3 version: 3.3.13 ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b - namespace: default -spec: - advanced: - 
horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-b - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default - type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-9-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-9-generated.yaml index 8e40cb7b9ea..8028222f189 100644 --- a/operations/mimir-tests/test-ingest-storage-migration-step-9-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-9-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -345,30 +332,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: 
distributor-zone-a - name: distributor-zone-a - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -383,7 +348,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -784,14 +749,15 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 + replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -800,135 +766,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix=partition-ingesters/ - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - 
volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1000,15 +839,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2870,179 +2704,3 @@ spec: memory: 512Mi size: 3 version: 3.3.13 ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) - and - max by (pod) 
(changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-b - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default - type: prometheus diff --git a/operations/mimir-tests/test-ingest-storage-migration-step-final-generated.yaml b/operations/mimir-tests/test-ingest-storage-migration-step-final-generated.yaml index f735026f40f..c65472b3c02 100644 --- 
a/operations/mimir-tests/test-ingest-storage-migration-step-final-generated.yaml +++ b/operations/mimir-tests/test-ingest-storage-migration-step-final-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -404,8 +391,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -420,29 +407,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -866,14 +831,15 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 + replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -882,135 +848,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ingestion-tenant-shard-size=3 - - -distributor.remote-timeout=5s - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - -distributor.ring.store=memberlist - - -ingest-storage.enabled=true - - -ingest-storage.ingestion-partition-tenant-shard-size=1 - - -ingest-storage.kafka.address=kafka.default.svc.cluster.local.:9092 - - -ingest-storage.kafka.auto-create-topic-default-partitions=1000 - - -ingest-storage.kafka.topic=ingest - - -ingester.partition-ring.prefix= - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix=partition-ingesters/ - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - 
-server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -1082,15 +921,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -3107,182 +2941,6 @@ spec: --- apiVersion: keda.sh/v1alpha1 kind: ScaledObject -metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - 
threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-b - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) - and - max by (pod) 
-          max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"})
-          or vector(0)
-        )
-        and
-        count (
-          count_over_time(
-            present_over_time(
-              container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m]
-            )[15m:1m]
-          ) >= 15
-        )
-      serverAddress: http://prometheus.default:9090/prometheus
-      threshold: "2147483648"
-    name: cortex_distributor_zone_b_memory_hpa_default
-    type: prometheus
----
-apiVersion: keda.sh/v1alpha1
-kind: ScaledObject
 metadata:
   name: ingester-zone-a
   namespace: default
diff --git a/operations/mimir-tests/test-multi-zone-distributor-generated.yaml b/operations/mimir-tests/test-multi-zone-distributor-generated.yaml
new file mode 100644
index 00000000000..8fb4577c280
--- /dev/null
+++ b/operations/mimir-tests/test-multi-zone-distributor-generated.yaml
@@ -0,0 +1,2718 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: default
+---
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  labels:
+    name: alertmanager
+  name: alertmanager
+  namespace: default
+spec:
+  maxUnavailable: 1
+  selector:
+    matchLabels:
+      name: alertmanager
+---
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  labels:
+    name: compactor
+  name: compactor
+  namespace: default
+spec:
+  maxUnavailable: 1
+  selector:
+    matchLabels:
+      name: compactor
+---
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  labels:
+    name: distributor-zone-a
+  name: distributor-zone-a
+  namespace: default
+spec:
+  maxUnavailable: 1
+  selector:
+    matchLabels:
+      name: distributor-zone-a
+---
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  labels:
+    name: distributor-zone-b
+  name: distributor-zone-b
+  namespace: default
+spec:
+  maxUnavailable: 1
+  selector:
+    matchLabels:
+      name: distributor-zone-b
+---
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  labels:
+    name: ingester-rollout
+  name: ingester-rollout
+  namespace: default
+spec:
+  maxUnavailable: 1
+  selector:
+    matchLabels:
+      rollout-group: ingester
+---
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  labels:
+    name: memcached
+  name: memcached
+  namespace: default
+spec:
+  maxUnavailable: 1
+  selector:
+    matchLabels:
+      name: memcached
+---
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  labels:
+    name: memcached-frontend
+  name: memcached-frontend
+  namespace: default
+spec:
+  maxUnavailable: 1
+  selector:
+    matchLabels:
+      name: memcached-frontend
+---
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  labels:
+    name: memcached-index-queries
+  name: memcached-index-queries
+  namespace: default
+spec:
+  maxUnavailable: 1
+  selector:
+    matchLabels:
+      name: memcached-index-queries
+---
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  labels:
+    name: memcached-metadata
+  name: memcached-metadata
+  namespace: default
+spec:
+  maxUnavailable: 1
+  selector:
+    matchLabels:
+      name: memcached-metadata
+---
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  labels:
+    name: querier
+  name: querier
+  namespace: default
+spec:
+  maxUnavailable: 1
+  selector:
+    matchLabels:
+      name: querier
+---
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  labels:
+    name: query-frontend
+  name: query-frontend
+  namespace: default
+spec:
+  maxUnavailable: 1
+  selector:
+    matchLabels:
+      name: query-frontend
+---
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  labels:
+    name: query-scheduler
+  name: 
query-scheduler + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: query-scheduler +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: rollout-operator + name: rollout-operator + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: rollout-operator +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: ruler + name: ruler + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: ruler +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: store-gateway-rollout + name: store-gateway-rollout + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + rollout-group: store-gateway +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: rollout-operator + namespace: default +--- +apiVersion: v1 +data: + overrides.yaml: | + overrides: {} +kind: ConfigMap +metadata: + name: overrides + namespace: default +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: rollout-operator-role + namespace: default +rules: +- apiGroups: + - "" + resources: + - pods + verbs: + - list + - get + - watch + - delete +- apiGroups: + - apps + resources: + - statefulsets + verbs: + - list + - get + - watch + - patch +- apiGroups: + - apps + resources: + - statefulsets/status + verbs: + - update +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: rollout-operator-rolebinding + namespace: default +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: rollout-operator-role +subjects: +- kind: ServiceAccount + name: rollout-operator + namespace: default +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: alertmanager + name: alertmanager + namespace: default +spec: + clusterIP: None + ports: + - name: alertmanager-http-metrics + port: 8080 + targetPort: 8080 + - name: alertmanager-grpc + port: 9095 + targetPort: 9095 + - name: alertmanager-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: alertmanager +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: compactor + name: compactor + namespace: default +spec: + clusterIP: None + ports: + - name: compactor-http-metrics + port: 8080 + targetPort: 8080 + - name: compactor-grpc + port: 9095 + targetPort: 9095 + - name: compactor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: compactor +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-a + name: distributor-zone-a + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-a +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor-zone-b + name: distributor-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor-zone-b +--- +apiVersion: v1 +kind: Service +metadata: + name: gossip-ring + namespace: default +spec: + clusterIP: None + ports: + - appProtocol: tcp + name: gossip-ring + port: 7946 + protocol: TCP + targetPort: 7946 + selector: + gossip_ring_member: "true" +--- +apiVersion: v1 +kind: Service +metadata: + 
labels: + name: ingester-zone-a + name: ingester-zone-a + namespace: default +spec: + clusterIP: None + ports: + - name: ingester-http-metrics + port: 8080 + targetPort: 8080 + - name: ingester-grpc + port: 9095 + targetPort: 9095 + - name: ingester-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: ingester-zone-a + rollout-group: ingester +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: ingester-zone-b + name: ingester-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: ingester-http-metrics + port: 8080 + targetPort: 8080 + - name: ingester-grpc + port: 9095 + targetPort: 9095 + - name: ingester-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: ingester-zone-b + rollout-group: ingester +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: ingester-zone-c + name: ingester-zone-c + namespace: default +spec: + clusterIP: None + ports: + - name: ingester-http-metrics + port: 8080 + targetPort: 8080 + - name: ingester-grpc + port: 9095 + targetPort: 9095 + - name: ingester-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: ingester-zone-c + rollout-group: ingester +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: memcached + name: memcached + namespace: default +spec: + clusterIP: None + ports: + - name: memcached-client + port: 11211 + targetPort: 11211 + - name: exporter-http-metrics + port: 9150 + targetPort: 9150 + selector: + name: memcached +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: memcached-frontend + name: memcached-frontend + namespace: default +spec: + clusterIP: None + ports: + - name: memcached-client + port: 11211 + targetPort: 11211 + - name: exporter-http-metrics + port: 9150 + targetPort: 9150 + selector: + name: memcached-frontend +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: memcached-index-queries + name: memcached-index-queries + namespace: default +spec: + clusterIP: None + ports: + - name: memcached-client + port: 11211 + targetPort: 11211 + - name: exporter-http-metrics + port: 9150 + targetPort: 9150 + selector: + name: memcached-index-queries +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: memcached-metadata + name: memcached-metadata + namespace: default +spec: + clusterIP: None + ports: + - name: memcached-client + port: 11211 + targetPort: 11211 + - name: exporter-http-metrics + port: 9150 + targetPort: 9150 + selector: + name: memcached-metadata +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: querier + name: querier + namespace: default +spec: + ports: + - name: querier-http-metrics + port: 8080 + targetPort: 8080 + - name: querier-grpc + port: 9095 + targetPort: 9095 + - name: querier-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: querier +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: query-frontend + name: query-frontend + namespace: default +spec: + ports: + - name: query-frontend-http-metrics + port: 8080 + targetPort: 8080 + - name: query-frontend-grpc + port: 9095 + targetPort: 9095 + selector: + name: query-frontend +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: query-scheduler + name: query-scheduler + namespace: default +spec: + ports: + - name: query-scheduler-http-metrics + port: 8080 + targetPort: 8080 + - name: query-scheduler-grpc + port: 9095 + targetPort: 9095 + selector: + name: query-scheduler +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: query-scheduler + name: query-scheduler-discovery + namespace: 
default +spec: + clusterIP: None + ports: + - name: query-scheduler-http-metrics + port: 8080 + targetPort: 8080 + - name: query-scheduler-grpc + port: 9095 + targetPort: 9095 + publishNotReadyAddresses: true + selector: + name: query-scheduler +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: ruler + name: ruler + namespace: default +spec: + ports: + - name: ruler-http-metrics + port: 8080 + targetPort: 8080 + - name: ruler-grpc + port: 9095 + targetPort: 9095 + selector: + name: ruler +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: store-gateway-multi-zone + name: store-gateway-multi-zone + namespace: default +spec: + ports: + - name: store-gateway-http-metrics + port: 80 + protocol: TCP + targetPort: 80 + selector: + rollout-group: store-gateway +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: store-gateway-zone-a + name: store-gateway-zone-a + namespace: default +spec: + clusterIP: None + ports: + - name: store-gateway-http-metrics + port: 8080 + targetPort: 8080 + - name: store-gateway-grpc + port: 9095 + targetPort: 9095 + - name: store-gateway-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: store-gateway-zone-a + rollout-group: store-gateway +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: store-gateway-zone-b + name: store-gateway-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: store-gateway-http-metrics + port: 8080 + targetPort: 8080 + - name: store-gateway-grpc + port: 9095 + targetPort: 9095 + - name: store-gateway-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: store-gateway-zone-b + rollout-group: store-gateway +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: store-gateway-zone-c + name: store-gateway-zone-c + namespace: default +spec: + clusterIP: None + ports: + - name: store-gateway-http-metrics + port: 8080 + targetPort: 8080 + - name: store-gateway-grpc + port: 9095 + targetPort: 9095 + - name: store-gateway-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: store-gateway-zone-c + rollout-group: store-gateway +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-a + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-a + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-a + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - 
-memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent + name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-a + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor-zone-b + namespace: default +spec: + minReadySeconds: 10 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor-zone-b + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor-zone-b + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2b + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent + name: distributor + ports: + - 
containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor-zone-b + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: querier + namespace: default +spec: + minReadySeconds: 10 + replicas: 6 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: querier + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: querier + spec: + containers: + - args: + - -blocks-storage.bucket-store.metadata-cache.backend=memcached + - -blocks-storage.bucket-store.metadata-cache.memcached.addresses=dnssrvnoa+memcached-metadata.default.svc.cluster.local.:11211 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-async-concurrency=50 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-item-size=1048576 + - -blocks-storage.bucket-store.sync-dir=/data/tsdb + - -blocks-storage.bucket-store.sync-interval=15m + - -blocks-storage.gcs.bucket-name=blocks-bucket + - -common.storage.backend=gcs + - -distributor.health-check-ingesters=true + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=268435456 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -querier.frontend-client.grpc-max-send-msg-size=104857600 + - -querier.max-concurrent=8 + - -querier.max-partial-query-length=768h + - -querier.scheduler-address=query-scheduler-discovery.default.svc.cluster.local.:9095 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -store-gateway.sharding-ring.heartbeat-timeout=4m + - -store-gateway.sharding-ring.prefix=multi-zone/ + - -store-gateway.sharding-ring.replication-factor=3 + - -store-gateway.sharding-ring.store=memberlist + - -store-gateway.sharding-ring.zone-awareness-enabled=true + - -target=querier + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "5" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "5000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent + name: querier + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 24Gi + requests: + cpu: "1" + memory: 12Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 180 + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: querier + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - 
configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: query-frontend + namespace: default +spec: + minReadySeconds: 10 + replicas: 2 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: query-frontend + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + name: query-frontend + spec: + containers: + - args: + - -query-frontend.cache-results=true + - -query-frontend.max-cache-freshness=10m + - -query-frontend.max-total-query-length=12000h + - -query-frontend.query-sharding-target-series-per-shard=2500 + - -query-frontend.results-cache.backend=memcached + - -query-frontend.results-cache.memcached.addresses=dnssrvnoa+memcached-frontend.default.svc.cluster.local.:11211 + - -query-frontend.results-cache.memcached.max-item-size=5242880 + - -query-frontend.results-cache.memcached.timeout=500ms + - -query-frontend.scheduler-address=query-scheduler-discovery.default.svc.cluster.local.:9095 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=30s + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=query-frontend + - -usage-stats.installation-mode=jsonnet + env: + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "5000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent + name: query-frontend + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 1200Mi + requests: + cpu: "2" + memory: 600Mi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 390 + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: query-frontend + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: query-scheduler + namespace: default +spec: + minReadySeconds: 10 + replicas: 2 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: query-scheduler + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + template: + metadata: + labels: + name: query-scheduler + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + name: query-scheduler + topologyKey: kubernetes.io/hostname + containers: + - args: + - -query-scheduler.max-outstanding-requests-per-tenant=100 + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -target=query-scheduler + - -usage-stats.installation-mode=jsonnet + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent + name: query-scheduler + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 2Gi + requests: + cpu: "2" + memory: 1Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 180 + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: rollout-operator + namespace: default +spec: 
+ minReadySeconds: 10 + replicas: 1 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: rollout-operator + strategy: + rollingUpdate: + maxSurge: 0 + maxUnavailable: 1 + template: + metadata: + labels: + name: rollout-operator + spec: + containers: + - args: + - -kubernetes.namespace=default + image: grafana/rollout-operator:v0.19.1 + imagePullPolicy: IfNotPresent + name: rollout-operator + ports: + - containerPort: 8001 + name: http-metrics + readinessProbe: + httpGet: + path: /ready + port: 8001 + initialDelaySeconds: 5 + timeoutSeconds: 1 + resources: + limits: + memory: 200Mi + requests: + cpu: 100m + memory: 100Mi + serviceAccountName: rollout-operator +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ruler + namespace: default +spec: + minReadySeconds: 10 + replicas: 2 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: ruler + strategy: + rollingUpdate: + maxSurge: 50% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: ruler + spec: + containers: + - args: + - -blocks-storage.bucket-store.metadata-cache.backend=memcached + - -blocks-storage.bucket-store.metadata-cache.memcached.addresses=dnssrvnoa+memcached-metadata.default.svc.cluster.local.:11211 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-async-concurrency=50 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-item-size=1048576 + - -blocks-storage.bucket-store.sync-dir=/data/tsdb + - -blocks-storage.bucket-store.sync-interval=15m + - -blocks-storage.gcs.bucket-name=blocks-bucket + - -common.storage.backend=gcs + - -distributor.health-check-ingesters=true + - -distributor.remote-timeout=10s + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -querier.max-partial-query-length=768h + - -ruler-storage.cache.backend=memcached + - -ruler-storage.cache.memcached.addresses=dnssrvnoa+memcached-metadata.default.svc.cluster.local.:11211 + - -ruler-storage.cache.memcached.max-async-concurrency=50 + - -ruler-storage.cache.memcached.max-item-size=1048576 + - -ruler-storage.gcs.bucket-name=rules-bucket + - -ruler.alertmanager-url=http://alertmanager.default.svc.cluster.local./alertmanager + - -ruler.max-rule-groups-per-tenant=70 + - -ruler.max-rules-per-rule-group=20 + - -ruler.ring.store=memberlist + - -ruler.rule-path=/rules + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -store-gateway.sharding-ring.heartbeat-timeout=4m + - -store-gateway.sharding-ring.prefix=multi-zone/ + - -store-gateway.sharding-ring.replication-factor=3 + - -store-gateway.sharding-ring.store=memberlist + - -store-gateway.sharding-ring.zone-awareness-enabled=true + - -target=ruler + - -usage-stats.installation-mode=jsonnet + env: + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent + name: ruler + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + cpu: "16" + memory: 16Gi + requests: + cpu: "1" + memory: 6Gi + volumeMounts: + - mountPath: /etc/mimir + 
name: overrides + terminationGracePeriodSeconds: 600 + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: ruler + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + labels: + name: alertmanager + name: alertmanager + namespace: default +spec: + replicas: 3 + selector: + matchLabels: + name: alertmanager + serviceName: alertmanager + template: + metadata: + labels: + gossip_ring_member: "true" + name: alertmanager + spec: + containers: + - args: + - -alertmanager-storage.gcs.bucket-name=alerts-bucket + - -alertmanager.sharding-ring.replication-factor=3 + - -alertmanager.sharding-ring.store=memberlist + - -alertmanager.storage.path=/data + - -alertmanager.web.external-url=http://test/alertmanager + - -common.storage.backend=gcs + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-idle-timeout=6m + - -server.http-listen-port=8080 + - -target=alertmanager + - -usage-stats.installation-mode=jsonnet + env: + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent + name: alertmanager + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 15Gi + requests: + cpu: "2" + memory: 10Gi + volumeMounts: + - mountPath: /data + name: alertmanager-data + - mountPath: /etc/mimir + name: overrides + securityContext: + runAsUser: 0 + terminationGracePeriodSeconds: 900 + volumes: + - configMap: + name: overrides + name: overrides + updateStrategy: + type: RollingUpdate + volumeClaimTemplates: + - apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: alertmanager-data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 100Gi +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + labels: + name: compactor + name: compactor + namespace: default +spec: + podManagementPolicy: Parallel + replicas: 1 + selector: + matchLabels: + name: compactor + serviceName: compactor + template: + metadata: + labels: + gossip_ring_member: "true" + name: compactor + spec: + containers: + - args: + - -blocks-storage.gcs.bucket-name=blocks-bucket + - -common.storage.backend=gcs + - -compactor.block-ranges=2h,12h,24h + - -compactor.blocks-retention-period=0 + - -compactor.cleanup-interval=15m + - -compactor.compaction-concurrency=1 + - -compactor.compaction-interval=30m + - -compactor.compactor-tenant-shard-size=1 + - -compactor.data-dir=/data + - -compactor.deletion-delay=2h + - -compactor.first-level-compaction-wait-period=25m + - -compactor.max-closing-blocks-concurrency=2 + - -compactor.max-opening-blocks-concurrency=4 + - -compactor.ring.heartbeat-period=1m + - -compactor.ring.heartbeat-timeout=4m + - -compactor.ring.prefix= + - -compactor.ring.store=memberlist + - -compactor.ring.wait-stability-min-duration=1m + - -compactor.split-and-merge-shards=0 + - -compactor.split-groups=1 + - -compactor.symbols-flushers-concurrency=4 + - -memberlist.bind-port=7946 + - 
-memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -target=compactor + - -usage-stats.installation-mode=jsonnet + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent + name: compactor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 6Gi + requests: + cpu: 1 + memory: 6Gi + volumeMounts: + - mountPath: /data + name: compactor-data + - mountPath: /etc/mimir + name: overrides + securityContext: + runAsUser: 0 + terminationGracePeriodSeconds: 900 + volumes: + - configMap: + name: overrides + name: overrides + updateStrategy: + type: RollingUpdate + volumeClaimTemplates: + - apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: compactor-data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 250Gi + storageClassName: standard +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + annotations: + rollout-max-unavailable: "50" + labels: + rollout-group: ingester + name: ingester-zone-a + namespace: default +spec: + podManagementPolicy: Parallel + replicas: 1 + selector: + matchLabels: + name: ingester-zone-a + rollout-group: ingester + serviceName: ingester-zone-a + template: + metadata: + labels: + gossip_ring_member: "true" + name: ingester-zone-a + rollout-group: ingester + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: rollout-group + operator: In + values: + - ingester + - key: name + operator: NotIn + values: + - ingester-zone-a + topologyKey: kubernetes.io/hostname + containers: + - args: + - -blocks-storage.gcs.bucket-name=blocks-bucket + - -blocks-storage.tsdb.block-ranges-period=2h + - -blocks-storage.tsdb.dir=/data/tsdb + - -blocks-storage.tsdb.head-compaction-interval=15m + - -blocks-storage.tsdb.ship-interval=1m + - -blocks-storage.tsdb.wal-replay-concurrency=3 + - -common.storage.backend=gcs + - -distributor.health-check-ingesters=true + - -ingester.max-global-metadata-per-metric=10 + - -ingester.max-global-metadata-per-user=30000 + - -ingester.max-global-series-per-user=150000 + - -ingester.ring.heartbeat-period=2m + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.instance-availability-zone=zone-a + - -ingester.ring.num-tokens=512 + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.tokens-file-path=/data/tokens + - -ingester.ring.unregister-on-shutdown=true + - -ingester.ring.zone-awareness-enabled=true + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc-max-concurrent-streams=500 + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -target=ingester + - -usage-stats.installation-mode=jsonnet + env: + - name: A + value: ingester-a-only + - name: GOGC + value: "off" + - name: GOMAXPROCS + value: "9" + - name: GOMEMLIMIT + value: 1Gi + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + - name: Z + value: "123" + image: 
grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent + name: ingester + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 25Gi + requests: + cpu: "4" + memory: 15Gi + volumeMounts: + - mountPath: /data + name: ingester-data + - mountPath: /etc/mimir + name: overrides + securityContext: + runAsUser: 0 + terminationGracePeriodSeconds: 1200 + volumes: + - configMap: + name: overrides + name: overrides + updateStrategy: + type: OnDelete + volumeClaimTemplates: + - apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: ingester-data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 100Gi + storageClassName: fast +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + annotations: + rollout-max-unavailable: "50" + labels: + rollout-group: ingester + name: ingester-zone-b + namespace: default +spec: + podManagementPolicy: Parallel + replicas: 1 + selector: + matchLabels: + name: ingester-zone-b + rollout-group: ingester + serviceName: ingester-zone-b + template: + metadata: + labels: + gossip_ring_member: "true" + name: ingester-zone-b + rollout-group: ingester + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: rollout-group + operator: In + values: + - ingester + - key: name + operator: NotIn + values: + - ingester-zone-b + topologyKey: kubernetes.io/hostname + containers: + - args: + - -blocks-storage.gcs.bucket-name=blocks-bucket + - -blocks-storage.tsdb.block-ranges-period=2h + - -blocks-storage.tsdb.dir=/data/tsdb + - -blocks-storage.tsdb.head-compaction-interval=15m + - -blocks-storage.tsdb.ship-interval=1m + - -blocks-storage.tsdb.wal-replay-concurrency=3 + - -common.storage.backend=gcs + - -distributor.health-check-ingesters=true + - -ingester.max-global-metadata-per-metric=10 + - -ingester.max-global-metadata-per-user=30000 + - -ingester.max-global-series-per-user=150000 + - -ingester.ring.heartbeat-period=2m + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.instance-availability-zone=zone-b + - -ingester.ring.num-tokens=512 + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.tokens-file-path=/data/tokens + - -ingester.ring.unregister-on-shutdown=true + - -ingester.ring.zone-awareness-enabled=true + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc-max-concurrent-streams=500 + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -target=ingester + - -usage-stats.installation-mode=jsonnet + env: + - name: A + value: all-ingesters + - name: GOMAXPROCS + value: "9" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent + name: ingester + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 25Gi + requests: + cpu: "4" + memory: 15Gi + volumeMounts: + - mountPath: /data + name: ingester-data + 
- mountPath: /etc/mimir + name: overrides + securityContext: + runAsUser: 0 + terminationGracePeriodSeconds: 1200 + volumes: + - configMap: + name: overrides + name: overrides + updateStrategy: + type: OnDelete + volumeClaimTemplates: + - apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: ingester-data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 100Gi + storageClassName: fast +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + annotations: + rollout-max-unavailable: "50" + labels: + rollout-group: ingester + name: ingester-zone-c + namespace: default +spec: + podManagementPolicy: Parallel + replicas: 1 + selector: + matchLabels: + name: ingester-zone-c + rollout-group: ingester + serviceName: ingester-zone-c + template: + metadata: + labels: + gossip_ring_member: "true" + name: ingester-zone-c + rollout-group: ingester + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: rollout-group + operator: In + values: + - ingester + - key: name + operator: NotIn + values: + - ingester-zone-c + topologyKey: kubernetes.io/hostname + containers: + - args: + - -blocks-storage.gcs.bucket-name=blocks-bucket + - -blocks-storage.tsdb.block-ranges-period=2h + - -blocks-storage.tsdb.dir=/data/tsdb + - -blocks-storage.tsdb.head-compaction-interval=15m + - -blocks-storage.tsdb.ship-interval=1m + - -blocks-storage.tsdb.wal-replay-concurrency=3 + - -common.storage.backend=gcs + - -distributor.health-check-ingesters=true + - -ingester.max-global-metadata-per-metric=10 + - -ingester.max-global-metadata-per-user=30000 + - -ingester.max-global-series-per-user=150000 + - -ingester.ring.heartbeat-period=2m + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.instance-availability-zone=zone-c + - -ingester.ring.num-tokens=512 + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.tokens-file-path=/data/tokens + - -ingester.ring.unregister-on-shutdown=true + - -ingester.ring.zone-awareness-enabled=true + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc-max-concurrent-streams=500 + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -target=ingester + - -usage-stats.installation-mode=jsonnet + env: + - name: A + value: all-ingesters + - name: GOMAXPROCS + value: "9" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent + name: ingester + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 25Gi + requests: + cpu: "4" + memory: 15Gi + volumeMounts: + - mountPath: /data + name: ingester-data + - mountPath: /etc/mimir + name: overrides + securityContext: + runAsUser: 0 + terminationGracePeriodSeconds: 1200 + volumes: + - configMap: + name: overrides + name: overrides + updateStrategy: + type: OnDelete + volumeClaimTemplates: + - apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: ingester-data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 100Gi + storageClassName: fast +--- +apiVersion: 
apps/v1 +kind: StatefulSet +metadata: + name: memcached + namespace: default +spec: + replicas: 3 + selector: + matchLabels: + name: memcached + serviceName: memcached + template: + metadata: + labels: + name: memcached + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + name: memcached + topologyKey: kubernetes.io/hostname + containers: + - args: + - -m 6144 + - -I 1m + - -c 16384 + - -v + - --extended=track_sizes + image: memcached:1.6.28-alpine + imagePullPolicy: IfNotPresent + name: memcached + ports: + - containerPort: 11211 + name: client + resources: + limits: + memory: 9Gi + requests: + cpu: 500m + memory: 6552Mi + - args: + - --memcached.address=localhost:11211 + - --web.listen-address=0.0.0.0:9150 + image: prom/memcached-exporter:v0.14.4 + imagePullPolicy: IfNotPresent + name: exporter + ports: + - containerPort: 9150 + name: http-metrics + updateStrategy: + type: RollingUpdate +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: memcached-frontend + namespace: default +spec: + replicas: 3 + selector: + matchLabels: + name: memcached-frontend + serviceName: memcached-frontend + template: + metadata: + labels: + name: memcached-frontend + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + name: memcached-frontend + topologyKey: kubernetes.io/hostname + containers: + - args: + - -m 1024 + - -I 5m + - -c 16384 + - -v + - --extended=track_sizes + image: memcached:1.6.28-alpine + imagePullPolicy: IfNotPresent + name: memcached + ports: + - containerPort: 11211 + name: client + resources: + limits: + memory: 1536Mi + requests: + cpu: 500m + memory: 1176Mi + - args: + - --memcached.address=localhost:11211 + - --web.listen-address=0.0.0.0:9150 + image: prom/memcached-exporter:v0.14.4 + imagePullPolicy: IfNotPresent + name: exporter + ports: + - containerPort: 9150 + name: http-metrics + updateStrategy: + type: RollingUpdate +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: memcached-index-queries + namespace: default +spec: + replicas: 3 + selector: + matchLabels: + name: memcached-index-queries + serviceName: memcached-index-queries + template: + metadata: + labels: + name: memcached-index-queries + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + name: memcached-index-queries + topologyKey: kubernetes.io/hostname + containers: + - args: + - -m 1024 + - -I 5m + - -c 16384 + - -v + - --extended=track_sizes + image: memcached:1.6.28-alpine + imagePullPolicy: IfNotPresent + name: memcached + ports: + - containerPort: 11211 + name: client + resources: + limits: + memory: 1536Mi + requests: + cpu: 500m + memory: 1176Mi + - args: + - --memcached.address=localhost:11211 + - --web.listen-address=0.0.0.0:9150 + image: prom/memcached-exporter:v0.14.4 + imagePullPolicy: IfNotPresent + name: exporter + ports: + - containerPort: 9150 + name: http-metrics + updateStrategy: + type: RollingUpdate +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: memcached-metadata + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + name: memcached-metadata + serviceName: memcached-metadata + template: + metadata: + labels: + name: memcached-metadata + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + name: memcached-metadata + topologyKey: kubernetes.io/hostname + containers: 
+ - args: + - -m 512 + - -I 1m + - -c 16384 + - -v + - --extended=track_sizes + image: memcached:1.6.28-alpine + imagePullPolicy: IfNotPresent + name: memcached + ports: + - containerPort: 11211 + name: client + resources: + limits: + memory: 768Mi + requests: + cpu: 500m + memory: 638Mi + - args: + - --memcached.address=localhost:11211 + - --web.listen-address=0.0.0.0:9150 + image: prom/memcached-exporter:v0.14.4 + imagePullPolicy: IfNotPresent + name: exporter + ports: + - containerPort: 9150 + name: http-metrics + updateStrategy: + type: RollingUpdate +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + annotations: + rollout-max-unavailable: "50" + labels: + rollout-group: store-gateway + name: store-gateway-zone-a + namespace: default +spec: + podManagementPolicy: Parallel + replicas: 1 + selector: + matchLabels: + name: store-gateway-zone-a + rollout-group: store-gateway + serviceName: store-gateway-zone-a + template: + metadata: + labels: + gossip_ring_member: "true" + name: store-gateway-zone-a + rollout-group: store-gateway + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: rollout-group + operator: In + values: + - store-gateway + - key: name + operator: NotIn + values: + - store-gateway-zone-a + topologyKey: kubernetes.io/hostname + containers: + - args: + - -blocks-storage.bucket-store.chunks-cache.backend=memcached + - -blocks-storage.bucket-store.chunks-cache.memcached.addresses=dnssrvnoa+memcached.default.svc.cluster.local.:11211 + - -blocks-storage.bucket-store.chunks-cache.memcached.max-async-concurrency=50 + - -blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency=100 + - -blocks-storage.bucket-store.chunks-cache.memcached.max-idle-connections=150 + - -blocks-storage.bucket-store.chunks-cache.memcached.max-item-size=1048576 + - -blocks-storage.bucket-store.chunks-cache.memcached.timeout=750ms + - -blocks-storage.bucket-store.index-cache.backend=memcached + - -blocks-storage.bucket-store.index-cache.memcached.addresses=dnssrvnoa+memcached-index-queries.default.svc.cluster.local.:11211 + - -blocks-storage.bucket-store.index-cache.memcached.max-async-concurrency=50 + - -blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency=100 + - -blocks-storage.bucket-store.index-cache.memcached.max-idle-connections=150 + - -blocks-storage.bucket-store.index-cache.memcached.max-item-size=5242880 + - -blocks-storage.bucket-store.index-cache.memcached.timeout=750ms + - -blocks-storage.bucket-store.metadata-cache.backend=memcached + - -blocks-storage.bucket-store.metadata-cache.memcached.addresses=dnssrvnoa+memcached-metadata.default.svc.cluster.local.:11211 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-async-concurrency=50 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency=100 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-idle-connections=150 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-item-size=1048576 + - -blocks-storage.bucket-store.sync-dir=/data/tsdb + - -blocks-storage.bucket-store.sync-interval=15m + - -blocks-storage.gcs.bucket-name=blocks-bucket + - -common.storage.backend=gcs + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - 
-server.http-listen-port=8080 + - -store-gateway.sharding-ring.heartbeat-period=1m + - -store-gateway.sharding-ring.heartbeat-timeout=4m + - -store-gateway.sharding-ring.instance-availability-zone=zone-a + - -store-gateway.sharding-ring.prefix=multi-zone/ + - -store-gateway.sharding-ring.replication-factor=3 + - -store-gateway.sharding-ring.store=memberlist + - -store-gateway.sharding-ring.tokens-file-path=/data/tokens + - -store-gateway.sharding-ring.unregister-on-shutdown=false + - -store-gateway.sharding-ring.wait-stability-min-duration=1m + - -store-gateway.sharding-ring.zone-awareness-enabled=true + - -target=store-gateway + - -usage-stats.installation-mode=jsonnet + env: + - name: A + value: all-store-gateways + - name: GOMAXPROCS + value: "5" + - name: GOMEMLIMIT + value: "12884901888" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent + name: store-gateway + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 18Gi + requests: + cpu: "1" + memory: 12Gi + volumeMounts: + - mountPath: /data + name: store-gateway-data + - mountPath: /etc/mimir + name: overrides + securityContext: + runAsUser: 0 + terminationGracePeriodSeconds: 120 + volumes: + - configMap: + name: overrides + name: overrides + updateStrategy: + type: OnDelete + volumeClaimTemplates: + - apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: store-gateway-data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + storageClassName: standard +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + annotations: + rollout-max-unavailable: "50" + labels: + rollout-group: store-gateway + name: store-gateway-zone-b + namespace: default +spec: + podManagementPolicy: Parallel + replicas: 1 + selector: + matchLabels: + name: store-gateway-zone-b + rollout-group: store-gateway + serviceName: store-gateway-zone-b + template: + metadata: + labels: + gossip_ring_member: "true" + name: store-gateway-zone-b + rollout-group: store-gateway + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: rollout-group + operator: In + values: + - store-gateway + - key: name + operator: NotIn + values: + - store-gateway-zone-b + topologyKey: kubernetes.io/hostname + containers: + - args: + - -blocks-storage.bucket-store.chunks-cache.backend=memcached + - -blocks-storage.bucket-store.chunks-cache.memcached.addresses=dnssrvnoa+memcached.default.svc.cluster.local.:11211 + - -blocks-storage.bucket-store.chunks-cache.memcached.max-async-concurrency=50 + - -blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency=100 + - -blocks-storage.bucket-store.chunks-cache.memcached.max-idle-connections=150 + - -blocks-storage.bucket-store.chunks-cache.memcached.max-item-size=1048576 + - -blocks-storage.bucket-store.chunks-cache.memcached.timeout=750ms + - -blocks-storage.bucket-store.index-cache.backend=memcached + - -blocks-storage.bucket-store.index-cache.memcached.addresses=dnssrvnoa+memcached-index-queries.default.svc.cluster.local.:11211 + - -blocks-storage.bucket-store.index-cache.memcached.max-async-concurrency=50 + - -blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency=100 + - 
-blocks-storage.bucket-store.index-cache.memcached.max-idle-connections=150 + - -blocks-storage.bucket-store.index-cache.memcached.max-item-size=5242880 + - -blocks-storage.bucket-store.index-cache.memcached.timeout=750ms + - -blocks-storage.bucket-store.metadata-cache.backend=memcached + - -blocks-storage.bucket-store.metadata-cache.memcached.addresses=dnssrvnoa+memcached-metadata.default.svc.cluster.local.:11211 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-async-concurrency=50 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency=100 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-idle-connections=150 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-item-size=1048576 + - -blocks-storage.bucket-store.sync-dir=/data/tsdb + - -blocks-storage.bucket-store.sync-interval=15m + - -blocks-storage.gcs.bucket-name=blocks-bucket + - -common.storage.backend=gcs + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -store-gateway.sharding-ring.heartbeat-period=1m + - -store-gateway.sharding-ring.heartbeat-timeout=4m + - -store-gateway.sharding-ring.instance-availability-zone=zone-b + - -store-gateway.sharding-ring.prefix=multi-zone/ + - -store-gateway.sharding-ring.replication-factor=3 + - -store-gateway.sharding-ring.store=memberlist + - -store-gateway.sharding-ring.tokens-file-path=/data/tokens + - -store-gateway.sharding-ring.unregister-on-shutdown=false + - -store-gateway.sharding-ring.wait-stability-min-duration=1m + - -store-gateway.sharding-ring.zone-awareness-enabled=true + - -target=store-gateway + - -usage-stats.installation-mode=jsonnet + env: + - name: A + value: zone-b + - name: GOGC + value: "1000" + - name: GOMAXPROCS + value: "5" + - name: GOMEMLIMIT + value: "12884901888" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.13.0 + imagePullPolicy: IfNotPresent + name: store-gateway + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 18Gi + requests: + cpu: "1" + memory: 12Gi + volumeMounts: + - mountPath: /data + name: store-gateway-data + - mountPath: /etc/mimir + name: overrides + securityContext: + runAsUser: 0 + terminationGracePeriodSeconds: 120 + volumes: + - configMap: + name: overrides + name: overrides + updateStrategy: + type: OnDelete + volumeClaimTemplates: + - apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: store-gateway-data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + storageClassName: standard +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + annotations: + rollout-max-unavailable: "50" + labels: + rollout-group: store-gateway + name: store-gateway-zone-c + namespace: default +spec: + podManagementPolicy: Parallel + replicas: 1 + selector: + matchLabels: + name: store-gateway-zone-c + rollout-group: store-gateway + serviceName: store-gateway-zone-c + template: + metadata: + labels: + gossip_ring_member: "true" + name: store-gateway-zone-c + rollout-group: store-gateway + spec: + affinity: + podAntiAffinity: + 
+          requiredDuringSchedulingIgnoredDuringExecution:
+          - labelSelector:
+              matchExpressions:
+              - key: rollout-group
+                operator: In
+                values:
+                - store-gateway
+              - key: name
+                operator: NotIn
+                values:
+                - store-gateway-zone-c
+            topologyKey: kubernetes.io/hostname
+      containers:
+      - args:
+        - -blocks-storage.bucket-store.chunks-cache.backend=memcached
+        - -blocks-storage.bucket-store.chunks-cache.memcached.addresses=dnssrvnoa+memcached.default.svc.cluster.local.:11211
+        - -blocks-storage.bucket-store.chunks-cache.memcached.max-async-concurrency=50
+        - -blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency=100
+        - -blocks-storage.bucket-store.chunks-cache.memcached.max-idle-connections=150
+        - -blocks-storage.bucket-store.chunks-cache.memcached.max-item-size=1048576
+        - -blocks-storage.bucket-store.chunks-cache.memcached.timeout=750ms
+        - -blocks-storage.bucket-store.index-cache.backend=memcached
+        - -blocks-storage.bucket-store.index-cache.memcached.addresses=dnssrvnoa+memcached-index-queries.default.svc.cluster.local.:11211
+        - -blocks-storage.bucket-store.index-cache.memcached.max-async-concurrency=50
+        - -blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency=100
+        - -blocks-storage.bucket-store.index-cache.memcached.max-idle-connections=150
+        - -blocks-storage.bucket-store.index-cache.memcached.max-item-size=5242880
+        - -blocks-storage.bucket-store.index-cache.memcached.timeout=750ms
+        - -blocks-storage.bucket-store.metadata-cache.backend=memcached
+        - -blocks-storage.bucket-store.metadata-cache.memcached.addresses=dnssrvnoa+memcached-metadata.default.svc.cluster.local.:11211
+        - -blocks-storage.bucket-store.metadata-cache.memcached.max-async-concurrency=50
+        - -blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency=100
+        - -blocks-storage.bucket-store.metadata-cache.memcached.max-idle-connections=150
+        - -blocks-storage.bucket-store.metadata-cache.memcached.max-item-size=1048576
+        - -blocks-storage.bucket-store.sync-dir=/data/tsdb
+        - -blocks-storage.bucket-store.sync-interval=15m
+        - -blocks-storage.gcs.bucket-name=blocks-bucket
+        - -common.storage.backend=gcs
+        - -memberlist.bind-port=7946
+        - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946
+        - -runtime-config.file=/etc/mimir/overrides.yaml
+        - -server.grpc.keepalive.min-time-between-pings=10s
+        - -server.grpc.keepalive.ping-without-stream-allowed=true
+        - -server.http-listen-port=8080
+        - -store-gateway.sharding-ring.heartbeat-period=1m
+        - -store-gateway.sharding-ring.heartbeat-timeout=4m
+        - -store-gateway.sharding-ring.instance-availability-zone=zone-c
+        - -store-gateway.sharding-ring.prefix=multi-zone/
+        - -store-gateway.sharding-ring.replication-factor=3
+        - -store-gateway.sharding-ring.store=memberlist
+        - -store-gateway.sharding-ring.tokens-file-path=/data/tokens
+        - -store-gateway.sharding-ring.unregister-on-shutdown=false
+        - -store-gateway.sharding-ring.wait-stability-min-duration=1m
+        - -store-gateway.sharding-ring.zone-awareness-enabled=true
+        - -target=store-gateway
+        - -usage-stats.installation-mode=jsonnet
+        env:
+        - name: A
+          value: all-store-gateways
+        - name: GOMAXPROCS
+          value: "5"
+        - name: GOMEMLIMIT
+          value: "12884901888"
+        - name: JAEGER_REPORTER_MAX_QUEUE_SIZE
+          value: "1000"
+        image: grafana/mimir:2.13.0
+        imagePullPolicy: IfNotPresent
+        name: store-gateway
+        ports:
+        - containerPort: 8080
+          name: http-metrics
+        - containerPort: 9095
+          name: grpc
+        - containerPort: 7946
+          name: gossip-ring
+        readinessProbe:
+          httpGet:
+            path: /ready
+            port: 8080
+          initialDelaySeconds: 15
+          timeoutSeconds: 1
+        resources:
+          limits:
+            memory: 18Gi
+          requests:
+            cpu: "1"
+            memory: 12Gi
+        volumeMounts:
+        - mountPath: /data
+          name: store-gateway-data
+        - mountPath: /etc/mimir
+          name: overrides
+      securityContext:
+        runAsUser: 0
+      terminationGracePeriodSeconds: 120
+      volumes:
+      - configMap:
+          name: overrides
+        name: overrides
+  updateStrategy:
+    type: OnDelete
+  volumeClaimTemplates:
+  - apiVersion: v1
+    kind: PersistentVolumeClaim
+    metadata:
+      name: store-gateway-data
+    spec:
+      accessModes:
+      - ReadWriteOnce
+      resources:
+        requests:
+          storage: 50Gi
+      storageClassName: standard
+---
+apiVersion: etcd.database.coreos.com/v1beta2
+kind: EtcdCluster
+metadata:
+  annotations:
+    etcd.database.coreos.com/scope: clusterwide
+  name: etcd
+  namespace: default
+spec:
+  pod:
+    affinity:
+      podAntiAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+        - labelSelector:
+            matchLabels:
+              etcd_cluster: etcd
+          topologyKey: kubernetes.io/hostname
+    annotations:
+      prometheus.io/port: "2379"
+      prometheus.io/scrape: "true"
+    etcdEnv:
+    - name: ETCD_AUTO_COMPACTION_RETENTION
+      value: 1h
+    labels:
+      name: etcd
+    resources:
+      limits:
+        memory: 512Mi
+      requests:
+        cpu: 500m
+        memory: 512Mi
+  size: 3
+  version: 3.3.13
+---
+apiVersion: keda.sh/v1alpha1
+kind: ScaledObject
+metadata:
+  name: distributor-zone-a
+  namespace: default
+spec:
+  advanced:
+    horizontalPodAutoscalerConfig:
+      behavior:
+        scaleDown:
+          policies:
+          - periodSeconds: 120
+            type: Percent
+            value: 10
+          stabilizationWindowSeconds: 1800
+        scaleUp:
+          policies:
+          - periodSeconds: 120
+            type: Percent
+            value: 50
+          - periodSeconds: 120
+            type: Pods
+            value: 15
+          stabilizationWindowSeconds: 120
+  maxReplicaCount: 30
+  minReplicaCount: 3
+  pollingInterval: 10
+  scaleTargetRef:
+    name: distributor-zone-a
+  triggers:
+  - metadata:
+      ignoreNullValues: "false"
+      metricName: cortex_distributor_zone_a_cpu_hpa_default
+      query: |
+        quantile_over_time(0.95,
+          sum(
+            sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m]))
+            and
+            max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0
+          )[15m:]
+        ) * 1000
+        and
+        count (
+          count_over_time(
+            present_over_time(
+              container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m]
+            )[15m:1m]
+          ) >= 15
+        )
+      serverAddress: http://prometheus.default:9090/prometheus
+      threshold: "2000"
+    name: cortex_distributor_zone_a_cpu_hpa_default
+    type: prometheus
+  - metadata:
+      ignoreNullValues: "false"
+      metricName: cortex_distributor_zone_a_memory_hpa_default
+      query: |
+        quantile_over_time(0.95,
+          sum(
+            (
+              sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"})
+              and
+              max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0
+            ) or vector(0)
+          )[15m:]
+        )
+        +
+        sum(
+          sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m]))
+          and
+          max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0)
+          and
+          max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"})
+          or vector(0)
+        )
+        and
+        count (
+          count_over_time(
+            present_over_time(
+              container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m]
+            )[15m:1m]
+          ) >= 15
+        )
+      serverAddress: http://prometheus.default:9090/prometheus
+      threshold: "2147483648"
+    name: cortex_distributor_zone_a_memory_hpa_default
+    type: prometheus
+---
+apiVersion: keda.sh/v1alpha1
+kind: ScaledObject
+metadata:
+  name: distributor-zone-b
+  namespace: default
+spec:
+  advanced:
+    horizontalPodAutoscalerConfig:
+      behavior:
+        scaleDown:
+          policies:
+          - periodSeconds: 120
+            type: Percent
+            value: 10
+          stabilizationWindowSeconds: 1800
+        scaleUp:
+          policies:
+          - periodSeconds: 120
+            type: Percent
+            value: 50
+          - periodSeconds: 120
+            type: Pods
+            value: 15
+          stabilizationWindowSeconds: 120
+  maxReplicaCount: 30
+  minReplicaCount: 3
+  pollingInterval: 10
+  scaleTargetRef:
+    name: distributor-zone-b
+  triggers:
+  - metadata:
+      ignoreNullValues: "false"
+      metricName: cortex_distributor_zone_b_cpu_hpa_default
+      query: |
+        quantile_over_time(0.95,
+          sum(
+            sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m]))
+            and
+            max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0
+          )[15m:]
+        ) * 1000
+        and
+        count (
+          count_over_time(
+            present_over_time(
+              container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m]
+            )[15m:1m]
+          ) >= 15
+        )
+      serverAddress: http://prometheus.default:9090/prometheus
+      threshold: "2000"
+    name: cortex_distributor_zone_b_cpu_hpa_default
+    type: prometheus
+  - metadata:
+      ignoreNullValues: "false"
+      metricName: cortex_distributor_zone_b_memory_hpa_default
+      query: |
+        quantile_over_time(0.95,
+          sum(
+            (
+              sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"})
+              and
+              max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0
+            ) or vector(0)
+          )[15m:]
+        )
+        +
+        sum(
+          sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m]))
+          and
+          max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0)
+          and
+          max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"})
+          or vector(0)
+        )
+        and
+        count (
+          count_over_time(
+            present_over_time(
+              container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m]
+            )[15m:1m]
+          ) >= 15
+        )
+      serverAddress: http://prometheus.default:9090/prometheus
+      threshold: "2147483648"
+    name: cortex_distributor_zone_b_memory_hpa_default
+    type: prometheus
diff --git a/operations/mimir-tests/test-multi-zone-distributor.jsonnet b/operations/mimir-tests/test-multi-zone-distributor.jsonnet
new file mode 100644
index 00000000000..45045e97f5b
--- /dev/null
+++ b/operations/mimir-tests/test-multi-zone-distributor.jsonnet
@@ -0,0 +1,12 @@
+// Based on test-multi-zone.jsonnet.
+(import 'test-multi-zone.jsonnet') {
+  _config+:: {
+    local availabilityZones = ['us-east-2a', 'us-east-2b'],
+    multi_zone_distributor_enabled: true,
+    multi_zone_distributor_availability_zones: availabilityZones,
+
+    autoscaling_distributor_enabled: true,
+    autoscaling_distributor_min_replicas: 3,
+    autoscaling_distributor_max_replicas: 30,
+  },
+}
diff --git a/operations/mimir-tests/test-multi-zone-generated.yaml b/operations/mimir-tests/test-multi-zone-generated.yaml
index 8fb4577c280..40cba8a70a7 100644
--- a/operations/mimir-tests/test-multi-zone-generated.yaml
+++ b/operations/mimir-tests/test-multi-zone-generated.yaml
@@ -33,27 +33,14 @@ apiVersion: policy/v1
 kind: PodDisruptionBudget
 metadata:
   labels:
-    name: distributor-zone-a
-  name: distributor-zone-a
+    name: distributor
+  name: distributor
   namespace: default
 spec:
   maxUnavailable: 1
   selector:
     matchLabels:
-      name: distributor-zone-a
----
-apiVersion: policy/v1
-kind: PodDisruptionBudget
-metadata:
-  labels:
-    name: distributor-zone-b
-  name: distributor-zone-b
-  namespace: default
-spec:
-  maxUnavailable: 1
-  selector:
-    matchLabels:
-      name: distributor-zone-b
+      name: distributor
 ---
 apiVersion: policy/v1
 kind: PodDisruptionBudget
@@ -306,8 +293,8 @@ apiVersion: v1
 kind: Service
 metadata:
   labels:
-    name: distributor-zone-a
-  name: distributor-zone-a
+    name: distributor
+  name: distributor
   namespace: default
 spec:
   clusterIP: None
@@ -322,29 +309,7 @@ spec:
     port: 7946
     targetPort: 7946
   selector:
-    name: distributor-zone-a
----
-apiVersion: v1
-kind: Service
-metadata:
-  labels:
-    name: distributor-zone-b
-  name: distributor-zone-b
-  namespace: default
-spec:
-  clusterIP: None
-  ports:
-  - name: distributor-http-metrics
-    port: 8080
-    targetPort: 8080
-  - name: distributor-grpc
-    port: 9095
-    targetPort: 9095
-  - name: distributor-gossip-ring
-    port: 7946
-    targetPort: 7946
-  selector:
-    name: distributor-zone-b
+    name: distributor
 ---
 apiVersion: v1
 kind: Service
@@ -690,14 +655,15 @@ spec:
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: distributor-zone-a
+  name: distributor
   namespace: default
 spec:
   minReadySeconds: 10
+  replicas: 3
   revisionHistoryLimit: 10
   selector:
     matchLabels:
-      name: distributor-zone-a
+      name: distributor
   strategy:
     rollingUpdate:
       maxSurge: 15%
       maxUnavailable: 0
@@ -706,127 +672,8 @@ spec:
     metadata:
       labels:
         gossip_ring_member: "true"
-        name: distributor-zone-a
-    spec:
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-            - matchExpressions:
-              - key: topology.kubernetes.io/zone
-                operator: In
-                values:
-                - us-east-2a
-      containers:
-      - args:
-        - -distributor.ha-tracker.enable=true
-        - -distributor.ha-tracker.enable-for-all-users=true
-        - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379
-        - -distributor.ha-tracker.prefix=prom_ha/
-        - -distributor.ha-tracker.store=etcd
-        - -distributor.health-check-ingesters=true
-        - -distributor.ingestion-burst-size=200000
-        - -distributor.ingestion-rate-limit=10000
-        - -distributor.ring.heartbeat-period=1m
-        - -distributor.ring.heartbeat-timeout=4m
-        - -distributor.ring.prefix=
-        - -distributor.ring.store=memberlist
-        - -ingester.ring.heartbeat-timeout=10m
-        - -ingester.ring.prefix=
-        - -ingester.ring.replication-factor=3
-        - -ingester.ring.store=memberlist
-        - -ingester.ring.zone-awareness-enabled=true
-        - -mem-ballast-size-bytes=1073741824
-        - -memberlist.bind-port=7946
-        - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946
-        - -runtime-config.file=/etc/mimir/overrides.yaml
-        - -server.grpc.keepalive.max-connection-age=60s
-        - -server.grpc.keepalive.max-connection-age-grace=5m
-        - -server.grpc.keepalive.max-connection-idle=1m
-        - -server.grpc.keepalive.min-time-between-pings=10s
-        - -server.grpc.keepalive.ping-without-stream-allowed=true
-        - -server.http-listen-port=8080
-        - -shutdown-delay=90s
-        - -target=distributor
-        - -usage-stats.installation-mode=jsonnet
-        env:
-        - name: GOMAXPROCS
-          value: "8"
-        - name: JAEGER_REPORTER_MAX_QUEUE_SIZE
-          value: "1000"
-        image: grafana/mimir:2.13.0
-        imagePullPolicy: IfNotPresent
         name: distributor
-        ports:
-        - containerPort: 8080
-          name: http-metrics
-        - containerPort: 9095
-          name: grpc
-        - containerPort: 7946
-          name: gossip-ring
-        readinessProbe:
-          httpGet:
-            path: /ready
-            port: 8080
-          initialDelaySeconds: 15
-          timeoutSeconds: 1
-        resources:
-          limits:
-            memory: 4Gi
-          requests:
-            cpu: "2"
-            memory: 2Gi
-        volumeMounts:
-        - mountPath: /etc/mimir
-          name: overrides
-      terminationGracePeriodSeconds: 100
-      tolerations:
-      - effect: NoSchedule
-        key: topology
-        operator: Equal
-        value: multi-az
-      topologySpreadConstraints:
-      - labelSelector:
-          matchLabels:
-            name: distributor-zone-a
-        maxSkew: 1
-        topologyKey: kubernetes.io/hostname
-        whenUnsatisfiable: ScheduleAnyway
-      volumes:
-      - configMap:
-          name: overrides
-        name: overrides
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: distributor-zone-b
-  namespace: default
-spec:
-  minReadySeconds: 10
-  revisionHistoryLimit: 10
-  selector:
-    matchLabels:
-      name: distributor-zone-b
-  strategy:
-    rollingUpdate:
-      maxSurge: 15%
-      maxUnavailable: 0
-  template:
-    metadata:
-      labels:
-        gossip_ring_member: "true"
-        name: distributor-zone-b
     spec:
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-            - matchExpressions:
-              - key: topology.kubernetes.io/zone
-                operator: In
-                values:
-                - us-east-2b
       containers:
       - args:
         - -distributor.ha-tracker.enable=true
@@ -890,15 +737,10 @@ spec:
         - mountPath: /etc/mimir
           name: overrides
       terminationGracePeriodSeconds: 100
-      tolerations:
-      - effect: NoSchedule
-        key: topology
-        operator: Equal
-        value: multi-az
       topologySpreadConstraints:
       - labelSelector:
           matchLabels:
-            name: distributor-zone-b
+            name: distributor
         maxSkew: 1
         topologyKey: kubernetes.io/hostname
         whenUnsatisfiable: ScheduleAnyway
@@ -2540,179 +2382,3 @@ spec:
       memory: 512Mi
   size: 3
   version: 3.3.13
----
-apiVersion: keda.sh/v1alpha1
-kind: ScaledObject
-metadata:
-  name: distributor-zone-a
-  namespace: default
-spec:
-  advanced:
-    horizontalPodAutoscalerConfig:
-      behavior:
-        scaleDown:
-          policies:
-          - periodSeconds: 120
-            type: Percent
-            value: 10
-          stabilizationWindowSeconds: 1800
-        scaleUp:
-          policies:
-          - periodSeconds: 120
-            type: Percent
-            value: 50
-          - periodSeconds: 120
-            type: Pods
-            value: 15
-          stabilizationWindowSeconds: 120
-  maxReplicaCount: 30
-  minReplicaCount: 3
-  pollingInterval: 10
-  scaleTargetRef:
-    name: distributor-zone-a
-  triggers:
-  - metadata:
-      ignoreNullValues: "false"
-      metricName: cortex_distributor_zone_a_cpu_hpa_default
-      query: |
-        quantile_over_time(0.95,
-          sum(
-            sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m]))
-            and
-            max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0
-          )[15m:]
-        ) * 1000
-        and
-        count (
-          count_over_time(
-            present_over_time(
-              container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m]
-            )[15m:1m]
-          ) >= 15
-        )
-      serverAddress: http://prometheus.default:9090/prometheus
-      threshold: "2000"
-    name: cortex_distributor_zone_a_cpu_hpa_default
-    type: prometheus
-  - metadata:
-      ignoreNullValues: "false"
-      metricName: cortex_distributor_zone_a_memory_hpa_default
-      query: |
-        quantile_over_time(0.95,
-          sum(
-            (
-              sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"})
-              and
-              max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0
-            ) or vector(0)
-          )[15m:]
-        )
-        +
-        sum(
-          sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m]))
-          and
-          max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0)
-          and
-          max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"})
-          or vector(0)
-        )
-        and
-        count (
-          count_over_time(
-            present_over_time(
-              container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m]
-            )[15m:1m]
-          ) >= 15
-        )
-      serverAddress: http://prometheus.default:9090/prometheus
-      threshold: "2147483648"
-    name: cortex_distributor_zone_a_memory_hpa_default
-    type: prometheus
----
-apiVersion: keda.sh/v1alpha1
-kind: ScaledObject
-metadata:
-  name: distributor-zone-b
-  namespace: default
-spec:
-  advanced:
-    horizontalPodAutoscalerConfig:
-      behavior:
-        scaleDown:
-          policies:
-          - periodSeconds: 120
-            type: Percent
-            value: 10
-          stabilizationWindowSeconds: 1800
-        scaleUp:
-          policies:
-          - periodSeconds: 120
-            type: Percent
-            value: 50
-          - periodSeconds: 120
-            type: Pods
-            value: 15
-          stabilizationWindowSeconds: 120
-  maxReplicaCount: 30
-  minReplicaCount: 3
-  pollingInterval: 10
-  scaleTargetRef:
-    name: distributor-zone-b
-  triggers:
-  - metadata:
-      ignoreNullValues: "false"
-      metricName: cortex_distributor_zone_b_cpu_hpa_default
-      query: |
-        quantile_over_time(0.95,
-          sum(
-            sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m]))
-            and
-            max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0
-          )[15m:]
-        ) * 1000
-        and
-        count (
-          count_over_time(
-            present_over_time(
-              container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m]
-            )[15m:1m]
-          ) >= 15
-        )
-      serverAddress: http://prometheus.default:9090/prometheus
-      threshold: "2000"
-    name: cortex_distributor_zone_b_cpu_hpa_default
-    type: prometheus
-  - metadata:
-      ignoreNullValues: "false"
-      metricName: cortex_distributor_zone_b_memory_hpa_default
-      query: |
-        quantile_over_time(0.95,
-          sum(
-            (
-              sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"})
-              and
-              max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0
-            ) or vector(0)
-          )[15m:]
-        )
-        +
-        sum(
-          sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m]))
-          and
-          max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0)
namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default - type: prometheus diff --git a/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling-generated.yaml b/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling-generated.yaml index 8b24a075e45..e621e14d280 100644 --- a/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling-generated.yaml +++ b/operations/mimir-tests/test-multi-zone-with-store-gateway-automated-downscaling-generated.yaml @@ -33,27 +33,14 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: labels: - name: distributor-zone-a - name: distributor-zone-a + name: distributor + name: distributor namespace: default spec: maxUnavailable: 1 selector: matchLabels: - name: distributor-zone-a ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b - namespace: default -spec: - maxUnavailable: 1 - selector: - matchLabels: - name: distributor-zone-b + name: distributor --- apiVersion: policy/v1 kind: PodDisruptionBudget @@ -369,30 +356,8 @@ apiVersion: v1 kind: Service metadata: labels: - name: distributor-zone-a - name: distributor-zone-a - namespace: default -spec: - clusterIP: None - ports: - - name: distributor-http-metrics - port: 8080 - targetPort: 8080 - - name: distributor-grpc - port: 9095 - targetPort: 9095 - - name: distributor-gossip-ring - port: 7946 - targetPort: 7946 - selector: - name: distributor-zone-a ---- -apiVersion: v1 -kind: Service -metadata: - labels: - name: distributor-zone-b - name: distributor-zone-b + name: distributor + name: distributor namespace: default spec: clusterIP: None @@ -407,7 +372,7 @@ spec: port: 7946 targetPort: 7946 selector: - name: distributor-zone-b + name: distributor --- apiVersion: v1 kind: Service @@ -767,14 +732,15 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: distributor-zone-a + name: distributor namespace: default spec: minReadySeconds: 10 + replicas: 3 revisionHistoryLimit: 10 selector: matchLabels: - name: distributor-zone-a + name: distributor strategy: rollingUpdate: maxSurge: 15% @@ -783,127 +749,8 @@ spec: metadata: labels: gossip_ring_member: "true" - name: distributor-zone-a - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - containers: - - args: - - -distributor.ha-tracker.enable=true - - -distributor.ha-tracker.enable-for-all-users=true - - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 - - -distributor.ha-tracker.prefix=prom_ha/ - - -distributor.ha-tracker.store=etcd - - -distributor.health-check-ingesters=true - - -distributor.ingestion-burst-size=200000 - - -distributor.ingestion-rate-limit=10000 - - -distributor.ring.heartbeat-period=1m - - -distributor.ring.heartbeat-timeout=4m - - -distributor.ring.prefix= - - 
-distributor.ring.store=memberlist - - -ingester.ring.heartbeat-timeout=10m - - -ingester.ring.prefix= - - -ingester.ring.replication-factor=3 - - -ingester.ring.store=memberlist - - -ingester.ring.zone-awareness-enabled=true - - -mem-ballast-size-bytes=1073741824 - - -memberlist.bind-port=7946 - - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 - - -runtime-config.file=/etc/mimir/overrides.yaml - - -server.grpc.keepalive.max-connection-age=60s - - -server.grpc.keepalive.max-connection-age-grace=5m - - -server.grpc.keepalive.max-connection-idle=1m - - -server.grpc.keepalive.min-time-between-pings=10s - - -server.grpc.keepalive.ping-without-stream-allowed=true - - -server.http-listen-port=8080 - - -shutdown-delay=90s - - -target=distributor - - -usage-stats.installation-mode=jsonnet - env: - - name: GOMAXPROCS - value: "8" - - name: JAEGER_REPORTER_MAX_QUEUE_SIZE - value: "1000" - image: grafana/mimir:2.13.0 - imagePullPolicy: IfNotPresent name: distributor - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 9095 - name: grpc - - containerPort: 7946 - name: gossip-ring - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 15 - timeoutSeconds: 1 - resources: - limits: - memory: 4Gi - requests: - cpu: "2" - memory: 2Gi - volumeMounts: - - mountPath: /etc/mimir - name: overrides - terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az - topologySpreadConstraints: - - labelSelector: - matchLabels: - name: distributor-zone-a - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - volumes: - - configMap: - name: overrides - name: overrides ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: distributor-zone-b - namespace: default -spec: - minReadySeconds: 10 - revisionHistoryLimit: 10 - selector: - matchLabels: - name: distributor-zone-b - strategy: - rollingUpdate: - maxSurge: 15% - maxUnavailable: 0 - template: - metadata: - labels: - gossip_ring_member: "true" - name: distributor-zone-b spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2b containers: - args: - -distributor.ha-tracker.enable=true @@ -967,15 +814,10 @@ spec: - mountPath: /etc/mimir name: overrides terminationGracePeriodSeconds: 100 - tolerations: - - effect: NoSchedule - key: topology - operator: Equal - value: multi-az topologySpreadConstraints: - labelSelector: matchLabels: - name: distributor-zone-b + name: distributor maxSkew: 1 topologyKey: kubernetes.io/hostname whenUnsatisfiable: ScheduleAnyway @@ -2670,182 +2512,6 @@ webhooks: sideEffects: NoneOnDryRun timeoutSeconds: 10 --- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-a - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-a - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) 
(rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_a_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_a_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-a.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-a.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-a.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-a.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-a.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_a_memory_hpa_default - type: prometheus ---- -apiVersion: keda.sh/v1alpha1 -kind: ScaledObject -metadata: - name: distributor-zone-b - namespace: default -spec: - advanced: - horizontalPodAutoscalerConfig: - behavior: - scaleDown: - policies: - - periodSeconds: 120 - type: Percent - value: 10 - stabilizationWindowSeconds: 1800 - scaleUp: - policies: - - periodSeconds: 120 - type: Percent - value: 50 - - periodSeconds: 120 - type: Pods - value: 15 - stabilizationWindowSeconds: 120 - maxReplicaCount: 30 - minReplicaCount: 3 - pollingInterval: 10 - scaleTargetRef: - name: distributor-zone-b - triggers: - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_cpu_hpa_default - query: | - quantile_over_time(0.95, - sum( - sum by (pod) (rate(container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[5m])) - and - max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - )[15m:] - ) * 1000 - and - count ( - count_over_time( - present_over_time( - container_cpu_usage_seconds_total{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2000" - name: cortex_distributor_zone_b_cpu_hpa_default - type: prometheus - - metadata: - ignoreNullValues: "false" - metricName: cortex_distributor_zone_b_memory_hpa_default - query: | - quantile_over_time(0.95, - sum( - ( - sum by (pod) (container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}) - and - 
max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true",pod=~"distributor-zone-b.*"}[1m])) > 0 - ) or vector(0) - )[15m:] - ) - + - sum( - sum by (pod) (max_over_time(kube_pod_container_resource_requests{container="distributor", namespace="default", resource="memory",pod=~"distributor-zone-b.*"}[15m])) - and - max by (pod) (changes(kube_pod_container_status_restarts_total{container="distributor", namespace="default",pod=~"distributor-zone-b.*"}[15m]) > 0) - and - max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled",pod=~"distributor-zone-b.*"}) - or vector(0) - ) - and - count ( - count_over_time( - present_over_time( - container_memory_working_set_bytes{container="distributor",namespace="default",pod=~"distributor-zone-b.*"}[1m] - )[15m:1m] - ) >= 15 - ) - serverAddress: http://prometheus.default:9090/prometheus - threshold: "2147483648" - name: cortex_distributor_zone_b_memory_hpa_default - type: prometheus ---- apiVersion: admissionregistration.k8s.io/v1 kind: ValidatingWebhookConfiguration metadata: diff --git a/operations/mimir-tests/test-multi-zone.jsonnet b/operations/mimir-tests/test-multi-zone.jsonnet index 55332c87197..0eb300d8c53 100644 --- a/operations/mimir-tests/test-multi-zone.jsonnet +++ b/operations/mimir-tests/test-multi-zone.jsonnet @@ -19,14 +19,6 @@ mimir { multi_zone_store_gateway_enabled: true, multi_zone_store_gateway_replicas: 3, - - local availabilityZones = ['us-east-2a', 'us-east-2b'], - multi_zone_distributor_enabled: true, - multi_zone_distributor_availability_zones: availabilityZones, - - autoscaling_distributor_enabled: true, - autoscaling_distributor_min_replicas: 3, - autoscaling_distributor_max_replicas: 30, }, ingester_env_map+:: {