Skip to content

Commit

Permalink
Use dedicated services for metrics release (#401)
Browse files Browse the repository at this point in the history
  • Loading branch information
mkjpryor authored Sep 4, 2024
1 parent e4e1ec1 commit 856f8fb
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 11 deletions.
7 changes: 7 additions & 0 deletions .github/values/arcus/latest-addons.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
addons:
  kubernetesDashboard:
    enabled: true
  ingress:
    enabled: true
  monitoring:
    enabled: true
7 changes: 7 additions & 0 deletions .github/values/leafcloud/latest-addons.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
addons:
  kubernetesDashboard:
    enabled: true
  ingress:
    enabled: true
  monitoring:
    enabled: true
74 changes: 73 additions & 1 deletion .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ on:
default: false

jobs:
# This job tests a clean deployment against the latest version
# This job tests a basic deployment against the latest version
# It is the only job that runs when tests-full=false
# For tests-full=true it creates an internal network + router and runs Sonobuoy in conformance mode
# For tests-full=false it uses a pre-existing internal network and runs Sonobuoy in quick mode
Expand Down Expand Up @@ -95,6 +95,78 @@ jobs:
process-id: ${{ github.repository }}/${{ github.run_id }}/${{ github.job }}
if: ${{ vars.TARGET_CLOUD == 'arcus' && always() }}

  # This job tests a deployment against the latest version with the dashboard, monitoring and ingress enabled
  latest-addons:
    runs-on: ubuntu-latest
    if: ${{ inputs.tests-full }}
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          ref: ${{ inputs.ref }}

      - name: Create kind cluster
        uses: helm/[email protected]

      - name: Set up test environment
        uses: ./.github/actions/setup

      - name: Write cloud credential
        run: echo "$OS_CLOUDS" > ./clouds.yaml
        env:
          OS_CLOUDS: ${{ secrets.OS_CLOUDS }}

      # This is the point at which we start to consume OpenStack resources
      # We use the same resource as the Azimuth CI, so acquire the same CI lock
      - name: Acquire S3 lock
        uses: azimuth-cloud/github-actions/s3-lock@master
        with:
          host: ${{ vars.CI_S3_LOCK_HOST }}
          access-key: ${{ secrets.CI_S3_LOCK_ACCESS_KEY }}
          secret-key: ${{ secrets.CI_S3_LOCK_SECRET_KEY }}
          bucket: ${{ vars.CI_S3_LOCK_BUCKET }}
          action: acquire
          # Include the job ID in the process ID
          process-id: ${{ github.repository }}/${{ github.run_id }}/${{ github.job }}
        # GitHub terminates jobs after 6 hours
        # We don't want jobs to acquire the lock then get timed out before they can finish
        # So wait a maximum of 3 hours to acquire the lock, leaving 3 hours for other tasks in the workflow
        timeout-minutes: 180
        if: ${{ vars.TARGET_CLOUD == 'arcus' }}

      - name: Deploy Kubernetes 1.31 with all addons
        uses: ./.github/actions/upgrade-and-test
        with:
          name: ci-${{ github.run_id }}-${{ github.job }}
          os-cloud: ${{ vars.TARGET_CLOUD }}
          chart-version: ${{ inputs.chart-version }}
          kubernetes-version: ${{ fromJson(inputs.images).kube-1-31-version }}
          image-id: ${{ fromJson(inputs.images).kube-1-31-image }}
          defaults-path: ./.github/values/${{ vars.TARGET_CLOUD }}/base.yaml
          overrides-path: ./.github/values/${{ vars.TARGET_CLOUD }}/latest-addons.yaml

      - name: Delete Kubernetes 1.31 deployment
        run: helm delete ci-${{ github.run_id }}-${{ github.job }} --wait
        if: ${{ always() }}

      - name: Upload logs
        uses: ./.github/actions/upload-logs
        with:
          name-suffix: ${{ github.job }}
        if: ${{ always() }}

      # Release the CI lock when we are done
      - name: Release S3 lock
        uses: azimuth-cloud/github-actions/s3-lock@master
        with:
          host: ${{ vars.CI_S3_LOCK_HOST }}
          access-key: ${{ secrets.CI_S3_LOCK_ACCESS_KEY }}
          secret-key: ${{ secrets.CI_S3_LOCK_SECRET_KEY }}
          bucket: ${{ vars.CI_S3_LOCK_BUCKET }}
          action: release
          process-id: ${{ github.repository }}/${{ github.run_id }}/${{ github.job }}
        if: ${{ vars.TARGET_CLOUD == 'arcus' && always() }}

# This job tests the etcd volume support
# It only runs for non-draft PRs
# It uses a pre-existing internal network and the default volume type
Expand Down
52 changes: 42 additions & 10 deletions charts/cluster-addons/templates/cni/calico.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -132,12 +132,39 @@ spec:
targetNamespace: {{ .Values.cni.calico.release.namespace }}
releaseName: cni-calico-monitoring
manifestSources:
# As of 3.28.1, the Tigera operator now creates a calico-typha-metrics service itself,
# but uses a different port name
# On upgraded deployments, the Tigera operator happily adopts the existing service but breaks
# the metrics collection when the port is renamed
# On new deployments, this release fails because the service exists but is not owned by
# the Helm release
#
# In order to avoid this happening again, this release no longer relies on any services
# created by the operator for metrics collection, instead managing our own services with
# a naming convention that is unlikely to conflict with Calico resources

# calico-kube-controllers
- template: |
apiVersion: v1
kind: Service
metadata:
name: cni-calico-monitoring-kube-controllers
namespace: calico-system
labels:
app.kubernetes.io/name: cni-calico-monitoring
app.kubernetes.io/component: kube-controllers
spec:
clusterIP: None
ports:
- name: metrics-port
port: 9094
selector:
k8s-app: calico-kube-controllers
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: calico-kube-controllers-metrics
name: cni-calico-monitoring-kube-controllers
namespace: calico-system
spec:
endpoints:
Expand All @@ -147,16 +174,18 @@ spec:
- calico-system
selector:
matchLabels:
k8s-app: calico-kube-controllers
app.kubernetes.io/name: cni-calico-monitoring
app.kubernetes.io/component: kube-controllers
# calico-node
- template: |
apiVersion: v1
kind: Service
metadata:
name: calico-node-metrics
name: cni-calico-monitoring-node
namespace: calico-system
labels:
k8s-app: calico-node
app.kubernetes.io/name: cni-calico-monitoring
app.kubernetes.io/component: node
spec:
clusterIP: None
ports:
Expand All @@ -168,7 +197,7 @@ spec:
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: calico-node-metrics
name: cni-calico-monitoring-node
namespace: calico-system
spec:
endpoints:
Expand All @@ -178,16 +207,18 @@ spec:
- calico-system
selector:
matchLabels:
k8s-app: calico-node
app.kubernetes.io/name: cni-calico-monitoring
app.kubernetes.io/component: node
# calico-typha
- template: |
apiVersion: v1
kind: Service
metadata:
name: calico-typha-metrics
name: cni-calico-monitoring-typha
namespace: calico-system
labels:
k8s-app: calico-typha
app.kubernetes.io/name: cni-calico-monitoring
app.kubernetes.io/component: typha
spec:
clusterIP: None
ports:
Expand All @@ -199,7 +230,7 @@ spec:
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: calico-typha-metrics
name: cni-calico-monitoring-typha
namespace: calico-system
spec:
endpoints:
Expand All @@ -209,7 +240,8 @@ spec:
- calico-system
selector:
matchLabels:
k8s-app: calico-typha
app.kubernetes.io/name: cni-calico-monitoring
app.kubernetes.io/component: typha
# dashboard
- template: |
apiVersion: v1
Expand Down

0 comments on commit 856f8fb

Please sign in to comment.