Skip to content

Cilium E2E Upgrade (ci-e2e-upgrade) #1110

Cilium E2E Upgrade (ci-e2e-upgrade)

Cilium E2E Upgrade (ci-e2e-upgrade) #1110

name: Cilium E2E Upgrade (ci-e2e-upgrade)
# Any change in triggers needs to be reflected in the concurrency group.
on:
workflow_dispatch:
inputs:
PR-number:
description: "Pull request number."
required: true
context-ref:
description: "Context in which the workflow runs. If PR is from a fork, will be the PR target branch (general case). If PR is NOT from a fork, will be the PR branch itself (this allows committers to test changes to workflows directly from PRs)."
required: true
SHA:
description: "SHA under test (head of the PR branch)."
required: true
extra-args:
description: "[JSON object] Arbitrary arguments passed from the trigger comment via regex capture group. Parse with 'fromJson(inputs.extra-args).argName' in workflow."
required: false
default: '{}'
push:
branches:
- 'renovate/main-**'
# Run every 6 hours
schedule:
- cron: '0 5/6 * * *'
# By specifying the access of one of the scopes, all of those that are not
# specified are set to 'none'.
permissions:
# To read actions state with catchpoint/workflow-telemetry-action
actions: read
# To be able to access the repository with actions/checkout
contents: read
# To allow retrieving information from the PR API
pull-requests: read
# To be able to set commit status
statuses: write
concurrency:
# Structure:
# - Workflow name
# - Event type
# - A unique identifier depending on event type:
# - schedule: SHA
# - workflow_dispatch: PR number
#
# This structure ensures a unique concurrency group name is generated for each
# type of testing, such that re-runs will cancel the previous run.
group: |
${{ github.workflow }}
${{ github.event_name }}
${{
(github.event_name == 'push' && github.sha) ||
(github.event_name == 'schedule' && github.sha) ||
(github.event_name == 'workflow_dispatch' && github.event.inputs.PR-number)
}}
cancel-in-progress: true
env:
cilium_cli_ci_version:
jobs:
echo-inputs:
if: ${{ github.event_name == 'workflow_dispatch' }}
name: Echo Workflow Dispatch Inputs
runs-on: ubuntu-22.04
steps:
- name: Echo Workflow Dispatch Inputs
run: |
echo '${{ tojson(inputs) }}'
commit-status-start:
name: Commit Status Start
runs-on: ubuntu-latest
steps:
- name: Set initial commit status
uses: myrotvorets/set-commit-status-action@3730c0a348a2ace3c110851bed53331bc6406e9f # v2.0.1
with:
sha: ${{ inputs.SHA || github.sha }}
setup-and-test:
runs-on: ${{ vars.GH_RUNNER_EXTRA_POWER }}
name: 'Setup & Test'
env:
job_name: 'Setup & Test'
strategy:
fail-fast: false
max-parallel: 22
matrix:
include:
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# ! NOTE: keep conformance-e2e.yaml config in sync !
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- name: '1'
# renovate: datasource=docker depName=quay.io/lvh-images/kind
kernel: 'rhel8-20240404.144247'
kube-proxy: 'iptables'
kpr: 'false'
tunnel: 'vxlan'
host-fw: 'true'
- name: '2'
# renovate: datasource=docker depName=quay.io/lvh-images/kind
kernel: '5.4-20240710.064909'
kube-proxy: 'iptables'
kpr: 'false'
tunnel: 'disabled'
host-fw: 'true'
- name: '3'
# renovate: datasource=docker depName=quay.io/lvh-images/kind
kernel: '5.10-20240710.064909'
kube-proxy: 'iptables'
kpr: 'false'
tunnel: 'disabled'
endpoint-routes: 'true'
- name: '4'
# renovate: datasource=docker depName=quay.io/lvh-images/kind
kernel: '5.10-20240710.064909'
kube-proxy: 'iptables'
kpr: 'true'
devices: '{eth0,eth1}'
secondary-network: 'true'
tunnel: 'vxlan'
lb-mode: 'snat'
endpoint-routes: 'true'
egress-gateway: 'true'
- name: '5'
# renovate: datasource=docker depName=quay.io/lvh-images/kind
kernel: '5.15-20240710.064909'
kube-proxy: 'iptables'
kpr: 'true'
devices: '{eth0,eth1}'
secondary-network: 'true'
tunnel: 'disabled'
lb-mode: 'dsr'
endpoint-routes: 'true'
egress-gateway: 'true'
host-fw: 'false' # enabling breaks downgrading (missed tail calls)
- name: '6'
# renovate: datasource=docker depName=quay.io/lvh-images/kind
kernel: '6.1-20240710.064909'
kube-proxy: 'none'
kpr: 'true'
tunnel: 'vxlan'
lb-mode: 'snat'
egress-gateway: 'true'
host-fw: 'true'
lb-acceleration: 'testing-only'
# Disable until https://github.com/cilium/cilium/issues/30717
# has been resolved.
# ingress-controller: 'true'
bgp-control-plane: 'true'
- name: '7'
# renovate: datasource=docker depName=quay.io/lvh-images/kind
kernel: 'bpf-next-20240711.013133'
kube-proxy: 'none'
kpr: 'true'
devices: '{eth0,eth1}'
secondary-network: 'true'
tunnel: 'disabled'
lb-mode: 'snat'
egress-gateway: 'true'
lb-acceleration: 'testing-only'
# Disable until https://github.com/cilium/cilium/issues/30717
# has been resolved.
# ingress-controller: 'true'
local-redirect-policy: 'true'
node-local-dns: 'true'
- name: '8'
# renovate: datasource=docker depName=quay.io/lvh-images/kind
kernel: 'bpf-next-20240711.013133'
kube-proxy: 'iptables'
kpr: 'false'
tunnel: 'geneve'
endpoint-routes: 'true'
- name: '9'
# renovate: datasource=docker depName=quay.io/lvh-images/kind
kernel: '5.10-20240710.064909'
kube-proxy: 'iptables'
kpr: 'true'
devices: '{eth0,eth1}'
secondary-network: 'true'
tunnel: 'vxlan'
encryption: 'wireguard'
encryption-node: 'false'
lb-mode: 'snat'
endpoint-routes: 'true'
egress-gateway: 'true'
- name: '10'
# renovate: datasource=docker depName=quay.io/lvh-images/kind
kernel: '5.15-20240710.064909'
kube-proxy: 'iptables'
kpr: 'false'
tunnel: 'disabled'
encryption: 'wireguard'
encryption-node: 'false'
- name: '11'
# renovate: datasource=docker depName=quay.io/lvh-images/kind
kernel: '6.1-20240710.064909'
kube-proxy: 'none'
kpr: 'true'
devices: '{eth0,eth1}'
secondary-network: 'true'
tunnel: 'vxlan'
encryption: 'wireguard'
encryption-node: 'true'
lb-mode: 'snat'
egress-gateway: 'true'
# Disable until https://github.com/cilium/cilium/issues/30717
# has been resolved.
# ingress-controller: 'true'
- name: '12'
# renovate: datasource=docker depName=quay.io/lvh-images/kind
kernel: 'bpf-next-20240711.013133'
kube-proxy: 'none'
kpr: 'true'
devices: '{eth0,eth1}'
secondary-network: 'true'
tunnel: 'disabled'
encryption: 'wireguard'
encryption-node: 'true'
lb-mode: 'snat'
egress-gateway: 'true'
ingress-controller: 'true'
- name: '14'
# Switch to 5.15 until https://github.com/cilium/cilium/issues/27642
# has been resolved. https://github.com/cilium/cilium/pull/30837#issuecomment-1960897445
# explains why 5.4 might cause north-south-loadbalancing tests to
# fail.
# renovate: datasource=docker depName=quay.io/lvh-images/kind
kernel: '5.15-20240710.064909'
kube-proxy: 'iptables'
kpr: 'true'
devices: '{eth0,eth1}'
secondary-network: 'true'
tunnel: 'vxlan'
lb-mode: 'snat'
egress-gateway: 'true'
lb-acceleration: 'testing-only'
- name: '15'
# renovate: datasource=docker depName=quay.io/lvh-images/kind
kernel: 'bpf-next-20240711.013133'
kube-proxy: 'none'
kpr: 'true'
devices: '{eth0,eth1}'
secondary-network: 'true'
tunnel: 'disabled'
# Disable until https://github.com/cilium/cilium/issues/30717
# has been resolved.
# ingress-controller: 'true'
# Disable bpf.tproxy=true until https://github.com/cilium/cilium/issues/31918
# has been resolved.
misc: 'bpfClockProbe=false,cni.uninstall=false'
ciliumendpointslice: 'true'
- name: '16'
# renovate: datasource=docker depName=quay.io/lvh-images/kind
kernel: '5.15-20240710.064909'
kube-proxy: 'none'
kpr: 'true'
devices: '{eth0,eth1}'
secondary-network: 'true'
tunnel: 'vxlan'
lb-mode: 'snat'
encryption: 'wireguard'
encryption-node: 'false'
host-fw: 'true'
ciliumendpointslice: 'true'
# Disable until 1.17 has been released, so that 1.16 to 1.17 upgrade
# can be tested with netkit.
# - name: '17'
# # renovate: datasource=docker depName=quay.io/lvh-images/kind
# kernel: 'bpf-20240711.013133'
# misc: 'bpf.datapathMode=netkit,bpf.masquerade=true,enableIPv4BIGTCP=true,enableIPv6BIGTCP=true'
# kube-proxy: 'none'
# kpr: 'true'
# ipv6: 'true'
# tunnel: 'disabled'
# devices: '{eth0,eth1}'
# secondary-network: 'true'
#
# - name: '18'
# # renovate: datasource=docker depName=quay.io/lvh-images/kind
# kernel: 'bpf-20240711.013133'
# misc: 'bpf.datapathMode=netkit-l2,bpf.masquerade=true,enableIPv4BIGTCP=true,enableIPv6BIGTCP=true'
# kube-proxy: 'none'
# kpr: 'true'
# ipv6: 'true'
# tunnel: 'disabled'
# devices: '{eth0,eth1}'
# secondary-network: 'true'
#
# - name: '19'
# # renovate: datasource=docker depName=quay.io/lvh-images/kind
# kernel: 'bpf-20240711.013133'
# misc: 'bpf.datapathMode=netkit,bpf.masquerade=true'
# kube-proxy: 'none'
# kpr: 'true'
# tunnel: 'vxlan'
# devices: '{eth0,eth1}'
# secondary-network: 'true'
#
# - name: '20'
# # renovate: datasource=docker depName=quay.io/lvh-images/kind
# kernel: 'bpf-20240711.013133'
# misc: 'bpf.datapathMode=netkit-l2,bpf.masquerade=true'
# kube-proxy: 'none'
# kpr: 'true'
# tunnel: 'vxlan'
# devices: '{eth0,eth1}'
# secondary-network: 'true'
#
# - name: '21'
# # renovate: datasource=docker depName=quay.io/lvh-images/kind
# kernel: 'bpf-20240711.013133'
# misc: 'bpf.datapathMode=netkit,bpf.masquerade=true'
# kube-proxy: 'none'
# kpr: 'true'
# tunnel: 'geneve'
# devices: '{eth0,eth1}'
# secondary-network: 'true'
#
# - name: '22'
# # renovate: datasource=docker depName=quay.io/lvh-images/kind
# kernel: 'bpf-20240711.013133'
# misc: 'bpf.datapathMode=netkit-l2,bpf.masquerade=true'
# kube-proxy: 'none'
# kpr: 'true'
# tunnel: 'geneve'
# devices: '{eth0,eth1}'
# secondary-network: 'true'
timeout-minutes: 90
steps:
- name: Collect Workflow Telemetry
uses: catchpoint/workflow-telemetry-action@94c3c3d9567a0205de6da68a76c428ce4e769af1 # v2.0.0
with:
comment_on_pr: false
- name: Checkout context ref (trusted)
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
ref: ${{ inputs.context-ref || github.sha }}
persist-credentials: false
- name: Cleanup Disk space in runner
uses: ./.github/actions/disk-cleanup
- name: Set Environment Variables
uses: ./.github/actions/set-env-variables
- name: Set up job variables
id: vars
run: |
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
SHA="${{ inputs.SHA }}"
else
SHA="${{ github.sha }}"
fi
echo sha=${SHA} >> $GITHUB_OUTPUT
CILIUM_DOWNGRADE_VERSION=$(contrib/scripts/print-downgrade-version.sh stable)
echo downgrade_version=${CILIUM_DOWNGRADE_VERSION} >> $GITHUB_OUTPUT
- name: Derive stable Cilium installation config
id: cilium-stable-config
uses: ./.github/actions/cilium-config
with:
image-tag: ${{ steps.vars.outputs.downgrade_version }}
chart-dir: './untrusted/cilium-downgrade/install/kubernetes/cilium/'
tunnel: ${{ matrix.tunnel }}
devices: ${{ matrix.devices }}
endpoint-routes: ${{ matrix.endpoint-routes }}
ipv6: ${{ matrix.ipv6 }}
kpr: ${{ matrix.kpr }}
lb-mode: ${{ matrix.lb-mode }}
lb-acceleration: ${{ matrix.lb-acceleration }}
encryption: ${{ matrix.encryption }}
encryption-node: ${{ matrix.encryption-node }}
egress-gateway: ${{ matrix.egress-gateway }}
host-fw: ${{ matrix.host-fw }}
mutual-auth: false
ingress-controller: ${{ matrix.ingress-controller }}
misc: ${{ matrix.misc || 'bpfClockProbe=false,cni.uninstall=false' }}
ciliumendpointslice: ${{ matrix.ciliumendpointslice }}
local-redirect-policy: ${{ matrix.local-redirect-policy }}
bgp-control-plane: ${{ matrix.bgp-control-plane }}
- name: Derive newest Cilium installation config
id: cilium-newest-config
uses: ./.github/actions/cilium-config
with:
image-tag: ${{ steps.vars.outputs.sha }}
chart-dir: './untrusted/cilium-newest/install/kubernetes/cilium'
tunnel: ${{ matrix.tunnel }}
devices: ${{ matrix.devices }}
endpoint-routes: ${{ matrix.endpoint-routes }}
ipv6: ${{ matrix.ipv6 }}
kpr: ${{ matrix.kpr }}
lb-mode: ${{ matrix.lb-mode }}
lb-acceleration: ${{ matrix.lb-acceleration }}
encryption: ${{ matrix.encryption }}
encryption-node: ${{ matrix.encryption-node }}
egress-gateway: ${{ matrix.egress-gateway }}
host-fw: ${{ matrix.host-fw }}
mutual-auth: false
ingress-controller: ${{ matrix.ingress-controller }}
misc: ${{ matrix.misc || 'bpfClockProbe=false,cni.uninstall=false' }}
ciliumendpointslice: ${{ matrix.ciliumendpointslice }}
local-redirect-policy: ${{ matrix.local-redirect-policy }}
bgp-control-plane: ${{ matrix.bgp-control-plane }}
- name: Set Kind params
id: kind-params
shell: bash
run: |
IP_FAM="dual"
if [ "${{ matrix.ipv6 }}" == "false" ]; then
IP_FAM="ipv4"
fi
echo params="--xdp --secondary-network \"\" 3 \"\" \"\" ${{ matrix.kube-proxy }} $IP_FAM" >> $GITHUB_OUTPUT
- name: Provision K8s on LVH VM
uses: ./.github/actions/lvh-kind
with:
test-name: e2e-conformance
kernel: ${{ matrix.kernel }}
kind-params: "${{ steps.kind-params.outputs.params }}"
kind-image: ${{ env.KIND_K8S_IMAGE }}
- name: Install Cilium CLI
uses: cilium/cilium-cli@62bd4511031211b50a4623870955a5ad27b43e3b # v0.16.16
with:
skip-build: ${{ env.CILIUM_CLI_SKIP_BUILD }}
image-repo: ${{ env.CILIUM_CLI_IMAGE_REPO }}
image-tag: ${{ steps.vars.outputs.sha }}
# Warning: since this is a privileged workflow, subsequent workflow job
# steps must take care not to execute untrusted code.
- name: Checkout pull request branch (NOT TRUSTED)
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
ref: ${{ steps.vars.outputs.sha }}
persist-credentials: false
path: untrusted/cilium-newest
sparse-checkout: |
install/kubernetes/cilium
examples
- name: Checkout ${{ steps.vars.outputs.downgrade_version }} branch to get the Helm chart
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
ref: ${{ steps.vars.outputs.downgrade_version }}
persist-credentials: false
path: untrusted/cilium-downgrade
sparse-checkout: |
install/kubernetes/cilium
- name: Wait for images to be available
timeout-minutes: 10
shell: bash
run: |
for image in cilium-ci operator-generic-ci hubble-relay-ci ; do
until docker manifest inspect quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/$image:${{ steps.vars.outputs.sha }} &> /dev/null; do sleep 45s; done
done
- name: Install Cilium ${{ env.cilium_stable_version }}
shell: bash
run: |
kubectl patch node kind-worker3 --type=json -p='[{"op":"add","path":"/metadata/labels/cilium.io~1no-schedule","value":"true"}]'
cilium install \
${{ steps.cilium-stable-config.outputs.config }}
cilium status --wait
kubectl get pods --all-namespaces -o wide
kubectl -n kube-system exec daemonset/cilium -- cilium status
mkdir -p cilium-junits
- name: Install node local DNS
if: ${{ matrix.node-local-dns == 'true' }}
shell: bash
run: |
kubedns=$(kubectl get svc kube-dns -n kube-system -o jsonpath={.spec.clusterIP}) && sed -i "s/__PILLAR__DNS__SERVER__/$kubedns/g;" untrusted/cilium-newest/examples/kubernetes-local-redirect/node-local-dns.yaml
sed -i "s/__PILLAR__UPSTREAM__SERVERS__/1.1.1.1/g;" untrusted/cilium-newest/examples/kubernetes-local-redirect/node-local-dns.yaml
kubectl apply -k untrusted/cilium-newest/examples/kubernetes-local-redirect
kubectl rollout status -n kube-system ds/node-local-dns
- name: Start conn-disrupt-test
shell: bash
run: |
# Create pods which establish long lived connections. It will be used by
# subsequent connectivity tests with --include-conn-disrupt-test to catch any
# interruption in such flows.
cilium connectivity test --include-conn-disrupt-test --conn-disrupt-test-setup \
--conn-disrupt-test-restarts-path "./cilium-conn-disrupt-restarts" \
--conn-disrupt-dispatch-interval 0ms
- name: Upgrade Cilium
shell: bash
run: |
cilium upgrade \
${{ steps.cilium-newest-config.outputs.config }}
cilium status --wait --wait-duration=10m
kubectl get pods --all-namespaces -o wide
kubectl -n kube-system exec daemonset/cilium -- cilium status
- name: Test Cilium after upgrade
shell: bash
run: |
EXTRA=()
if [ "${{ matrix.secondary-network }}" = "true" ]; then
EXTRA+=("--secondary-network-iface=eth1")
fi
# it's fine to ignore the "No egress gateway found" drop reason as this may be caused by the kind=echo pods
# sending traffic while the egressgw policy map is still being populated.
#
# The actual connectivity test will ensure that the map is in sync with the policy and that egressgw traffic
# always go through the correct gateway
EXTRA+=("--expected-drop-reasons=+No egress gateway found")
cilium connectivity test --include-unsafe-tests --collect-sysdump-on-failure \
--include-conn-disrupt-test \
--conn-disrupt-test-restarts-path "./cilium-conn-disrupt-restarts" \
--flush-ct \
--sysdump-hubble-flows-count=1000000 --sysdump-hubble-flows-timeout=5m \
--sysdump-output-filename "cilium-sysdump-${{ matrix.name }}-<ts>" \
--junit-file "cilium-junits/${{ env.job_name }} (${{ matrix.name }}).xml" \
--junit-property github_job_step="Run tests upgrade 2 (${{ matrix.name }})" \
"${EXTRA[@]}"
# --flush-ct interrupts the flows, so we need to set up again.
cilium connectivity test --include-conn-disrupt-test --conn-disrupt-test-setup \
--conn-disrupt-test-restarts-path "./cilium-conn-disrupt-restarts" \
--conn-disrupt-dispatch-interval 0ms
- name: Downgrade Cilium ${{ env.cilium_stable_version }}
shell: bash
run: |
cilium upgrade \
${{ steps.cilium-stable-config.outputs.config }}
cilium status --wait --wait-duration=10m
kubectl get pods --all-namespaces -o wide
kubectl -n kube-system exec daemonset/cilium -- cilium status
- name: Test Cilium after downgrade to ${{ env.cilium_stable_version }}
shell: bash
run: |
EXTRA=()
if [ "${{ matrix.secondary-network }}" = "true" ]; then
EXTRA+=("--secondary-network-iface=eth1")
fi
# it's fine to ignore the "No egress gateway found" drop reason as this may be caused by the kind=echo pods
# sending traffic while the egressgw policy map is still being populated.
#
# The actual connectivity test will ensure that the map is in sync with the policy and that egressgw traffic
# always go through the correct gateway
EXTRA+=("--expected-drop-reasons=+No egress gateway found")
cilium connectivity test --include-unsafe-tests --collect-sysdump-on-failure \
--include-conn-disrupt-test \
--conn-disrupt-test-restarts-path "./cilium-conn-disrupt-restarts" \
--flush-ct \
--sysdump-hubble-flows-count=1000000 --sysdump-hubble-flows-timeout=5m \
--sysdump-output-filename "cilium-sysdump-${{ matrix.name }}-<ts>" \
--junit-file "cilium-junits/${{ env.job_name }} (${{ matrix.name }}).xml" \
--junit-property github_job_step="Run tests upgrade 2 (${{ matrix.name }})" \
"${EXTRA[@]}"
- name: Fetch artifacts
if: ${{ !success() }}
shell: bash
run: |
kubectl get pods --all-namespaces -o wide
cilium status
mkdir -p cilium-sysdumps
cilium sysdump --output-filename cilium-sysdump-${{ matrix.name }}-final
- name: Upload artifacts
if: ${{ !success() }}
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
with:
name: cilium-sysdumps-${{ matrix.name }}
path: cilium-sysdump-*.zip
- name: Upload JUnits [junit]
if: ${{ always() }}
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
with:
name: cilium-junits-${{ matrix.name }}
path: cilium-junits/*.xml
- name: Publish Test Results As GitHub Summary
if: ${{ always() }}
uses: aanm/junit2md@332ebf0fddd34e91b03a832cfafaa826306558f9 # v0.0.3
with:
junit-directory: "cilium-junits"
merge-upload:
if: ${{ always() }}
name: Merge and Upload Artifacts
runs-on: ubuntu-latest
needs: setup-and-test
steps:
- name: Merge Sysdumps
if: ${{ needs.setup-and-test.result == 'failure' }}
uses: actions/upload-artifact/merge@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
with:
name: cilium-sysdumps
pattern: cilium-sysdumps-*
retention-days: 5
delete-merged: true
continue-on-error: true
- name: Merge JUnits
uses: actions/upload-artifact/merge@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
with:
name: cilium-junits
pattern: cilium-junits-*
retention-days: 5
delete-merged: true
commit-status-final:
if: ${{ always() }}
name: Commit Status Final
needs: setup-and-test
runs-on: ubuntu-latest
steps:
- name: Set final commit status
uses: myrotvorets/set-commit-status-action@3730c0a348a2ace3c110851bed53331bc6406e9f # v2.0.1
with:
sha: ${{ inputs.SHA || github.sha }}
status: ${{ needs.setup-and-test.result }}