Skip to content

Commit

Permalink
Address comments
Browse files Browse the repository at this point in the history
Signed-off-by: Wenqi Qiu <[email protected]>
  • Loading branch information
wenqiq committed Jun 5, 2024
1 parent 55aa720 commit c0f69af
Show file tree
Hide file tree
Showing 13 changed files with 88 additions and 73 deletions.
2 changes: 2 additions & 0 deletions ci/jenkins/jobs/macros.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -310,3 +310,5 @@
#!/bin/bash
set -e
./ci/jenkins/stop-stale-jobs.sh --pull-request "${ghprbPullId}" --jenkins "${JENKINS_URL}"
#
1 change: 1 addition & 0 deletions ci/jenkins/jobs/projects-lab.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1485,3 +1485,4 @@
timeout: 135
type: absolute
publishers: []
#
58 changes: 16 additions & 42 deletions ci/jenkins/test-scale.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env bash

# Copyright 2023 Antrea Authors
# Copyright 2024 Antrea Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -27,18 +27,11 @@ WORKDIR=$DEFAULT_WORKDIR
KUBECONFIG_PATH=$DEFAULT_KUBECONFIG_PATH
TESTCASE=""
TEST_FAILURE=false
MODE="report"
DOCKER_REGISTRY=$(head -n1 "${WORKSPACE}/ci/docker-registry")
TESTBED_TYPE="legacy"
GO_VERSION=$(head -n1 "${WORKSPACE}/build/images/deps/go-version")
IMAGE_PULL_POLICY="Always"
DEFAULT_IP_MODE="ipv4"
IP_MODE=""
GOLANG_RELEASE_DIR=${WORKDIR}/golang-releases

CONFORMANCE_SKIP="\[Slow\]|\[Serial\]|\[Disruptive\]|\[Flaky\]|\[Feature:.+\]|\[sig-cli\]|\[sig-storage\]|\[sig-auth\]|\[sig-api-machinery\]|\[sig-apps\]|\[sig-node\]"
NETWORKPOLICY_SKIP="NetworkPolicyLegacy|should allow egress access to server in CIDR block|should enforce except clause while egress access to server in CIDR block"

CONTROL_PLANE_NODE_ROLE="master|control-plane"

CLEAN_STALE_IMAGES="docker system prune --force --all --filter until=4h"
Expand All @@ -53,9 +46,6 @@ Run K8s e2e community tests (Conformance & Network Policy) or Antrea e2e tests o
--kubeconfig Path of cluster kubeconfig.
--workdir Home path for Go, vSphere information and antrea_logs during cluster setup. Default is $WORKDIR.
--testcase Windows install OVS, Conformance and Network Policy or Antrea e2e testcases on a Windows or Linux cluster. It can also be flexible ipam or multicast e2e test.
--registry The docker registry to use instead of dockerhub.
--testbed-type The testbed type to run tests. It can be flexible-ipam, jumper or legacy."
function print_usage {
echoerr "$_usage"
Expand All @@ -82,18 +72,6 @@ case $key in
WORKDIR="$2"
shift 2
;;
--testcase)
TESTCASE="$2"
shift 2
;;
--registry)
DOCKER_REGISTRY="$2"
shift 2
;;
--testbed-type)
TESTBED_TYPE="$2"
shift 2
;;
-h|--help)
print_usage
exit 0
Expand All @@ -118,8 +96,6 @@ if [[ "$DOCKER_REGISTRY" != "" ]]; then
fi
export NO_PULL
E2ETEST_PATH=${WORKDIR}/kubernetes/_output/dockerized/bin/linux/amd64/e2e.test

function export_govc_env_var {
env_govc="${WORKDIR}/govc.env"
if [ -f "$env_govc" ]; then
Expand All @@ -136,15 +112,7 @@ function export_govc_env_var {
function clean_antrea {
echo "====== Cleanup Antrea Installation ======"
clean_ns "monitoring"
clean_ns "antrea-ipam-test"
clean_ns "antrea-test"
echo "====== Cleanup Conformance Namespaces ======"
clean_ns "net"
clean_ns "service"
clean_ns "x-"
clean_ns "y-"
clean_ns "z-"
clean_ns "antrea-scale-ns"
# Delete antrea-prometheus first for k8s>=1.22 to avoid Pod stuck in Terminating state.
kubectl delete -f ${WORKDIR}/antrea-prometheus.yml --ignore-not-found=true || true
Expand All @@ -158,7 +126,7 @@ function clean_antrea {
function clean_ns {
ns=$1
matching_ns=$(kubectl get ns | awk -v ns="${ns}" '$1 ~ ns {print $1}')
if [ -n "${matching_ns}" ]; then
echo "${matching_ns}" | while read -r ns_name; do
kubectl get pod -n "${ns_name}" --no-headers=true | awk '{print $1}' | while read pod_name; do
Expand Down Expand Up @@ -201,10 +169,7 @@ function deliver_antrea_scale {
make clean
${CLEAN_STALE_IMAGES}
${PRINT_DOCKER_STATUS}
if [[ ! "${TESTCASE}" =~ "e2e" && "${DOCKER_REGISTRY}" != "" ]]; then
docker pull "${DOCKER_REGISTRY}/antrea/sonobuoy-systemd-logs:v0.3"
docker tag "${DOCKER_REGISTRY}/antrea/sonobuoy-systemd-logs:v0.3" "sonobuoy/systemd-logs:v0.3"
fi
chmod -R g-w build/images/ovs
chmod -R g-w build/images/base
DOCKER_REGISTRY="${DOCKER_REGISTRY}" ./hack/build-antrea-linux-all.sh --pull
Expand Down Expand Up @@ -235,7 +200,7 @@ function deliver_antrea_scale {

control_plane_ip="$(kubectl get nodes -o wide --no-headers=true | awk -v role="$CONTROL_PLANE_NODE_ROLE" '$3 ~ role {print $6}')"
scp -q -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -i "${WORKDIR}/.ssh/id_rsa" build/yamls/*.yml jenkins@[${control_plane_ip}]:${WORKDIR}/

cp -f build/yamls/*.yml $WORKDIR

echo "====== Delivering Antrea Simulators to all Nodes ======"
Expand Down Expand Up @@ -275,12 +240,19 @@ function generate_ssh_config {
}

function prepare_scale_simulator {
## Try best to clean up old config.
kubectl delete -f "${WORKSPACE}/build/yamls/antrea-agent-simulator.yml" || true
kubectl delete secret kubeconfig || true

# Create simulators.
kubectl taint -l 'antrea/instance=simulator' node mocknode=true:NoExecute
kubectl create secret generic kubeconfig --type=Opaque --namespace=kube-system --from-file=${WORKDIR}/.kube
kubectl create secret generic kubeconfig --type=Opaque --namespace=kube-system --from-file=admin.conf=${WORKDIR}/.kube

kubectl apply -f "${WORKSPACE}/build/yamls/antrea-agent-simulator.yml"
}

function run_scale_test {
echo "====== Running Antrea E2E Tests ======"
echo "====== Running Antrea Scale Tests ======"
export GO111MODULE=on
export GOPATH=${WORKDIR}/go
export GOROOT=${GOLANG_RELEASE_DIR}/go
Expand All @@ -293,6 +265,8 @@ function run_scale_test {
generate_ssh_config

set +e
ls ${WORKSPACE}
make bin
${WORKSPACE}/bin/antrea-scale --config ./test/performance/scale.yml
set -e

Expand Down
2 changes: 1 addition & 1 deletion cmd/antrea-scale/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ func run() error {
}
for i := 0; i < tc.RepeatTimes; i++ {
klog.InfoS("Start test", "Case", tc, "Repeat", i+1)
tCase, err := framework.NewScaleTestCase(tc.Name, false)
tCase, err := framework.NewScaleTestCase(tc.Name)
if err != nil {
return err
}
Expand Down
60 changes: 50 additions & 10 deletions test/performance/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,26 @@ CNI within Kubernetes clusters.

The following sections detail these features and explain how to use the tool.

## 1. Scalability
```shell
Usage of ./bin/antrea-scale:
--config string Config file of scale test cases list (default "test/performance/scale.yml")
--kubeConfigPath string Path of cluster kubeconfig
--templateFilesPath string Template YAML files path of test cases (default "test/performance/assets")
--timeout int Timeout limit (minutes) of the whole scale test (default 10)
--v string (default "2")
```

## Scalability

Whether the cluster has a single node or hundreds of nodes, you can run the tests with just
a single command.

```shell
make bin
make antrea-scale
./bin/antrea-scale --kubeConfigPath=/root/.kube/config --timeout=120 --config=./test/performance/scale.yml
```

## 2. Configurability
## Configurability

You can easily configure the test parameters to meet various testing scenarios through a YAML file.

Expand Down Expand Up @@ -46,7 +55,7 @@ scales:
repeat_times: 1
```
## 3. Flexibility & Assemblability
## Flexibility & Assemblability
The configuration file can be divided into two parts:
the first part controls the scale of the test,
Expand Down Expand Up @@ -76,11 +85,11 @@ scales:
```
For example, if we want to test whether the startup speed of the agent is affected after creating
NetworkPolicy on a large scale, we can place "ScaleNP" under "ScaleAgent."
NetworkPolicy on a large-scale cluster, we can place "ScaleNetworkPolicy" after "ScaleRestartAgent".
Also, we can control the number of tests by setting the "repeat" parameter.
Also, we can control the number of tests by setting the `repeat_times` field.

## 4. Efficient resource utilization(Antrea simulator agent)
## Efficient resource utilization (Antrea simulator agent)

A significant concern is that large-scale testing requires a vast amount of cluster resources.
We can use simulated agents to conduct tests to save resources and achieve the goal of scaling
Expand All @@ -92,22 +101,53 @@ The simulator can watch the Antrea controller just like the real agent does, and
able to simulate a large number of agents in a smaller number of nodes. It is useful for Antrea
scalability testing, without having to create a very large cluster.

## 5. Multiple platforms(Different CNIs)
## Multiple platforms (different CNIs)

Additionally, for some common functional features, it's essential to compare the performance
differences between different CNIs to understand our shortcomings or advantages.
The scale test tool can run tests on different Kubernetes platforms and compare the performance
metrics of different CNIs.

## 6. Measure and monitoring
For example, if you want to test the scale performance of other CNIs with the following cases:

```yaml
scales:
- name: "ScaleUpWorkloadPods"
package: "test/performance/framework"
repeat_times: 1
- name: "ScaleService"
package: "test/performance/framework"
repeat_times: 1
- name: "ScaleNetworkPolicy"
package: "test/performance/framework"
repeat_times: 1
```

Note that you should deploy the Kubernetes cluster and the CNI first, and then execute the test tool.

## Measurement and monitoring

The Antrea scale test tool also integrates monitoring tools, with Prometheus and Grafana,
it's easy to view metrics such as CPU/Memory usage and the number of Pods/Networks during the
testing process.

![img.png](img.png)

## 7. Expanding test cases
### Deploy monitoring

With a single step — executing one script — you can enable the monitoring system, making it
easy to observe resource consumption and other relevant metrics on each node in the cluster
during the testing phase.

```shell
cd test/performance/monitoring
./deploy.sh --kubeconfig path/to/.kube/config
```

Then you can access Grafana via `http://$ip:3100` with `admin/admin`.
For more information, please refer to `./deploy.sh --help`.

## Expanding test cases

Regarding the extensibility of the testing framework, we've designed a model in which test data is
separated from the framework. We expect the scale test tool to make it easy to add test cases
Expand Down
4 changes: 2 additions & 2 deletions test/performance/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,15 +38,15 @@ type ScaleConfiguration struct {
NamespaceNum int `yaml:"namespace_num"` // NamespaceNum specifies the number of Namespaces to create.
}

type Scale struct {
type ScaleTestCase struct {
Name string `yaml:"name"`
Package string `yaml:"package"`
RepeatTimes int `yaml:"repeat_times"`
}

type ScaleList struct {
ScaleConfiguration `yaml:",inline"`
Scales []Scale `yaml:"scales"`
Scales []ScaleTestCase `yaml:"scales"`
}

func ParseConfigs(configPath string) (*ScaleList, error) {
Expand Down
10 changes: 5 additions & 5 deletions test/performance/framework/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,13 +146,13 @@ func RestartController(ctx context.Context, ch chan time.Duration, data *ScaleDa
return
}

startTime0 := time.Now().UnixNano()
timeBeforePodDeletion := time.Now().UnixNano()
err = data.kubernetesClientSet.CoreV1().Pods(metav1.NamespaceSystem).DeleteCollection(ctx, metav1.DeleteOptions{}, metav1.ListOptions{LabelSelector: "app=antrea,component=antrea-controller"})
if err != nil {
return
}
startTime := time.Now().UnixNano()
klog.InfoS("Deleting operate time", "Duration(ms)", (startTime-startTime0)/1000000)
timeAfterPodDeletion := time.Now().UnixNano()
klog.V(2).InfoS("Deleting operation time", "Duration(ms)", (timeAfterPodDeletion-timeBeforePodDeletion)/1000000)

err = wait.PollUntilContextTimeout(ctx, config.WaitInterval, config.DefaultTimeout, true, func(ctx context.Context) (bool, error) {
var dp *appv1.Deployment
Expand Down Expand Up @@ -180,9 +180,9 @@ func RestartController(ctx context.Context, ch chan time.Duration, data *ScaleDa
for _, pod := range podList.Items {
if pod.Spec.NodeName == controllerPod.Spec.NodeName {
key := "down to up"
downToUpErr := utils.FetchTimestampFromLog(ctx, data.kubernetesClientSet, pod.Namespace, pod.Name, clientpod.ScaleControllerProbeContainerName, ch, startTime, key)
downToUpErr := utils.FetchTimestampFromLog(ctx, data.kubernetesClientSet, pod.Namespace, pod.Name, clientpod.ScaleControllerProbeContainerName, ch, timeAfterPodDeletion, key)
key = "unknown to up"
unknownToUpErr := utils.FetchTimestampFromLog(ctx, data.kubernetesClientSet, pod.Namespace, pod.Name, clientpod.ScaleControllerProbeContainerName, ch, startTime, key)
unknownToUpErr := utils.FetchTimestampFromLog(ctx, data.kubernetesClientSet, pod.Namespace, pod.Name, clientpod.ScaleControllerProbeContainerName, ch, timeAfterPodDeletion, key)
if downToUpErr != nil && unknownToUpErr != nil {
klog.ErrorS(err, "Checking antrea controller restart time error", "ClientPodName", pod.Name)
}
Expand Down
12 changes: 5 additions & 7 deletions test/performance/framework/case.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,20 +36,18 @@ func RegisterFunc(name string, runFunc RunFunc) {
}

type ScaleTestCase struct {
name string
run RunFunc
parallel bool
name string
run RunFunc
}

func NewScaleTestCase(name string, parallel bool) (*ScaleTestCase, error) {
func NewScaleTestCase(name string) (*ScaleTestCase, error) {
tCase, ok := cases[name]
if !ok {
return nil, fmt.Errorf("test func %s not registered", name)
}
return &ScaleTestCase{
name: name,
run: tCase,
parallel: parallel,
name: name,
run: tCase,
}, nil
}

Expand Down
4 changes: 2 additions & 2 deletions test/performance/framework/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ func ScaleService(ctx context.Context, ch chan time.Duration, data *ScaleData) (
var err error

var svcs []ServiceInfo
svcs, err = scaleUp(ctx, data, ch)
svcs, err = scaleUp(ctx, ch, data)
if err != nil {
res.err = fmt.Errorf("scale up services error: %v", err)
return
Expand Down Expand Up @@ -158,7 +158,7 @@ func renderServices(templatePath string, num int) (svcs []*corev1.Service, err e
return
}

func scaleUp(ctx context.Context, data *ScaleData, ch chan time.Duration) (svcs []ServiceInfo, err error) {
func scaleUp(ctx context.Context, ch chan time.Duration, data *ScaleData) (svcs []ServiceInfo, err error) {
provider := data.provider
controlPlaneNodeName := data.controlPlaneNodes[0]
cs := data.kubernetesClientSet
Expand Down
2 changes: 1 addition & 1 deletion test/performance/monitoring/grafana/grafana.yml
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ spec:
- 0
containers:
- name: grafana
image: grafana/grafana:latest
image: grafana/grafana:v11.0.0
imagePullPolicy: IfNotPresent
ports:
- containerPort: 3000
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ spec:
app: node-exporter
spec:
containers:
- image: quay.io/prometheus/node-exporter:latest
- image: quay.io/prometheus/node-exporter:v1.8.1
name: node-exporter
ports:
- containerPort: 9100
Expand Down
2 changes: 1 addition & 1 deletion test/performance/utils/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ func GenRandInt() int64 {
b := new(big.Int).SetInt64(int64(math.MaxInt64))
i, err := rand.Int(rand.Reader, b)
if err != nil {
return 0
return -1
}
return i.Int64()
}
Expand Down
2 changes: 1 addition & 1 deletion test/performance/utils/util_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,6 @@ import (
func TestGenRand(t *testing.T) {
for i := 0; i < 10; i++ {
num := GenRandInt() % 100
assert.Equal(t, true, 0 < num && num < 100, "gen random num error")
assert.Equal(t, true, 0 <= num && num < 100, "gen random num error")
}
}

0 comments on commit c0f69af

Please sign in to comment.