Skip to content

Commit

Permalink
Address comments
Browse files Browse the repository at this point in the history
Signed-off-by: Wenqi Qiu <[email protected]>
  • Loading branch information
wenqiq committed Jun 5, 2024
1 parent 55aa720 commit c0f69af
Show file tree
Hide file tree
Showing 13 changed files with 88 additions and 73 deletions.
2 changes: 2 additions & 0 deletions ci/jenkins/jobs/macros.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -310,3 +310,5 @@
#!/bin/bash
set -e
./ci/jenkins/stop-stale-jobs.sh --pull-request "${ghprbPullId}" --jenkins "${JENKINS_URL}"
#
1 change: 1 addition & 0 deletions ci/jenkins/jobs/projects-lab.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1485,3 +1485,4 @@
timeout: 135
type: absolute
publishers: []
#
58 changes: 16 additions & 42 deletions ci/jenkins/test-scale.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env bash

# Copyright 2023 Antrea Authors
# Copyright 2024 Antrea Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -27,18 +27,11 @@ WORKDIR=$DEFAULT_WORKDIR
KUBECONFIG_PATH=$DEFAULT_KUBECONFIG_PATH
TESTCASE=""
TEST_FAILURE=false
MODE="report"
DOCKER_REGISTRY=$(head -n1 "${WORKSPACE}/ci/docker-registry")
TESTBED_TYPE="legacy"
GO_VERSION=$(head -n1 "${WORKSPACE}/build/images/deps/go-version")
IMAGE_PULL_POLICY="Always"
DEFAULT_IP_MODE="ipv4"
IP_MODE=""
GOLANG_RELEASE_DIR=${WORKDIR}/golang-releases

CONFORMANCE_SKIP="\[Slow\]|\[Serial\]|\[Disruptive\]|\[Flaky\]|\[Feature:.+\]|\[sig-cli\]|\[sig-storage\]|\[sig-auth\]|\[sig-api-machinery\]|\[sig-apps\]|\[sig-node\]"
NETWORKPOLICY_SKIP="NetworkPolicyLegacy|should allow egress access to server in CIDR block|should enforce except clause while egress access to server in CIDR block"

CONTROL_PLANE_NODE_ROLE="master|control-plane"

CLEAN_STALE_IMAGES="docker system prune --force --all --filter until=4h"
Expand All @@ -53,9 +46,6 @@ Run K8s e2e community tests (Conformance & Network Policy) or Antrea e2e tests o
--kubeconfig Path of cluster kubeconfig.
--workdir Home path for Go, vSphere information and antrea_logs during cluster setup. Default is $WORKDIR.
--testcase Windows install OVS, Conformance and Network Policy or Antrea e2e testcases on a Windows or Linux cluster. It can also be flexible ipam or multicast e2e test.
--registry The docker registry to use instead of dockerhub.
--testbed-type The testbed type to run tests. It can be flexible-ipam, jumper or legacy."
function print_usage {
echoerr "$_usage"
Expand All @@ -82,18 +72,6 @@ case $key in
WORKDIR="$2"
shift 2
;;
--testcase)
TESTCASE="$2"
shift 2
;;
--registry)
DOCKER_REGISTRY="$2"
shift 2
;;
--testbed-type)
TESTBED_TYPE="$2"
shift 2
;;
-h|--help)
print_usage
exit 0
Expand All @@ -118,8 +96,6 @@ if [[ "$DOCKER_REGISTRY" != "" ]]; then
fi
export NO_PULL
E2ETEST_PATH=${WORKDIR}/kubernetes/_output/dockerized/bin/linux/amd64/e2e.test

function export_govc_env_var {
env_govc="${WORKDIR}/govc.env"
if [ -f "$env_govc" ]; then
Expand All @@ -136,15 +112,7 @@ function export_govc_env_var {
function clean_antrea {
echo "====== Cleanup Antrea Installation ======"
clean_ns "monitoring"
clean_ns "antrea-ipam-test"
clean_ns "antrea-test"
echo "====== Cleanup Conformance Namespaces ======"
clean_ns "net"
clean_ns "service"
clean_ns "x-"
clean_ns "y-"
clean_ns "z-"
clean_ns "antrea-scale-ns"
# Delete antrea-prometheus first for k8s>=1.22 to avoid Pod stuck in Terminating state.
kubectl delete -f ${WORKDIR}/antrea-prometheus.yml --ignore-not-found=true || true
Expand All @@ -158,7 +126,7 @@ function clean_antrea {
function clean_ns {
ns=$1
matching_ns=$(kubectl get ns | awk -v ns="${ns}" '$1 ~ ns {print $1}')
if [ -n "${matching_ns}" ]; then
echo "${matching_ns}" | while read -r ns_name; do
kubectl get pod -n "${ns_name}" --no-headers=true | awk '{print $1}' | while read pod_name; do
Expand Down Expand Up @@ -201,10 +169,7 @@ function deliver_antrea_scale {
make clean
${CLEAN_STALE_IMAGES}
${PRINT_DOCKER_STATUS}
if [[ ! "${TESTCASE}" =~ "e2e" && "${DOCKER_REGISTRY}" != "" ]]; then
docker pull "${DOCKER_REGISTRY}/antrea/sonobuoy-systemd-logs:v0.3"
docker tag "${DOCKER_REGISTRY}/antrea/sonobuoy-systemd-logs:v0.3" "sonobuoy/systemd-logs:v0.3"
fi
chmod -R g-w build/images/ovs
chmod -R g-w build/images/base
DOCKER_REGISTRY="${DOCKER_REGISTRY}" ./hack/build-antrea-linux-all.sh --pull
Expand Down Expand Up @@ -235,7 +200,7 @@ function deliver_antrea_scale {

control_plane_ip="$(kubectl get nodes -o wide --no-headers=true | awk -v role="$CONTROL_PLANE_NODE_ROLE" '$3 ~ role {print $6}')"
scp -q -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -i "${WORKDIR}/.ssh/id_rsa" build/yamls/*.yml jenkins@[${control_plane_ip}]:${WORKDIR}/

cp -f build/yamls/*.yml $WORKDIR

echo "====== Delivering Antrea Simulators to all Nodes ======"
Expand Down Expand Up @@ -275,12 +240,19 @@ function generate_ssh_config {
}

function prepare_scale_simulator {
## Try best to clean up old config.
kubectl delete -f "${WORKSPACE}/build/yamls/antrea-agent-simulator.yml" || true
kubectl delete secret kubeconfig || true

# Create simulators.
kubectl taint -l 'antrea/instance=simulator' node mocknode=true:NoExecute
kubectl create secret generic kubeconfig --type=Opaque --namespace=kube-system --from-file=${WORKDIR}/.kube
kubectl create secret generic kubeconfig --type=Opaque --namespace=kube-system --from-file=admin.conf=${WORKDIR}/.kube

kubectl apply -f "${WORKSPACE}/build/yamls/antrea-agent-simulator.yml"
}

function run_scale_test {
echo "====== Running Antrea E2E Tests ======"
echo "====== Running Antrea Scale Tests ======"
export GO111MODULE=on
export GOPATH=${WORKDIR}/go
export GOROOT=${GOLANG_RELEASE_DIR}/go
Expand All @@ -293,6 +265,8 @@ function run_scale_test {
generate_ssh_config

set +e
ls ${WORKSPACE}
make bin
${WORKSPACE}/bin/antrea-scale --config ./test/performance/scale.yml
set -e

Expand Down
2 changes: 1 addition & 1 deletion cmd/antrea-scale/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ func run() error {
}
for i := 0; i < tc.RepeatTimes; i++ {
klog.InfoS("Start test", "Case", tc, "Repeat", i+1)
tCase, err := framework.NewScaleTestCase(tc.Name, false)
tCase, err := framework.NewScaleTestCase(tc.Name)
if err != nil {
return err
}
Expand Down
60 changes: 50 additions & 10 deletions test/performance/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,26 @@ CNI within Kubernetes clusters.

The following sections detail these features and explain how to use the tool.

## 1. Scalability
```shell
Usage of ./bin/antrea-scale:
--config string Config file of scale test cases list (default "test/performance/scale.yml")
--kubeConfigPath string Path of cluster kubeconfig
--templateFilesPath string Template YAML files path of test cases (default "test/performance/assets")
--timeout int Timeout limit (minutes) of the whole scale test (default 10)
--v string (default "2")
```

## Scalability

Whether the cluster has a single node or hundreds of nodes, you can run the tests with just
a single command.

```shell
make bin
make antrea-scale
./bin/antrea-scale --kubeConfigPath=/root/.kube/config --timeout=120 --config=./test/performance/scale.yml
```

## 2. Configurability
## Configurability

You can easily configure the test parameters to meet various testing scenarios through a YAML file.

Expand Down Expand Up @@ -46,7 +55,7 @@ scales:
repeat_times: 1
```
## 3. Flexibility & Assemblability
## Flexibility & Assemblability
The configuration file can be divided into two parts:
the first part controls the scale of the test,
Expand Down Expand Up @@ -76,11 +85,11 @@ scales:
```
For example, if we want to test whether the startup speed of the agent is affected after creating
NetworkPolicy on a large scale, we can place "ScaleNP" under "ScaleAgent."
NetworkPolicy on a large-scale cluster, we can place "ScaleNetworkPolicy" after "ScaleRestartAgent".
Also, we can control the number of tests by setting the "repeat" parameter.
Also, we can control the number of tests by setting the `repeat_times` field.

## 4. Efficient resource utilization(Antrea simulator agent)
## Efficient resource utilization (Antrea simulator agent)

A significant concern is that large-scale testing requires a vast amount of cluster resources.
We can use simulated agents to conduct tests to save resources and achieve the goal of scaling
Expand All @@ -92,22 +101,53 @@ The simulator can watch the Antrea controller just like the real agent does, and
able to simulate a large number of agents in a smaller number of nodes. It is useful for Antrea
scalability testing, without having to create a very large cluster.

## 5. Multiple platforms(Different CNIs)
## Multiple platforms (different CNIs)

Additionally, for some common functional features, it's essential to compare the performance
differences between different CNIs to understand our shortcomings or advantages.
The scale test tool can run tests on different Kubernetes platforms and compare the performance
metrics of different CNIs.

## 6. Measure and monitoring
For example, if you want to test the scale performance of other CNIs with the following cases:

```yaml
scales:
- name: "ScaleUpWorkloadPods"
package: "test/performance/framework"
repeat_times: 1
- name: "ScaleService"
package: "test/performance/framework"
repeat_times: 1
- name: "ScaleNetworkPolicy"
package: "test/performance/framework"
repeat_times: 1
```

Note that you should deploy the Kubernetes cluster and the CNI first, and then execute the test tool.

## Measurement and monitoring

The Antrea scale test tool also integrates monitoring tools, with Prometheus and Grafana,
it's easy to view metrics such as CPU/Memory usage and the number of Pods/Networks during the
testing process.

![img.png](img.png)

## 7. Expanding test cases
### Deploy monitoring

With a single step — executing one script — you can enable the monitoring system, making it
easy to observe resource consumption and other relevant metrics on each node in the cluster
during the testing phase.

```shell
cd test/performance/monitoring
./deploy.sh --kubeconfig path/to/.kube/config
```

Then you can access Grafana via `http://$ip:3100` with `admin/admin`.
For more information, please refer to `./deploy.sh --help`.

## Expanding test cases

Regarding the extensibility of the testing framework, we've designed a model in which test data is
separated from the framework. We expect the scale test tool to make it easy to add test cases
Expand Down
4 changes: 2 additions & 2 deletions test/performance/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,15 +38,15 @@ type ScaleConfiguration struct {
NamespaceNum int `yaml:"namespace_num"` // NamespaceNum specifies the number of Namespaces to create.
}

type Scale struct {
type ScaleTestCase struct {
Name string `yaml:"name"`
Package string `yaml:"package"`
RepeatTimes int `yaml:"repeat_times"`
}

type ScaleList struct {
ScaleConfiguration `yaml:",inline"`
Scales []Scale `yaml:"scales"`
Scales []ScaleTestCase `yaml:"scales"`
}

func ParseConfigs(configPath string) (*ScaleList, error) {
Expand Down
10 changes: 5 additions & 5 deletions test/performance/framework/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,13 +146,13 @@ func RestartController(ctx context.Context, ch chan time.Duration, data *ScaleDa
return
}

startTime0 := time.Now().UnixNano()
timeBeforePodDeletion := time.Now().UnixNano()
err = data.kubernetesClientSet.CoreV1().Pods(metav1.NamespaceSystem).DeleteCollection(ctx, metav1.DeleteOptions{}, metav1.ListOptions{LabelSelector: "app=antrea,component=antrea-controller"})
if err != nil {
return
}
startTime := time.Now().UnixNano()
klog.InfoS("Deleting operate time", "Duration(ms)", (startTime-startTime0)/1000000)
timeAfterPodDeletion := time.Now().UnixNano()
klog.V(2).InfoS("Deleting operation time", "Duration(ms)", (timeAfterPodDeletion-timeBeforePodDeletion)/1000000)

err = wait.PollUntilContextTimeout(ctx, config.WaitInterval, config.DefaultTimeout, true, func(ctx context.Context) (bool, error) {
var dp *appv1.Deployment
Expand Down Expand Up @@ -180,9 +180,9 @@ func RestartController(ctx context.Context, ch chan time.Duration, data *ScaleDa
for _, pod := range podList.Items {
if pod.Spec.NodeName == controllerPod.Spec.NodeName {
key := "down to up"
downToUpErr := utils.FetchTimestampFromLog(ctx, data.kubernetesClientSet, pod.Namespace, pod.Name, clientpod.ScaleControllerProbeContainerName, ch, startTime, key)
downToUpErr := utils.FetchTimestampFromLog(ctx, data.kubernetesClientSet, pod.Namespace, pod.Name, clientpod.ScaleControllerProbeContainerName, ch, timeAfterPodDeletion, key)
key = "unknown to up"
unknownToUpErr := utils.FetchTimestampFromLog(ctx, data.kubernetesClientSet, pod.Namespace, pod.Name, clientpod.ScaleControllerProbeContainerName, ch, startTime, key)
unknownToUpErr := utils.FetchTimestampFromLog(ctx, data.kubernetesClientSet, pod.Namespace, pod.Name, clientpod.ScaleControllerProbeContainerName, ch, timeAfterPodDeletion, key)
if downToUpErr != nil && unknownToUpErr != nil {
klog.ErrorS(err, "Checking antrea controller restart time error", "ClientPodName", pod.Name)
}
Expand Down
12 changes: 5 additions & 7 deletions test/performance/framework/case.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,20 +36,18 @@ func RegisterFunc(name string, runFunc RunFunc) {
}

type ScaleTestCase struct {
name string
run RunFunc
parallel bool
name string
run RunFunc
}

func NewScaleTestCase(name string, parallel bool) (*ScaleTestCase, error) {
func NewScaleTestCase(name string) (*ScaleTestCase, error) {
tCase, ok := cases[name]
if !ok {
return nil, fmt.Errorf("test func %s not registered", name)
}
return &ScaleTestCase{
name: name,
run: tCase,
parallel: parallel,
name: name,
run: tCase,
}, nil
}

Expand Down
4 changes: 2 additions & 2 deletions test/performance/framework/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ func ScaleService(ctx context.Context, ch chan time.Duration, data *ScaleData) (
var err error

var svcs []ServiceInfo
svcs, err = scaleUp(ctx, data, ch)
svcs, err = scaleUp(ctx, ch, data)
if err != nil {
res.err = fmt.Errorf("scale up services error: %v", err)
return
Expand Down Expand Up @@ -158,7 +158,7 @@ func renderServices(templatePath string, num int) (svcs []*corev1.Service, err e
return
}

func scaleUp(ctx context.Context, data *ScaleData, ch chan time.Duration) (svcs []ServiceInfo, err error) {
func scaleUp(ctx context.Context, ch chan time.Duration, data *ScaleData) (svcs []ServiceInfo, err error) {
provider := data.provider
controlPlaneNodeName := data.controlPlaneNodes[0]
cs := data.kubernetesClientSet
Expand Down
2 changes: 1 addition & 1 deletion test/performance/monitoring/grafana/grafana.yml
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ spec:
- 0
containers:
- name: grafana
image: grafana/grafana:latest
image: grafana/grafana:v11.0.0
imagePullPolicy: IfNotPresent
ports:
- containerPort: 3000
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ spec:
app: node-exporter
spec:
containers:
- image: quay.io/prometheus/node-exporter:latest
- image: quay.io/prometheus/node-exporter:v1.8.1
name: node-exporter
ports:
- containerPort: 9100
Expand Down
2 changes: 1 addition & 1 deletion test/performance/utils/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ func GenRandInt() int64 {
b := new(big.Int).SetInt64(int64(math.MaxInt64))
i, err := rand.Int(rand.Reader, b)
if err != nil {
return 0
return -1
}
return i.Int64()
}
Expand Down
2 changes: 1 addition & 1 deletion test/performance/utils/util_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,6 @@ import (
func TestGenRand(t *testing.T) {
for i := 0; i < 10; i++ {
num := GenRandInt() % 100
assert.Equal(t, true, 0 < num && num < 100, "gen random num error")
assert.Equal(t, true, 0 <= num && num < 100, "gen random num error")
}
}

0 comments on commit c0f69af

Please sign in to comment.