From 4c18889857e12e4efb736386a6ce82b3517b34bc Mon Sep 17 00:00:00 2001 From: Dumitru Ceara Date: Wed, 27 Mar 2024 20:21:22 +0100 Subject: [PATCH 1/3] bringup-cluster: Remove tester pid limits. By default the container running ovn-tester could use only up to 2048 pids. With large-scale OVN-IC tests (e.g., 500 zones) the tester (through ovsdbapp) was failing to spawn enough threads to handle all the database connections. Remove the pid limit. NOTE: this is a bit of a band-aid because it doesn't fix the real bottleneck in such scenarios, i.e., the fact that the tester still keeps long-lived connections to all DBs. Signed-off-by: Dumitru Ceara --- ovn-fake-multinode-utils/playbooks/bringup-cluster.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ovn-fake-multinode-utils/playbooks/bringup-cluster.yml b/ovn-fake-multinode-utils/playbooks/bringup-cluster.yml index c9c6186d..da1e23e6 100644 --- a/ovn-fake-multinode-utils/playbooks/bringup-cluster.yml +++ b/ovn-fake-multinode-utils/playbooks/bringup-cluster.yml @@ -4,7 +4,7 @@ - name: Start tester container ansible.builtin.shell: | podman run -dt --name=ovn-tester --hostname=ovn-tester \ - --privileged ovn/ovn-tester + --pids-limit -1 --privileged ovn/ovn-tester - name: Add tester container interfaces to OVS bridges environment: From 334cf200c64d8aa392ca5cb5bc7047916563d1ca Mon Sep 17 00:00:00 2001 From: Dumitru Ceara Date: Wed, 27 Mar 2024 20:25:17 +0100 Subject: [PATCH 2/3] translate_yaml: Change external, gw and ts default subnets to avoid overlaps. With IC deployments these defaults are used as the base for computing the subnet (external, gw and ts) that should be used by each zone. With larger-scale tests, e.g., 500 nodes, when using default values we would get overlapping IPs between external and gw subnets. 
Signed-off-by: Dumitru Ceara --- ovn-fake-multinode-utils/translate_yaml.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ovn-fake-multinode-utils/translate_yaml.py b/ovn-fake-multinode-utils/translate_yaml.py index 1b3f2d19..abb50339 100755 --- a/ovn-fake-multinode-utils/translate_yaml.py +++ b/ovn-fake-multinode-utils/translate_yaml.py @@ -102,12 +102,12 @@ class ClusterConfig: node_timeout_s: int = 20 internal_net: str = "16.0.0.0/16" internal_net6: str = "16::/64" - external_net: str = "3.0.0.0/16" - external_net6: str = "3::/64" - gw_net: str = "2.0.0.0/16" - gw_net6: str = "2::/64" - ts_net: str = "30.0.0.0/16" - ts_net6: str = "30::/64" + external_net: str = "20.0.0.0/16" + external_net6: str = "20::/64" + gw_net: str = "30.0.0.0/16" + gw_net6: str = "30::/64" + ts_net: str = "40.0.0.0/16" + ts_net6: str = "40::/64" cluster_net: str = "16.0.0.0/4" cluster_net6: str = "16::/32" n_workers: int = 2 From 5787e81fec814ade93ba252e76b44b885dea1f7e Mon Sep 17 00:00:00 2001 From: Dumitru Ceara Date: Wed, 27 Mar 2024 20:31:11 +0100 Subject: [PATCH 3/3] ovn-kubernetes: Use a single, short-lived, IC NB connection. There's no need for a long-lived one. We use it to create the transit switch, once. 
Signed-off-by: Dumitru Ceara --- .../cms/ovn_kubernetes/tests/base_cluster_bringup.py | 12 +++++++++--- ovn-tester/ovn_workload.py | 10 +++------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/ovn-tester/cms/ovn_kubernetes/tests/base_cluster_bringup.py b/ovn-tester/cms/ovn_kubernetes/tests/base_cluster_bringup.py index 310fbe59..4f794497 100644 --- a/ovn-tester/cms/ovn_kubernetes/tests/base_cluster_bringup.py +++ b/ovn-tester/cms/ovn_kubernetes/tests/base_cluster_bringup.py @@ -1,7 +1,7 @@ from collections import namedtuple from randmac import RandMac -from ovn_utils import LSwitch +from ovn_utils import LSwitch, OvnIcNbctl from ovn_context import Context from ovn_ext_cmd import ExtCmd @@ -18,8 +18,14 @@ def __init__(self, config, clusters, global_cfg): self.ic_cluster = clusters[0] if len(clusters) > 1 else None def create_transit_switch(self): - if self.ic_cluster: - self.ic_cluster.icnbctl.ts_add() + if self.ic_cluster is None: + return + + inactivity_probe = ( + self.ic_cluster.cluster_cfg.db_inactivity_probe // 1000 + ) + ic_remote = f'tcp:{self.ic_cluster.cluster_cfg.node_net.ip + 2}:6645' + OvnIcNbctl(None, ic_remote, inactivity_probe).ts_add() def connect_transit_switch(self, cluster): if self.ic_cluster is None: diff --git a/ovn-tester/ovn_workload.py b/ovn-tester/ovn_workload.py index 352e4002..60bd61a1 100644 --- a/ovn-tester/ovn_workload.py +++ b/ovn-tester/ovn_workload.py @@ -303,7 +303,6 @@ def __init__( self.brex_cfg = brex_cfg self.nbctl: Optional[ovn_utils.OvnNbctl] = None self.sbctl: Optional[ovn_utils.OvnSbctl] = None - self.icnbctl: Optional[ovn_utils.OvnIcNbctl] = None self.az = az protocol = "ssl" if cluster_cfg.enable_ssl else "tcp" @@ -364,12 +363,9 @@ def start(self): self.central_nodes[0], sb_conn, inactivity_probe ) - # ovn-ic configuration - self.icnbctl = ovn_utils.OvnIcNbctl( - None, - f'tcp:{self.cluster_cfg.node_net.ip + 2}:6645', - inactivity_probe, - ) + # ovn-ic configuration: enable route learning/advertising to 
allow + # automatic pinging between cluster_net subnets in different AZs. + # This is required for IC connectivity checks. self.nbctl.set_global('ic-route-learn', 'true') self.nbctl.set_global('ic-route-adv', 'true')