From 249dc4d9eba597175ca3d6126a3f9b3be43964d7 Mon Sep 17 00:00:00 2001
From: yanliang567 <82361606+yanliang567@users.noreply.github.com>
Date: Tue, 20 Aug 2024 14:20:56 +0800
Subject: [PATCH] test: Add tests for upsert with auto id (#35556)

Related issue: #34668

---------

Signed-off-by: yanliang567 <yanliang.qiao@zilliz.com>
---
 .../python_client/base/collection_wrapper.py  |   4 +-
 tests/python_client/requirements.txt          |   4 +-
 tests/python_client/testcases/test_insert.py  | 105 +++++++++++++-----
 tests/python_client/testcases/test_utility.py |   4 +-
 4 files changed, 81 insertions(+), 36 deletions(-)

diff --git a/tests/python_client/base/collection_wrapper.py b/tests/python_client/base/collection_wrapper.py
index 2bb9fcb82abe1..2fae11cbec0df 100644
--- a/tests/python_client/base/collection_wrapper.py
+++ b/tests/python_client/base/collection_wrapper.py
@@ -339,10 +339,10 @@ def upsert(self, data, partition_name=None, timeout=None, check_task=None, check
         return res, check_result
 
     @trace()
-    def compact(self, timeout=None, check_task=None, check_items=None, **kwargs):
+    def compact(self, is_clustering=False, timeout=None, check_task=None, check_items=None, **kwargs):
         timeout = TIMEOUT if timeout is None else timeout
         func_name = sys._getframe().f_code.co_name
-        res, check = api_request([self.collection.compact, timeout], **kwargs)
+        res, check = api_request([self.collection.compact, is_clustering, timeout], **kwargs)
         check_result = ResponseChecker(res, func_name, check_task, check_items, check, **kwargs).run()
         return res, check_result
 
diff --git a/tests/python_client/requirements.txt b/tests/python_client/requirements.txt
index 928f09bc0e489..523983569cdaa 100644
--- a/tests/python_client/requirements.txt
+++ b/tests/python_client/requirements.txt
@@ -12,8 +12,8 @@ allure-pytest==2.7.0
 pytest-print==0.2.1
 pytest-level==0.1.1
 pytest-xdist==2.5.0
-pymilvus==2.5.0rc45
-pymilvus[bulk_writer]==2.5.0rc45
+pymilvus==2.5.0rc70
+pymilvus[bulk_writer]==2.5.0rc70
 pytest-rerunfailures==9.1.1
 git+https://github.com/Projectplace/pytest-tags
 ndg-httpsclient
diff --git a/tests/python_client/testcases/test_insert.py b/tests/python_client/testcases/test_insert.py
index 04bae701a4aa3..ea44b0664b89c 100644
--- a/tests/python_client/testcases/test_insert.py
+++ b/tests/python_client/testcases/test_insert.py
@@ -513,7 +513,7 @@ def test_insert_exceed_varchar_limit(self):
         data = [vectors, ["limit_1___________",
                           "limit_2___________"], ['1', '2']]
         error = {ct.err_code: 999,
-                 ct.err_msg: "invalid input, length of string exceeds max length"}
+                 ct.err_msg: "length of string exceeds max length"}
         collection_w.insert(
             data, check_task=CheckTasks.err_res, check_items=error)
 
@@ -815,16 +815,6 @@ def insert(thread_i):
             t.join()
         assert collection_w.num_entities == ct.default_nb * thread_num
 
-    @pytest.mark.tags(CaseLabel.L2)
-    @pytest.mark.skip(reason="Currently primary keys are not unique")
-    def test_insert_multi_threading_auto_id(self):
-        """
-        target: test concurrent insert auto_id=True collection
-        method: 1.create auto_id=True collection 2.concurrent insert
-        expected: verify primary keys unique
-        """
-        pass
-
     @pytest.mark.tags(CaseLabel.L1)
     def test_insert_multi_times(self, dim):
         """
@@ -1211,11 +1201,11 @@ def test_insert_with_invalid_partition_name(self):
                                               check_items=error)
 
     @pytest.mark.tags(CaseLabel.L2)
-    def test_insert_invalid_with_pk_varchar_auto_id_true(self):
+    def test_insert_with_pk_varchar_auto_id_true(self):
         """
         target: test insert invalid with pk varchar and auto id true
         method: set pk varchar max length < 18, insert data
-        expected: raise exception
+        expected: varchar pk supports auto_id=true
         """
         string_field = cf.gen_string_field(is_primary=True, max_length=6)
         embedding_field = cf.gen_float_vec_field()
@@ -1547,8 +1537,56 @@ def test_upsert_data_pk_exist(self, start):
         res = collection_w.query(exp, output_fields=[default_float_name])[0]
         assert [res[i][default_float_name] for i in range(upsert_nb)] == float_values.to_list()
 
-    @pytest.mark.tags(CaseLabel.L2)
-    def test_upsert_with_primary_key_string(self):
+    @pytest.mark.tags(CaseLabel.L0)
+    def test_upsert_with_auto_id(self):
+        """
+        target: test upsert with auto id
+        method: 1. create a collection with autoID=true
+                2. upsert 10 entities with non-existing pks
+                verify: success, and the pks are auto-generated
+                3. query 10 entities to get the existing pks
+                4. upsert 10 entities with existing pks
+                verify: success, and the pks are re-generated, and the new pks are visible
+        """
+        dim = 32
+        collection_w, _, _, insert_ids, _ = self.init_collection_general(pre_upsert, auto_id=True,
+                                                                         dim=dim, insert_data=True, with_json=False)
+        nb = 10
+        start = ct.default_nb * 10
+        data = cf.gen_default_list_data(dim=dim, nb=nb, start=start, with_json=False)
+        res_upsert1 = collection_w.upsert(data=data)[0]
+        collection_w.flush()
+        # assert the pks are auto-generated, and num_entities increased for upsert with non_existing pks
+        assert res_upsert1.primary_keys[0] > insert_ids[-1]
+        assert collection_w.num_entities == ct.default_nb + nb
+
+        # query 10 entities to get the existing pks
+        res_q = collection_w.query(expr='', limit=nb)[0]
+        print(f"res_q: {res_q}")
+        existing_pks = [res_q[i][ct.default_int64_field_name] for i in range(nb)]
+        existing_count = collection_w.query(expr=f"{ct.default_int64_field_name} in {existing_pks}",
+                                            output_fields=[ct.default_count_output])[0]
+        assert nb == existing_count[0].get(ct.default_count_output)
+        # upsert 10 entities with the existing pks
+        start = ct.default_nb * 20
+        data = cf.gen_default_list_data(dim=dim, nb=nb, start=start, with_json=False)
+        data[0] = existing_pks
+        res_upsert2 = collection_w.upsert(data=data)[0]
+        collection_w.flush()
+        # assert the new pks are auto-generated again
+        assert res_upsert2.primary_keys[0] > res_upsert1.primary_keys[-1]
+        existing_count = collection_w.query(expr=f"{ct.default_int64_field_name} in {existing_pks}",
+                                            output_fields=[ct.default_count_output])[0]
+        assert 0 == existing_count[0].get(ct.default_count_output)
+        res_q = collection_w.query(expr=f"{ct.default_int64_field_name} in {res_upsert2.primary_keys}",
+                                   output_fields=["*"])[0]
+        assert nb == len(res_q)
+        current_count = collection_w.query(expr='', output_fields=[ct.default_count_output])[0]
+        assert current_count[0].get(ct.default_count_output) == ct.default_nb + nb
+
+    @pytest.mark.tags(CaseLabel.L1)
+    @pytest.mark.parametrize("auto_id", [True, False])
+    def test_upsert_with_primary_key_string(self, auto_id):
         """
         target: test upsert with string primary key
         method: 1. create a collection with pk string
@@ -1558,11 +1596,18 @@ def test_upsert_with_primary_key_string(self):
         """
         c_name = cf.gen_unique_str(pre_upsert)
         fields = [cf.gen_string_field(), cf.gen_float_vec_field(dim=ct.default_dim)]
-        schema = cf.gen_collection_schema(fields=fields, primary_field=ct.default_string_field_name)
+        schema = cf.gen_collection_schema(fields=fields, primary_field=ct.default_string_field_name,
+                                          auto_id=auto_id)
         collection_w = self.init_collection_wrap(name=c_name, schema=schema)
         vectors = [[random.random() for _ in range(ct.default_dim)] for _ in range(2)]
-        collection_w.insert([["a", "b"], vectors])
-        collection_w.upsert([[" a", "b  "], vectors])
+        if not auto_id:
+            collection_w.insert([["a", "b"], vectors])
+            res_upsert = collection_w.upsert([[" a", "b  "], vectors])[0]
+            assert res_upsert.primary_keys[0] == " a" and res_upsert.primary_keys[1] == "b  "
+        else:
+            collection_w.insert([vectors])
+            res_upsert = collection_w.upsert([[" a", "b  "], vectors])[0]
+            assert res_upsert.primary_keys[0] != " a" and res_upsert.primary_keys[1] != "b  "
         assert collection_w.num_entities == 4
 
     @pytest.mark.tags(CaseLabel.L2)
@@ -2046,7 +2091,7 @@ def test_upsert_partition_name_nonexistent(self):
                             check_task=CheckTasks.err_res, check_items=error)
 
     @pytest.mark.tags(CaseLabel.L2)
-    @pytest.mark.skip("insert and upsert have removed the [] error check")
+    @pytest.mark.xfail(reason="insert and upsert have removed the [] error check")
     def test_upsert_multi_partitions(self):
         """
         target: test upsert two partitions
@@ -2066,20 +2111,20 @@ def test_upsert_multi_partitions(self):
                             check_task=CheckTasks.err_res, check_items=error)
 
     @pytest.mark.tags(CaseLabel.L2)
-    @pytest.mark.skip(reason="smellthemoon: behavior changed")
-    def test_upsert_with_auto_id(self):
+    def test_upsert_with_auto_id_pk_type_dismacth(self):
         """
-        target: test upsert with auto id
-        method: 1. create a collection with autoID=true
-                2. upsert data no pk
+        target: test upsert with auto_id and pk type mismatch
+        method: 1. create a collection with pk int64 (NOTE(review): code below passes auto_id=False, not True - confirm)
+                2. upsert with string pks, which mismatch the int64 pk type
         expected: raise exception
         """
-        collection_w = self.init_collection_general(pre_upsert, auto_id=True, is_index=False)[0]
-        error = {ct.err_code: 999,
-                 ct.err_msg: "Upsert don't support autoid == true"}
-        float_vec_values = cf.gen_vectors(ct.default_nb, ct.default_dim)
-        data = [[np.float32(i) for i in range(ct.default_nb)], [str(i) for i in range(ct.default_nb)],
-                float_vec_values]
+        dim = 16
+        collection_w = self.init_collection_general(pre_upsert, auto_id=False,
+                                                    dim=dim, insert_data=True, with_json=False)[0]
+        nb = 10
+        data = cf.gen_default_list_data(dim=dim, nb=nb, with_json=False)
+        data[0] = [str(i) for i in range(nb)]
+        error = {ct.err_code: 999, ct.err_msg: "The Input data type is inconsistent with defined schema"}
         collection_w.upsert(data=data, check_task=CheckTasks.err_res, check_items=error)
 
     @pytest.mark.tags(CaseLabel.L2)
diff --git a/tests/python_client/testcases/test_utility.py b/tests/python_client/testcases/test_utility.py
index f4eccf19597cc..ee578b0d8efeb 100644
--- a/tests/python_client/testcases/test_utility.py
+++ b/tests/python_client/testcases/test_utility.py
@@ -731,7 +731,7 @@ def test_index_process_collection_empty(self):
         cw = self.init_collection_wrap(name=c_name)
         self.index_wrap.init_index(cw.collection, default_field_name, default_index_params)
         res, _ = self.utility_wrap.index_building_progress(c_name)
-        exp_res = {'total_rows': 0, 'indexed_rows': 0, 'pending_index_rows': 0}
+        exp_res = {'total_rows': 0, 'indexed_rows': 0, 'pending_index_rows': 0, 'state': 'Finished'}
         assert res == exp_res
 
     @pytest.mark.tags(CaseLabel.L2)
@@ -822,7 +822,7 @@ def test_wait_index_collection_empty(self):
         cw.create_index(default_field_name, default_index_params)
         assert self.utility_wrap.wait_for_index_building_complete(c_name)[0]
         res, _ = self.utility_wrap.index_building_progress(c_name)
-        exp_res = {'total_rows': 0, 'indexed_rows': 0, 'pending_index_rows': 0}
+        exp_res = {'total_rows': 0, 'indexed_rows': 0, 'pending_index_rows': 0, 'state': 'Finished'}
         assert res == exp_res
 
     @pytest.mark.tags(CaseLabel.L1)