From 319b4f41d472cb4b6bf68c595a5a901dcf389270 Mon Sep 17 00:00:00 2001 From: ThreadDao Date: Wed, 10 Apr 2024 19:56:31 +0800 Subject: [PATCH] Add test cases for sparse vector Signed-off-by: ThreadDao --- client/collection.go | 3 +- entity/rows.go | 4 + test/common/utils.go | 97 ++++++++++++++++++++- test/common/utils_client.go | 13 ++- test/testcases/collection_test.go | 44 ++++++++++ test/testcases/hybrid_search_test.go | 55 +++++++++++- test/testcases/index_test.go | 116 +++++++++++++++++++++++++ test/testcases/insert_test.go | 123 ++++++++++++++++++++++++++- test/testcases/main_test.go | 33 +++++-- test/testcases/option.go | 1 + test/testcases/query_test.go | 43 ++++++++++ test/testcases/search_test.go | 107 +++++++++++++++++++++++ 12 files changed, 622 insertions(+), 17 deletions(-) diff --git a/client/collection.go b/client/collection.go index 142aac915..98b0cb4be 100644 --- a/client/collection.go +++ b/client/collection.go @@ -218,7 +218,8 @@ func (c *GrpcClient) validateSchema(sch *entity.Schema) error { if field.DataType == entity.FieldTypeFloatVector || field.DataType == entity.FieldTypeBinaryVector || field.DataType == entity.FieldTypeBFloat16Vector || - field.DataType == entity.FieldTypeFloat16Vector { + field.DataType == entity.FieldTypeFloat16Vector || + field.DataType == entity.FieldTypeSparseVector { vectors++ } } diff --git a/entity/rows.go b/entity/rows.go index 33799d410..f4a7ff6ca 100644 --- a/entity/rows.go +++ b/entity/rows.go @@ -389,6 +389,10 @@ func AnyToColumns(rows []interface{}, schemas ...*Schema) ([]Column, error) { } col := NewColumnBFloat16Vector(field.Name, int(dim), data) nameColumns[field.Name] = col + case FieldTypeSparseVector: + data := make([]SparseEmbedding, 0, rowsLen) + col := NewColumnSparseVectors(field.Name, data) + nameColumns[field.Name] = col } } diff --git a/test/common/utils.go b/test/common/utils.go index 864cc7b5c..e048ae86b 100644 --- a/test/common/utils.go +++ b/test/common/utils.go @@ -31,6 +31,7 @@ const ( 
DefaultBinaryVecFieldName = "binaryVec" DefaultFloat16VecFieldName = "fp16Vec" DefaultBFloat16VecFieldName = "bf16Vec" + DefaultSparseVecFieldName = "sparseVec" DefaultDynamicNumberField = "dynamicNumber" DefaultDynamicStringField = "dynamicString" DefaultDynamicBoolField = "dynamicBool" @@ -220,6 +221,22 @@ func GenBinaryVector(dim int64) []byte { return vector } +func GenSparseVector(maxLen int) entity.SparseEmbedding { + length := 1 + rand.Intn(1+maxLen) + positions := make([]uint32, length) + values := make([]float32, length) + for i := 0; i < length; i++ { + //positions[i] = rand.Uint32() - 1 + positions[i] = uint32(i) + values[i] = rand.Float32() + } + vector, err := entity.NewSliceSparseEmbedding(positions, values) + if err != nil { + log.Fatalf("Generate vector failed %s", err) + } + return vector +} + // --- common utils --- // --- gen fields --- @@ -402,6 +419,13 @@ func GenColumnData(start int, nb int, fieldType entity.FieldType, fieldName stri bf16Vectors = append(bf16Vectors, vec) } return entity.NewColumnBFloat16Vector(fieldName, int(opt.dim), bf16Vectors) + case entity.FieldTypeSparseVector: + vectors := make([]entity.SparseEmbedding, 0, nb) + for i := start; i < start+nb; i++ { + vec := GenSparseVector(opt.maxLenSparse) + vectors = append(vectors, vec) + } + return entity.NewColumnSparseVectors(fieldName, vectors) default: return nil } @@ -981,6 +1005,53 @@ func GenDefaultArrayRows(start int, nb int, dim int64, enableDynamicField bool, return rows } +func GenDefaultSparseRows(start int, nb int, dim int64, maxLenSparse int, enableDynamicField bool) []interface{} { + rows := make([]interface{}, 0, nb) + type BaseRow struct { + Int64 int64 `json:"int64" milvus:"name:int64"` + Varchar string `json:"varchar" milvus:"name:varchar"` + FloatVec []float32 `json:"floatVec" milvus:"name:floatVec"` + SparseVec entity.SparseEmbedding `json:"sparseVec" milvus:"name:sparseVec"` + } + + type DynamicRow struct { + Int64 int64 `json:"int64" milvus:"name:int64"` + 
Varchar string `json:"varchar" milvus:"name:varchar"` + FloatVec []float32 `json:"floatVec" milvus:"name:floatVec"` + SparseVec entity.SparseEmbedding `json:"sparseVec" milvus:"name:sparseVec"` + Dynamic Dynamic `json:"dynamic" milvus:"name:dynamic"` + } + + for i := start; i < start+nb; i++ { + baseRow := BaseRow{ + Int64: int64(i), + Varchar: strconv.Itoa(i), + FloatVec: GenFloatVector(dim), + SparseVec: GenSparseVector(maxLenSparse), + } + // json and dynamic field + dynamicJSON := Dynamic{ + Number: int32(i), + String: strconv.Itoa(i), + Bool: i%2 == 0, + List: []int64{int64(i), int64(i + 1)}, + } + if enableDynamicField { + dynamicRow := DynamicRow{ + Int64: baseRow.Int64, + Varchar: baseRow.Varchar, + FloatVec: baseRow.FloatVec, + SparseVec: baseRow.SparseVec, + Dynamic: dynamicJSON, + } + rows = append(rows, dynamicRow) + } else { + rows = append(rows, &baseRow) + } + } + return rows +} + func GenAllVectorsRows(start int, nb int, dim int64, enableDynamicField bool) []interface{} { rows := make([]interface{}, 0, nb) type BaseRow struct { @@ -1231,11 +1302,28 @@ var SupportBinIvfFlatMetricType = []entity.MetricType{ entity.HAMMING, } +var UnsupportedSparseVecMetricsType = []entity.MetricType{ + entity.L2, + entity.COSINE, + entity.JACCARD, + entity.HAMMING, + entity.SUBSTRUCTURE, + entity.SUPERSTRUCTURE, +} + // GenAllFloatIndex gen all float vector index -func GenAllFloatIndex() []entity.Index { +func GenAllFloatIndex(metricTypes ...entity.MetricType) []entity.Index { nlist := 128 var allFloatIndex []entity.Index - for _, metricType := range SupportFloatMetricType { + var allMetricTypes []entity.MetricType + log.Println(metricTypes) + if len(metricTypes) == 0 { + allMetricTypes = SupportFloatMetricType + } else { + allMetricTypes = metricTypes + } + for _, metricType := range allMetricTypes { + log.Println(metricType) idxFlat, _ := entity.NewIndexFlat(metricType) idxIvfFlat, _ := entity.NewIndexIvfFlat(metricType, nlist) idxIvfSq8, _ := 
entity.NewIndexIvfSQ8(metricType, nlist) @@ -1277,6 +1365,11 @@ func GenSearchVectors(nq int, dim int64, dataType entity.FieldType) []entity.Vec vector := GenBFloat16Vector(dim) vectors = append(vectors, entity.BFloat16Vector(vector)) } + case entity.FieldTypeSparseVector: + for i := 0; i < nq; i++ { + vec := GenSparseVector(int(dim)) + vectors = append(vectors, vec) + } } return vectors } diff --git a/test/common/utils_client.go b/test/common/utils_client.go index da730684e..d81cd3156 100644 --- a/test/common/utils_client.go +++ b/test/common/utils_client.go @@ -114,9 +114,10 @@ func GenSchema(name string, autoID bool, fields []*entity.Field, opts ...CreateS // GenColumnDataOption -- create column data -- type GenColumnDataOption func(opt *genDataOpt) type genDataOpt struct { - dim int64 - ElementType entity.FieldType - capacity int64 + dim int64 + ElementType entity.FieldType + capacity int64 + maxLenSparse int } func WithVectorDim(dim int64) GenColumnDataOption { @@ -137,4 +138,10 @@ func WithArrayCapacity(capacity int64) GenColumnDataOption { } } +func WithSparseVectorLen(length int) GenColumnDataOption { + return func(opt *genDataOpt) { + opt.maxLenSparse = length + } +} + // -- create column data -- diff --git a/test/testcases/collection_test.go b/test/testcases/collection_test.go index 0a592564c..362a8c199 100644 --- a/test/testcases/collection_test.go +++ b/test/testcases/collection_test.go @@ -629,6 +629,50 @@ func TestCreateMultiVectorExceed(t *testing.T) { common.CheckErr(t, errCreateCollection, false, "maximum vector field's number should be limited to 4") } +// specify dim for sparse vector -> error +func TestCreateCollectionSparseVectorWithDim(t *testing.T) { + ctx := createContext(t, time.Second*common.DefaultTimeout) + mc := createMilvusClient(ctx, t) + allFields := []*entity.Field{ + common.GenField(common.DefaultIntFieldName, entity.FieldTypeInt64, common.WithIsPrimaryKey(true), common.WithAutoID(false)), + 
common.GenField(common.DefaultSparseVecFieldName, entity.FieldTypeSparseVector, common.WithDim(common.DefaultDim)), + } + collName := common.GenRandomString(6) + schema := common.GenSchema(collName, false, allFields) + + // create collection + errCreateCollection := mc.CreateCollection(ctx, schema, common.DefaultShards) + common.CheckErr(t, errCreateCollection, false, "dim should not be specified for sparse vector field sparseVec(0)") +} + +// create collection with sparse vector +func TestCreateCollectionSparseVector(t *testing.T) { + ctx := createContext(t, time.Second*common.DefaultTimeout) + mc := createMilvusClient(ctx, t) + allFields := []*entity.Field{ + common.GenField(common.DefaultIntFieldName, entity.FieldTypeInt64, common.WithIsPrimaryKey(true), common.WithAutoID(false)), + common.GenField(common.DefaultVarcharFieldName, entity.FieldTypeVarChar, common.WithMaxLength(common.TestMaxLen)), + common.GenField(common.DefaultSparseVecFieldName, entity.FieldTypeSparseVector), + } + collName := common.GenRandomString(6) + schema := common.GenSchema(collName, false, allFields) + + // create collection + errCreateCollection := mc.CreateCollection(ctx, schema, common.DefaultShards) + common.CheckErr(t, errCreateCollection, true) + + // describe collection + collection, err := mc.DescribeCollection(ctx, collName) + common.CheckErr(t, err, true) + common.CheckCollection(t, collection, collName, common.DefaultShards, schema, common.DefaultConsistencyLevel) + require.Len(t, collection.Schema.Fields, 3) + for _, field := range collection.Schema.Fields { + if field.DataType == entity.FieldTypeSparseVector { + require.Equal(t, common.DefaultSparseVecFieldName, field.Name) + } + } +} + // -- Get Collection Statistics -- func TestGetStaticsCollectionNotExisted(t *testing.T) { diff --git a/test/testcases/hybrid_search_test.go b/test/testcases/hybrid_search_test.go index 369892719..4452c0231 100644 --- a/test/testcases/hybrid_search_test.go +++ 
b/test/testcases/hybrid_search_test.go @@ -282,7 +282,7 @@ func TestHybridSearchMultiVectorsRangeSearch(t *testing.T) { queryVec2 := common.GenSearchVectors(1, common.DefaultDim, entity.FieldTypeFloat16Vector) // search with different reranker and offset - sp.AddRadius(10) + sp.AddRadius(20) sp.AddRangeFilter(0.01) for _, reranker := range []client.Reranker{ client.NewRRFReranker(), @@ -300,8 +300,59 @@ func TestHybridSearchMultiVectorsRangeSearch(t *testing.T) { for _, res := range resRange { for _, score := range res.Scores { require.GreaterOrEqual(t, score, float32(0.01)) - require.LessOrEqual(t, score, float32(5)) + require.LessOrEqual(t, score, float32(20)) } } } } + +func TestHybridSearchSparseVector(t *testing.T) { + t.Skip("https://github.com/milvus-io/milvus/pull/32177") + t.Parallel() + idxInverted := entity.NewGenericIndex(common.DefaultSparseVecFieldName, "SPARSE_INVERTED_INDEX", map[string]string{"drop_ratio_build": "0.2", "metric_type": "IP"}) + idxWand := entity.NewGenericIndex(common.DefaultSparseVecFieldName, "SPARSE_WAND", map[string]string{"drop_ratio_build": "0.3", "metric_type": "IP"}) + for _, idx := range []entity.Index{idxInverted, idxWand} { + ctx := createContext(t, time.Second*common.DefaultTimeout*2) + // connect + mc := createMilvusClient(ctx, t) + + // create -> insert [0, 3000) -> flush -> index -> load + cp := CollectionParams{CollectionFieldsType: Int64VarcharSparseVec, AutoID: false, EnableDynamicField: true, + ShardsNum: common.DefaultShards, Dim: common.DefaultDim, MaxLength: common.TestMaxLen} + + dp := DataParams{DoInsert: true, CollectionFieldsType: Int64VarcharSparseVec, start: 0, nb: common.DefaultNb * 3, + dim: common.DefaultDim, EnableDynamicField: true} + + // index params + idxHnsw, _ := entity.NewIndexHNSW(entity.L2, 8, 96) + ips := []IndexParams{ + {BuildIndex: true, Index: idx, FieldName: common.DefaultSparseVecFieldName, async: false}, + {BuildIndex: true, Index: idxHnsw, FieldName: common.DefaultFloatVecFieldName, 
async: false}, + } + collName := prepareCollection(ctx, t, mc, cp, WithDataParams(dp), WithIndexParams(ips), WithCreateOption(client.WithConsistencyLevel(entity.ClStrong))) + + // search + queryVec1 := common.GenSearchVectors(common.DefaultNq, common.DefaultDim*2, entity.FieldTypeSparseVector) + queryVec2 := common.GenSearchVectors(common.DefaultNq, common.DefaultDim, entity.FieldTypeFloatVector) + sp1, _ := entity.NewIndexSparseInvertedSearchParam(0.2) + sp2, _ := entity.NewIndexHNSWSearchParam(20) + expr := fmt.Sprintf("%s > 1", common.DefaultIntFieldName) + sReqs := []*client.ANNSearchRequest{ + client.NewANNSearchRequest(common.DefaultSparseVecFieldName, entity.IP, expr, queryVec1, sp1, common.DefaultTopK), + client.NewANNSearchRequest(common.DefaultFloatVecFieldName, entity.L2, "", queryVec2, sp2, common.DefaultTopK), + } + for _, reranker := range []client.Reranker{ + client.NewRRFReranker(), + client.NewWeightedReranker([]float64{0.5, 0.6}), + } { + // hybrid search + searchRes, errSearch := mc.HybridSearch(ctx, collName, []string{}, common.DefaultTopK, []string{"*"}, reranker, sReqs) + common.CheckErr(t, errSearch, true) + common.CheckSearchResult(t, searchRes, common.DefaultNq, common.DefaultTopK) + common.CheckErr(t, errSearch, true) + outputFields := []string{common.DefaultIntFieldName, common.DefaultVarcharFieldName, common.DefaultFloatVecFieldName, + common.DefaultSparseVecFieldName, common.DefaultDynamicFieldName} + common.CheckOutputFields(t, searchRes[0].Fields, outputFields) + } + } +} diff --git a/test/testcases/index_test.go b/test/testcases/index_test.go index 739982373..a9fa0539e 100644 --- a/test/testcases/index_test.go +++ b/test/testcases/index_test.go @@ -672,6 +672,122 @@ func TestCreateAutoIndexScalarFields(t *testing.T) { } } +func TestCreateIndexSparseVector(t *testing.T) { + t.Parallel() + idxInverted := entity.NewGenericIndex(common.DefaultSparseVecFieldName, "SPARSE_INVERTED_INDEX", map[string]string{"drop_ratio_build": "0.2", 
"metric_type": "IP"}) + idxWand := entity.NewGenericIndex(common.DefaultSparseVecFieldName, "SPARSE_WAND", map[string]string{"drop_ratio_build": "0.3", "metric_type": "IP"}) + + for _, idx := range []entity.Index{idxInverted, idxWand} { + ctx := createContext(t, time.Second*common.DefaultTimeout) + //connect + mc := createMilvusClient(ctx, t) + + // create collection with all datatype + cp := CollectionParams{CollectionFieldsType: Int64VarcharSparseVec, AutoID: false, EnableDynamicField: true, + ShardsNum: common.DefaultShards, Dim: common.DefaultDim, MaxLength: 300} + collName := createCollection(ctx, t, mc, cp) + + // insert + dp := DataParams{CollectionName: collName, PartitionName: "", CollectionFieldsType: Int64VarcharSparseVec, + start: 0, nb: common.DefaultNb, dim: common.DefaultDim, EnableDynamicField: true, WithRows: false} + _, _ = insertData(ctx, t, mc, dp, common.WithSparseVectorLen(100)) + mc.Flush(ctx, collName, false) + + // create index + err := mc.CreateIndex(ctx, collName, common.DefaultSparseVecFieldName, idx, false) + common.CheckErr(t, err, true) + + // describe index + idx2, err := mc.DescribeIndex(ctx, collName, common.DefaultSparseVecFieldName) + common.CheckErr(t, err, true) + common.CheckIndexResult(t, idx2, idx) + } +} + +// create index on sparse vector with invalid params +func TestCreateSparseIndexInvalidParams(t *testing.T) { + for _, indexType := range []entity.IndexType{"SPARSE_INVERTED_INDEX", "SPARSE_WAND"} { + ctx := createContext(t, time.Second*common.DefaultTimeout) + //connect + mc := createMilvusClient(ctx, t) + + // create collection with all datatype + cp := CollectionParams{CollectionFieldsType: Int64VarcharSparseVec, AutoID: false, EnableDynamicField: true, + ShardsNum: common.DefaultShards, Dim: common.DefaultDim, MaxLength: 300} + collName := createCollection(ctx, t, mc, cp) + + // insert + dp := DataParams{CollectionName: collName, PartitionName: "", CollectionFieldsType: Int64VarcharSparseVec, + start: 0, nb: 
common.DefaultNb, dim: common.DefaultDim, EnableDynamicField: true, WithRows: false} + _, _ = insertData(ctx, t, mc, dp, common.WithSparseVectorLen(100)) + mc.Flush(ctx, collName, false) + + // create index with invalid metric type + for _, mt := range common.UnsupportedSparseVecMetricsType { + idx := entity.NewGenericIndex(common.DefaultSparseVecFieldName, indexType, map[string]string{"drop_ratio_build": "0.2", "metric_type": string(mt)}) + err := mc.CreateIndex(ctx, collName, common.DefaultSparseVecFieldName, idx, false) + common.CheckErr(t, err, false, "only IP is the supported metric type for sparse index") + } + + // create index with invalid drop_ratio_build + for _, drb := range []string{"a", "-0.1", "1.3"} { + idx := entity.NewGenericIndex(common.DefaultSparseVecFieldName, indexType, map[string]string{"drop_ratio_build": drb, "metric_type": "IP"}) + err := mc.CreateIndex(ctx, collName, common.DefaultSparseVecFieldName, idx, false) + common.CheckErr(t, err, false, "must be in range [0, 1)") + } + + // create index and describe index + idx := entity.NewGenericIndex(common.DefaultSparseVecFieldName, indexType, map[string]string{"drop_ratio_build": "0", "metric_type": "IP"}) + err := mc.CreateIndex(ctx, collName, common.DefaultSparseVecFieldName, idx, false) + common.CheckErr(t, err, true) + + descIdx, _ := mc.DescribeIndex(ctx, collName, common.DefaultSparseVecFieldName) + log.Println(descIdx[0].Name(), descIdx[0].IndexType(), descIdx[0].Params()) + } +} + +//create sparse unsupported index: other vector index and scalar index and auto index +func TestCreateSparseUnsupportedIndex(t *testing.T) { + ctx := createContext(t, time.Second*common.DefaultTimeout) + //connect + mc := createMilvusClient(ctx, t) + + // create collection with all datatype + cp := CollectionParams{CollectionFieldsType: Int64VarcharSparseVec, AutoID: false, EnableDynamicField: true, + ShardsNum: common.DefaultShards, Dim: common.DefaultDim, MaxLength: 300} + collName := 
createCollection(ctx, t, mc, cp) + + // insert + dp := DataParams{CollectionName: collName, PartitionName: "", CollectionFieldsType: Int64VarcharSparseVec, + start: 0, nb: common.DefaultNb, dim: common.DefaultDim, EnableDynamicField: true, WithRows: false} + _, _ = insertData(ctx, t, mc, dp, common.WithSparseVectorLen(100)) + mc.Flush(ctx, collName, false) + + // create unsupported vector index on sparse field + autoIdx, _ := entity.NewIndexAUTOINDEX(entity.IP) + vectorIndex := append(common.GenAllFloatIndex(entity.IP), autoIdx) + for _, idx := range vectorIndex { + log.Println(idx.IndexType(), idx.Params()) + err := mc.CreateIndex(ctx, collName, common.DefaultSparseVecFieldName, idx, false) + common.CheckErr(t, err, false, "data type should be FloatVector, Float16Vector or BFloat16Vector", + "HNSW only support float vector data type") + } + + // create scalar index on sparse vector + for _, idx := range []entity.Index{ + entity.NewScalarIndex(), + entity.NewScalarIndexWithType(entity.Trie), + entity.NewScalarIndexWithType(entity.Sorted), + entity.NewScalarIndexWithType(entity.Inverted), + } { + log.Println(idx.IndexType(), idx.Params()) + err := mc.CreateIndex(ctx, collName, common.DefaultSparseVecFieldName, idx, false) + common.CheckErr(t, err, false, "TRIE are only supported on varchar field", + "STL_SORT are only supported on numeric field", "HNSW only support float vector data type", + "INVERTED are not supported on SparseFloatVector field") + } +} + // test new index by Generic index func TestCreateIndexGeneric(t *testing.T) { ctx := createContext(t, time.Second*common.DefaultTimeout) diff --git a/test/testcases/insert_test.go b/test/testcases/insert_test.go index 20a505f63..62d7434a7 100644 --- a/test/testcases/insert_test.go +++ b/test/testcases/insert_test.go @@ -6,6 +6,7 @@ import ( "context" "fmt" "log" + "math" "math/rand" "strconv" "testing" @@ -426,7 +427,7 @@ func TestInsertEmptyArray(t *testing.T) { collName := createCollection(ctx, t, mc, cp) // 
prepare and insert data - var capacity int64 = 0 + var capacity int64 dp := DataParams{CollectionName: collName, PartitionName: "", CollectionFieldsType: Int64FloatVecArray, start: 0, nb: common.DefaultNb, dim: common.DefaultDim, EnableDynamicField: true, WithRows: false} _, _ = insertData(ctx, t, mc, dp, common.WithArrayCapacity(capacity)) @@ -525,3 +526,123 @@ func TestInsertArrayDataCapacityExceed(t *testing.T) { common.CheckErr(t, err, false, "array length exceeds max capacity") } } + +// test insert sparse vector column and rows +func TestInsertSparseData(t *testing.T) { + ctx := createContext(t, time.Second*common.DefaultTimeout) + // connect + mc := createMilvusClient(ctx, t) + + cp := CollectionParams{CollectionFieldsType: Int64VarcharSparseVec, AutoID: false, EnableDynamicField: true, + ShardsNum: common.DefaultShards, Dim: common.DefaultDim, MaxLength: common.TestMaxLen} + collName := createCollection(ctx, t, mc, cp) + + // insert data column + intColumn1 := common.GenColumnData(0, common.DefaultNb, entity.FieldTypeInt64, common.DefaultIntFieldName) + data := []entity.Column{ + intColumn1, + common.GenColumnData(0, common.DefaultNb, entity.FieldTypeVarChar, common.DefaultVarcharFieldName), + common.GenColumnData(0, common.DefaultNb, entity.FieldTypeFloatVector, common.DefaultFloatVecFieldName, common.WithVectorDim(common.DefaultDim)), + common.GenColumnData(0, common.DefaultNb, entity.FieldTypeSparseVector, common.DefaultSparseVecFieldName, common.WithSparseVectorLen(20)), + } + ids, err := mc.Insert(ctx, collName, "", data...) 
+ common.CheckErr(t, err, true) + common.CheckInsertResult(t, ids, intColumn1) + + // insert rows + rows := common.GenDefaultSparseRows(common.DefaultNb, common.DefaultNb, common.DefaultDim, 50, true) + ids2, err := mc.InsertRows(ctx, collName, "", rows) + common.CheckErr(t, err, true) + require.Equal(t, ids2.Len(), common.DefaultNb) + + // flush and verify + err = mc.Flush(ctx, collName, false) + common.CheckErr(t, err, true) + stats, _ := mc.GetCollectionStatistics(ctx, collName) + require.Equal(t, strconv.Itoa(common.DefaultNb*2), stats[common.RowCount]) +} + +// the dimension of a sparse embedding can be any value from 0 to (maximum of uint32 - 1) +func TestInsertSparseDataMaxDim(t *testing.T) { + // valid sparse vector: positions up to (maximum of uint32 - 1) are accepted + ctx := createContext(t, time.Second*common.DefaultTimeout) + // connect + mc := createMilvusClient(ctx, t) + + cp := CollectionParams{CollectionFieldsType: Int64VarcharSparseVec, AutoID: false, EnableDynamicField: true, + ShardsNum: common.DefaultShards, Dim: common.DefaultDim, MaxLength: common.TestMaxLen} + collName := createCollection(ctx, t, mc, cp) + + // insert data column + pkColumn := common.GenColumnData(0, 1, entity.FieldTypeInt64, common.DefaultIntFieldName) + data := []entity.Column{ + pkColumn, + common.GenColumnData(0, 1, entity.FieldTypeVarChar, common.DefaultVarcharFieldName), + common.GenColumnData(0, 1, entity.FieldTypeFloatVector, common.DefaultFloatVecFieldName, common.WithVectorDim(common.DefaultDim)), + } + // sparse vector with max dim + positions := []uint32{0, math.MaxUint32 - 10, math.MaxUint32 - 1} + values := []float32{0.453, 5.0776, 100.098} + sparseVec, err := entity.NewSliceSparseEmbedding(positions, values) + common.CheckErr(t, err, true) + data = append(data, entity.NewColumnSparseVectors(common.DefaultSparseVecFieldName, []entity.SparseEmbedding{sparseVec})) + ids, err := mc.Insert(ctx, collName, "", data...) 
+ common.CheckErr(t, err, true) + common.CheckInsertResult(t, ids, pkColumn) +} + +func TestInsertSparseInvalidVector(t *testing.T) { + // invalid sparse vector: len(positions) != len(values) + positions := []uint32{1, 10} + values := []float32{0.4, 5.0, 0.34} + _, err := entity.NewSliceSparseEmbedding(positions, values) + common.CheckErr(t, err, false, "invalid sparse embedding input, positions shall have same number of values") + + // invalid sparse vector: positions >= uint32 + ctx := createContext(t, time.Second*common.DefaultTimeout) + // connect + mc := createMilvusClient(ctx, t) + + cp := CollectionParams{CollectionFieldsType: Int64VarcharSparseVec, AutoID: false, EnableDynamicField: true, + ShardsNum: common.DefaultShards, Dim: common.DefaultDim, MaxLength: common.TestMaxLen} + collName := createCollection(ctx, t, mc, cp) + + // insert data column + data := []entity.Column{ + common.GenColumnData(0, 1, entity.FieldTypeInt64, common.DefaultIntFieldName), + common.GenColumnData(0, 1, entity.FieldTypeVarChar, common.DefaultVarcharFieldName), + common.GenColumnData(0, 1, entity.FieldTypeFloatVector, common.DefaultFloatVecFieldName, common.WithVectorDim(common.DefaultDim)), + } + // invalid sparse vector: position > (maximum of uint32 - 1) + positions = []uint32{math.MaxUint32} + values = []float32{0.4} + sparseVec, err := entity.NewSliceSparseEmbedding(positions, values) + common.CheckErr(t, err, true) + data = append(data, entity.NewColumnSparseVectors(common.DefaultSparseVecFieldName, []entity.SparseEmbedding{sparseVec})) + _, err = mc.Insert(ctx, collName, "", data...) 
+ common.CheckErr(t, err, false, "invalid index in sparse float vector: must be less than 2^32-1") +} + +func TestInsertSparseVectorSamePosition(t *testing.T) { + // invalid sparse vector: unsorted positions with a duplicated position + ctx := createContext(t, time.Second*common.DefaultTimeout) + // connect + mc := createMilvusClient(ctx, t) + + cp := CollectionParams{CollectionFieldsType: Int64VarcharSparseVec, AutoID: false, EnableDynamicField: true, + ShardsNum: common.DefaultShards, Dim: common.DefaultDim, MaxLength: common.TestMaxLen} + collName := createCollection(ctx, t, mc, cp) + + //insert data column + data := []entity.Column{ + common.GenColumnData(0, 1, entity.FieldTypeInt64, common.DefaultIntFieldName), + common.GenColumnData(0, 1, entity.FieldTypeVarChar, common.DefaultVarcharFieldName), + common.GenColumnData(0, 1, entity.FieldTypeFloatVector, common.DefaultFloatVecFieldName, common.WithVectorDim(common.DefaultDim)), + } + // invalid sparse vector: position 2 appears twice and the positions are not sorted + sparseVec, err := entity.NewSliceSparseEmbedding([]uint32{2, 10, 2}, []float32{0.4, 0.5, 0.6}) + common.CheckErr(t, err, true) + data = append(data, entity.NewColumnSparseVectors(common.DefaultSparseVecFieldName, []entity.SparseEmbedding{sparseVec})) + _, err = mc.Insert(ctx, collName, "", data...) 
+ common.CheckErr(t, err, false, "unsorted or same indices in sparse float vector") +} diff --git a/test/testcases/main_test.go b/test/testcases/main_test.go index 74d21e5c6..db7e16fc5 100644 --- a/test/testcases/main_test.go +++ b/test/testcases/main_test.go @@ -242,13 +242,14 @@ func createVarcharCollectionWithDataIndex(ctx context.Context, t *testing.T, mc } const ( - Int64FloatVec CollectionFieldsType = "PkInt64FloatVec" // int64 + float + floatVec - Int64BinaryVec CollectionFieldsType = "Int64BinaryVec" // int64 + float + binaryVec - VarcharBinaryVec CollectionFieldsType = "PkVarcharBinaryVec" // varchar + binaryVec - Int64FloatVecJSON CollectionFieldsType = "PkInt64FloatVecJson" // int64 + float + floatVec + json - Int64FloatVecArray CollectionFieldsType = "Int64FloatVecArray" // int64 + float + floatVec + all array - AllVectors CollectionFieldsType = "AllVectors" // int64 + fp32Vec + fp16Vec + binaryVec - AllFields CollectionFieldsType = "AllFields" // all scalar fields + floatVec + Int64FloatVec CollectionFieldsType = "PkInt64FloatVec" // int64 + float + floatVec + Int64BinaryVec CollectionFieldsType = "Int64BinaryVec" // int64 + float + binaryVec + VarcharBinaryVec CollectionFieldsType = "PkVarcharBinaryVec" // varchar + binaryVec + Int64FloatVecJSON CollectionFieldsType = "PkInt64FloatVecJson" // int64 + float + floatVec + json + Int64FloatVecArray CollectionFieldsType = "Int64FloatVecArray" // int64 + float + floatVec + all array + Int64VarcharSparseVec CollectionFieldsType = "Int64VarcharSparseVec" // int64 + varchar + float32Vec + sparseVec + AllVectors CollectionFieldsType = "AllVectors" // int64 + fp32Vec + fp16Vec + binaryVec + AllFields CollectionFieldsType = "AllFields" // all scalar fields + floatVec ) func createCollection(ctx context.Context, t *testing.T, mc *base.MilvusClient, cp CollectionParams, opts ...client.CreateCollectionOption) string { @@ -271,6 +272,14 @@ func createCollection(ctx context.Context, t *testing.T, mc 
*base.MilvusClient, case Int64FloatVecArray: fields = common.GenDefaultFields(cp.AutoID) fields = append(fields, common.GenAllArrayFieldsWithCapacity(cp.MaxCapacity)...) + case Int64VarcharSparseVec: + fields = []*entity.Field{ + common.GenField(common.DefaultIntFieldName, entity.FieldTypeInt64, common.WithIsPrimaryKey(true), common.WithAutoID(cp.AutoID)), + common.GenField(common.DefaultVarcharFieldName, entity.FieldTypeVarChar, common.WithMaxLength(cp.MaxLength)), + common.GenField(common.DefaultFloatVecFieldName, entity.FieldTypeFloatVector, common.WithDim(cp.Dim)), + common.GenField(common.DefaultSparseVecFieldName, entity.FieldTypeSparseVector), + } + case AllVectors: fields = []*entity.Field{ common.GenField(common.DefaultIntFieldName, entity.FieldTypeInt64, common.WithIsPrimaryKey(true), common.WithAutoID(cp.AutoID)), @@ -347,7 +356,15 @@ func insertData(ctx context.Context, t *testing.T, mc *base.MilvusClient, dp Dat intColumn, floatColumn, vecColumn := common.GenDefaultColumnData(dp.start, dp.nb, dp.dim) data = append(data, intColumn, floatColumn, vecColumn) } - + case Int64VarcharSparseVec: + if dp.WithRows { + rows = common.GenDefaultSparseRows(dp.start, dp.nb, dp.dim, dp.maxLenSparse, dp.EnableDynamicField) + } else { + intColumn, _, vecColumn := common.GenDefaultColumnData(dp.start, dp.nb, dp.dim) + varColumn := common.GenColumnData(dp.start, dp.nb, entity.FieldTypeVarChar, common.DefaultVarcharFieldName) + sparseColumn := common.GenColumnData(dp.start, dp.nb, entity.FieldTypeSparseVector, common.DefaultSparseVecFieldName, opts...) 
+ data = append(data, intColumn, varColumn, vecColumn, sparseColumn) + } case AllVectors: if dp.WithRows { rows = common.GenAllVectorsRows(dp.start, dp.nb, dp.dim, dp.EnableDynamicField) diff --git a/test/testcases/option.go b/test/testcases/option.go index 171f69669..6a88973d0 100644 --- a/test/testcases/option.go +++ b/test/testcases/option.go @@ -33,6 +33,7 @@ type DataParams struct { EnableDynamicField bool // whether insert dynamic field data WithRows bool DoInsert bool + maxLenSparse int } func (d DataParams) IsEmpty() bool { diff --git a/test/testcases/query_test.go b/test/testcases/query_test.go index 0b384fb55..1aa41fdc7 100644 --- a/test/testcases/query_test.go +++ b/test/testcases/query_test.go @@ -1075,6 +1075,49 @@ func TestQueryCountAfterDml(t *testing.T) { require.Equal(t, int64(common.DefaultNb+upsertNb), countAfterCompact.GetColumn(common.QueryCountFieldName).(*entity.ColumnInt64).Data()[0]) } +func TestQuerySparseVector(t *testing.T) { + t.Parallel() + idxInverted := entity.NewGenericIndex(common.DefaultSparseVecFieldName, "SPARSE_INVERTED_INDEX", map[string]string{"drop_ratio_build": "0.2", "metric_type": "IP"}) + idxWand := entity.NewGenericIndex(common.DefaultSparseVecFieldName, "SPARSE_WAND", map[string]string{"drop_ratio_build": "0.3", "metric_type": "IP"}) + for _, idx := range []entity.Index{idxInverted, idxWand} { + ctx := createContext(t, time.Second*common.DefaultTimeout*2) + // connect + mc := createMilvusClient(ctx, t) + + // create -> insert [0, 3000) -> flush -> index -> load + cp := CollectionParams{CollectionFieldsType: Int64VarcharSparseVec, AutoID: false, EnableDynamicField: false, + ShardsNum: common.DefaultShards, Dim: common.DefaultDim, MaxLength: common.TestMaxLen} + collName := createCollection(ctx, t, mc, cp) + + // index + idxHnsw, _ := entity.NewIndexHNSW(entity.L2, 8, 96) + mc.CreateIndex(ctx, collName, common.DefaultFloatVecFieldName, idxHnsw, false) + mc.CreateIndex(ctx, collName, common.DefaultSparseVecFieldName, idx, 
false) + + // insert + intColumn, _, floatColumn := common.GenDefaultColumnData(0, common.DefaultNb, common.DefaultDim) + varColumn := common.GenColumnData(0, common.DefaultNb, entity.FieldTypeVarChar, common.DefaultVarcharFieldName) + sparseColumn := common.GenColumnData(0, common.DefaultNb, entity.FieldTypeSparseVector, common.DefaultSparseVecFieldName) + mc.Insert(ctx, collName, "", intColumn, varColumn, floatColumn, sparseColumn) + mc.Flush(ctx, collName, false) + mc.LoadCollection(ctx, collName, false) + + // count(*) + countRes, _ := mc.Query(ctx, collName, []string{}, fmt.Sprintf("%s >=0", common.DefaultIntFieldName), []string{common.QueryCountFieldName}) + require.Equal(t, int64(common.DefaultNb), countRes.GetColumn(common.QueryCountFieldName).(*entity.ColumnInt64).Data()[0]) + + // query + queryResult, err := mc.Query(ctx, collName, []string{}, fmt.Sprintf("%s == 0", common.DefaultIntFieldName), []string{"*"}) + common.CheckErr(t, err, true) + expIntColumn := entity.NewColumnInt64(common.DefaultIntFieldName, intColumn.(*entity.ColumnInt64).Data()[:1]) + expVarcharColumn := entity.NewColumnVarChar(common.DefaultVarcharFieldName, varColumn.(*entity.ColumnVarChar).Data()[:1]) + expVecColumn := entity.NewColumnFloatVector(common.DefaultFloatVecFieldName, int(common.DefaultDim), floatColumn.(*entity.ColumnFloatVector).Data()[:1]) + expSparseColumn := entity.NewColumnSparseVectors(common.DefaultSparseVecFieldName, sparseColumn.(*entity.ColumnSparseFloatVector).Data()[:1]) + common.CheckOutputFields(t, queryResult, []string{common.DefaultIntFieldName, common.DefaultVarcharFieldName, common.DefaultFloatVecFieldName, common.DefaultSparseVecFieldName}) + common.CheckQueryResult(t, queryResult, []entity.Column{expIntColumn, expVarcharColumn, expVecColumn, expSparseColumn}) + } +} + // TODO offset and limit // TODO consistency level // TODO ignore growing diff --git a/test/testcases/search_test.go b/test/testcases/search_test.go index e00657fa9..ccde2e54f 100644 --- 
a/test/testcases/search_test.go +++ b/test/testcases/search_test.go @@ -1552,7 +1552,114 @@ func TestSearchMultiVectors(t *testing.T) { } // TODO iterator search } +} + +func TestSearchSparseVector(t *testing.T) { + t.Skip("https://github.com/milvus-io/milvus-sdk-go/issues/725") + idxInverted := entity.NewGenericIndex(common.DefaultSparseVecFieldName, "SPARSE_INVERTED_INDEX", map[string]string{"drop_ratio_build": "0.2", "metric_type": "IP"}) + idxWand := entity.NewGenericIndex(common.DefaultSparseVecFieldName, "SPARSE_WAND", map[string]string{"drop_ratio_build": "0.3", "metric_type": "IP"}) + for _, idx := range []entity.Index{idxInverted, idxWand} { + ctx := createContext(t, time.Second*common.DefaultTimeout*2) + // connect + mc := createMilvusClient(ctx, t) + + // create -> insert [0, 3000) -> flush -> index -> load + cp := CollectionParams{CollectionFieldsType: Int64VarcharSparseVec, AutoID: false, EnableDynamicField: true, + ShardsNum: common.DefaultShards, Dim: common.DefaultDim, MaxLength: common.TestMaxLen} + + dp := DataParams{DoInsert: true, CollectionFieldsType: Int64VarcharSparseVec, start: 0, nb: common.DefaultNb * 4, + dim: common.DefaultDim, EnableDynamicField: true} + + // index params + idxHnsw, _ := entity.NewIndexHNSW(entity.L2, 8, 96) + ips := []IndexParams{ + {BuildIndex: true, Index: idx, FieldName: common.DefaultSparseVecFieldName, async: false}, + {BuildIndex: true, Index: idxHnsw, FieldName: common.DefaultFloatVecFieldName, async: false}, + } + collName := prepareCollection(ctx, t, mc, cp, WithDataParams(dp), WithIndexParams(ips), WithCreateOption(client.WithConsistencyLevel(entity.ClStrong))) + + // search + queryVec := common.GenSearchVectors(common.DefaultNq, common.DefaultDim, entity.FieldTypeSparseVector) + sp, _ := entity.NewIndexSparseInvertedSearchParam(0.2) + resSearch, errSearch := mc.Search(ctx, collName, []string{}, "", []string{"*"}, queryVec, common.DefaultSparseVecFieldName, + entity.IP, common.DefaultTopK, sp) + 
common.CheckErr(t, errSearch, true) + common.CheckSearchResult(t, resSearch, common.DefaultNq, common.DefaultTopK) + outputFields := []string{common.DefaultIntFieldName, common.DefaultVarcharFieldName, common.DefaultFloatVecFieldName, + common.DefaultSparseVecFieldName, common.DefaultDynamicFieldName} + common.CheckOutputFields(t, resSearch[0].Fields, outputFields) + } +} + +// test sparse vector unsupported search: range search, groupBy search, TODO iterator search +func TestSearchSparseVectorNotSupported(t *testing.T) { + ctx := createContext(t, time.Second*common.DefaultTimeout*2) + // connect + mc := createMilvusClient(ctx, t) + + // create -> insert [0, 3000) -> flush -> index -> load + cp := CollectionParams{CollectionFieldsType: Int64VarcharSparseVec, AutoID: false, EnableDynamicField: true, + ShardsNum: common.DefaultShards, Dim: common.DefaultDim, MaxLength: common.TestMaxLen} + + dp := DataParams{DoInsert: true, CollectionFieldsType: Int64VarcharSparseVec, start: 0, nb: common.DefaultNb * 2, + dim: common.DefaultDim, EnableDynamicField: true} + + // index params + idxHnsw, _ := entity.NewIndexHNSW(entity.L2, 8, 96) + idxWand := entity.NewGenericIndex(common.DefaultSparseVecFieldName, "SPARSE_WAND", map[string]string{"drop_ratio_build": "0.3", "metric_type": "IP"}) + ips := []IndexParams{ + {BuildIndex: true, Index: idxWand, FieldName: common.DefaultSparseVecFieldName, async: false}, + {BuildIndex: true, Index: idxHnsw, FieldName: common.DefaultFloatVecFieldName, async: false}, + } + collName := prepareCollection(ctx, t, mc, cp, WithDataParams(dp), WithIndexParams(ips), WithCreateOption(client.WithConsistencyLevel(entity.ClStrong))) + + // range search + queryVec := common.GenSearchVectors(common.DefaultNq, common.DefaultDim, entity.FieldTypeSparseVector) + sp, _ := entity.NewIndexSparseInvertedSearchParam(0.3) + sp.AddRadius(10) + sp.AddRangeFilter(100) + _, errSearch := mc.Search(ctx, collName, []string{}, "", []string{"*"}, queryVec, 
common.DefaultSparseVecFieldName, + entity.IP, common.DefaultTopK, sp) + common.CheckErr(t, errSearch, false, "RangeSearch not supported for current index type") + + // TODO sparse not supported groupBy search + //sp1, _ := entity.NewIndexSparseInvertedSearchParam(0.2) + //_, errSearch = mc.Search(ctx, collName, []string{}, "", []string{"*"}, queryVec, common.DefaultSparseVecFieldName, + // entity.IP, common.DefaultTopK, sp1, client.WithGroupByField(common.DefaultVarcharFieldName)) + //common.CheckErr(t, errSearch, false, "RangeSearch not supported for current index type") +} + +// test sparse vector search with invalid search params: drop ratios 1.2 and -0.3 are expected to fail with an "invalid" error +func TestSearchSparseVectorInvalidParams(t *testing.T) { + t.Skip("https://github.com/milvus-io/milvus-sdk-go/issues/724") + ctx := createContext(t, time.Second*common.DefaultTimeout*2) + // connect + mc := createMilvusClient(ctx, t) + + // create -> insert [0, 3000) -> flush -> index -> load + cp := CollectionParams{CollectionFieldsType: Int64VarcharSparseVec, AutoID: false, EnableDynamicField: true, + ShardsNum: common.DefaultShards, Dim: common.DefaultDim, MaxLength: common.TestMaxLen} + + dp := DataParams{DoInsert: true, CollectionFieldsType: Int64VarcharSparseVec, start: 0, nb: common.DefaultNb * 2, + dim: common.DefaultDim, EnableDynamicField: true} + // index params + idxHnsw, _ := entity.NewIndexHNSW(entity.L2, 8, 96) + idxWand := entity.NewGenericIndex(common.DefaultSparseVecFieldName, "SPARSE_WAND", map[string]string{"drop_ratio_build": "0.3", "metric_type": "IP"}) + ips := []IndexParams{ + {BuildIndex: true, Index: idxWand, FieldName: common.DefaultSparseVecFieldName, async: false}, + {BuildIndex: true, Index: idxHnsw, FieldName: common.DefaultFloatVecFieldName, async: false}, + } + collName := prepareCollection(ctx, t, mc, cp, WithDataParams(dp), WithIndexParams(ips), WithCreateOption(client.WithConsistencyLevel(entity.ClStrong))) + + // search with invalid params + queryVec :=
common.GenSearchVectors(common.DefaultNq, common.DefaultDim, entity.FieldTypeSparseVector) + for _, dropRatio := range []float64{1.2, -0.3} { + sp, _ := entity.NewIndexSparseInvertedSearchParam(dropRatio) + _, errSearch := mc.Search(ctx, collName, []string{}, "", []string{"*"}, queryVec, common.DefaultSparseVecFieldName, + entity.IP, common.DefaultTopK, sp) + common.CheckErr(t, errSearch, false, "invalid") + } } // TODO offset and limit