From 7c44b8806c8aece2f6431cae67efed89c05aeec7 Mon Sep 17 00:00:00 2001 From: ryjiang Date: Mon, 18 Nov 2024 15:27:00 +0800 Subject: [PATCH] [2.5] support full text search (#374) * WIP Signed-off-by: ryjiang * WIP Signed-off-by: ryjiang * finish full text search Signed-off-by: ryjiang * updat test Signed-off-by: ryjiang --------- Signed-off-by: ryjiang --- milvus/types/Collection.ts | 2 +- milvus/utils/Format.ts | 89 +++++++------- package.json | 2 +- test/grpc/FullTextSearch.spec.ts | 205 +++++++++++++++++++++++++++++++ test/grpc/Functions.spec.ts | 4 +- test/tools/collection.ts | 2 +- test/utils/Format.spec.ts | 54 ++++++-- 7 files changed, 305 insertions(+), 53 deletions(-) create mode 100644 test/grpc/FullTextSearch.spec.ts diff --git a/milvus/types/Collection.ts b/milvus/types/Collection.ts index 9f5084f2..2b410d07 100644 --- a/milvus/types/Collection.ts +++ b/milvus/types/Collection.ts @@ -83,7 +83,7 @@ export interface FieldType { nullable?: boolean; enable_match?: boolean; tokenizer_params?: Record; - enable_tokenizer?: boolean; + enable_analyzer?: boolean; } export interface ShowCollectionsReq extends GrpcTimeOut { diff --git a/milvus/utils/Format.ts b/milvus/utils/Format.ts index a524c255..1fa94855 100644 --- a/milvus/utils/Format.ts +++ b/milvus/utils/Format.ts @@ -189,11 +189,11 @@ export const formatAddress = (address: string) => { }; /** - * Assigns properties with keys `dim` or `max_length` to the `type_params` object of a `FieldType` object. - * If the property exists in the `field` object, it is converted to a string and then deleted from the `field` object. - * If the property already exists in the `type_params` object, it is also converted to a string. + * Assigns specified properties from the `field` object to `type_params` within the `FieldType` object. + * Converts properties to strings, serializing objects as JSON strings if needed, then removes them from `field`. * - * @param field The `FieldType` object to modify. + * @param field - The `FieldType` object to modify. + * @param typeParamKeys - Keys to assign to `type_params` if present in `field`. * @returns The modified `FieldType` object. */ export const assignTypeParams = ( @@ -203,31 +203,30 @@ export const assignTypeParams = ( 'max_length', 'max_capacity', 'enable_match', - 'enable_tokenizer', - 'tokenizer_params', + 'enable_analyzer', + 'analyzer_params', ] -) => { - let newField = cloneObj(field); +): FieldType => { + const newField = cloneObj(field); + + // Initialize `type_params` if undefined + newField.type_params ??= {}; + typeParamKeys.forEach(key => { - if (newField.hasOwnProperty(key)) { - // if the property exists in the field object, assign it to the type_params object - newField.type_params = newField.type_params || {}; - newField.type_params[key] = - typeof newField[key as keyof FieldType] !== 'object' - ? String(newField[key as keyof FieldType] ?? '') - : (newField[key as keyof FieldType] as TypeParam); - // delete the property from the field object + if (key in newField) { + const value = newField[key as keyof FieldType]; + // Convert the value to a string, JSON-stringify if it’s an object + newField.type_params![key] = + typeof value === 'object' ? JSON.stringify(value) : String(value ?? ''); delete newField[key as keyof FieldType]; } - - if (newField.type_params && newField.type_params[key]) { - // if the property already exists in the type_params object, convert it to a string, - newField.type_params[key] = - typeof newField.type_params[key] !== 'object' - ? String(newField.type_params[key]) - : newField.type_params[key]; - } }); + + // delete type_params if it's empty + if (!Object.keys(newField.type_params).length) { + delete newField.type_params; + } + return newField; }; @@ -328,7 +327,27 @@ export const formatCollectionSchema = ( fields = (data as CreateCollectionWithSchemaReq).schema; } - const payload = { + let payload = {} as any; + + const functionOutputFields: string[] = []; + + // if functions is set, parse its params to key-value pairs, and delete inputs and outputs + if (functions) { + payload.functions = functions.map((func: any) => { + const { input_field_names, output_field_names, ...rest } = func; + + functionOutputFields.push(...output_field_names); + + return schemaTypes.functionSchemaType.create({ + ...rest, + inputFieldNames: input_field_names, + outputFieldNames: output_field_names, + params: parseToKeyValue(func.params, true), + }); + }); + } + + payload = { name: collection_name, description: description || '', enableDynamicField: !!enableDynamicField || !!enable_dynamic_field, @@ -352,7 +371,8 @@ export const formatCollectionSchema = ( isPrimaryKey: !!is_primary_key, isPartitionKey: !!is_partition_key || field.name === partition_key_field, - isFunctionOutput: !!is_function_output, + isFunctionOutput: + !!is_function_output || functionOutputFields.includes(field.name), isClusteringKey: !!field.is_clustering_key || field.name === clustring_key_field, }; @@ -372,21 +392,8 @@ export const formatCollectionSchema = ( } return schemaTypes.fieldSchemaType.create(createObj); }), - functions: [], - } as any; - - // if functions is set, parse its params to key-value pairs, and delete inputs and outputs - if (functions) { - payload.functions = functions.map((func: any) => { - const { input_field_names, output_field_names, ...rest } = func; - return schemaTypes.functionSchemaType.create({ - ...rest, - inputFieldNames: input_field_names, - outputFieldNames: output_field_names, - params: parseToKeyValue(func.params, true), - }); - }); - } + ...payload, + }; return payload; }; diff --git a/package.json b/package.json index e28c53ef..57b1ed03 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "@zilliz/milvus2-sdk-node", "author": "ued@zilliz.com", - "milvusVersion": "master-20241105-bd04cac4-amd64", + "milvusVersion": "master-20241118-12ed40e1-amd64", "version": "2.4.9", "main": "dist/milvus", "files": [ diff --git a/test/grpc/FullTextSearch.spec.ts b/test/grpc/FullTextSearch.spec.ts new file mode 100644 index 00000000..ede2d0a5 --- /dev/null +++ b/test/grpc/FullTextSearch.spec.ts @@ -0,0 +1,205 @@ +import { + MilvusClient, + DataType, + ErrorCode, + MetricType, + ConsistencyLevelEnum, + IndexType, +} from '../../milvus'; +import { + IP, + genCollectionParams, + GENERATE_NAME, + generateInsertData, + dynamicFields, +} from '../tools'; + +const milvusClient = new MilvusClient({ address: IP, logLevel: 'info' }); +const COLLECTION = GENERATE_NAME(); +const dbParam = { + db_name: 'FullTextSearch', +}; +const numPartitions = 3; + +// create +const createCollectionParams = genCollectionParams({ + collectionName: COLLECTION, + dim: [4], + vectorType: [DataType.FloatVector], + autoID: false, + partitionKeyEnabled: true, + numPartitions, + enableDynamic: true, + fields: [ + { + name: 'text', + description: 'text field', + data_type: DataType.VarChar, + max_length: 200, + is_partition_key: false, + enable_analyzer: true, + enable_match: true, + analyzer_params: { tokenizer: 'jieba' }, + }, + ], +}); + +describe(`Full text search API`, () => { + beforeAll(async () => { + // create db and use db + await milvusClient.createDatabase(dbParam); + await milvusClient.use(dbParam); + }); + afterAll(async () => { + await milvusClient.dropCollection({ + collection_name: COLLECTION, + }); + await milvusClient.dropDatabase(dbParam); + }); + + it(`Create schema with function collection should success`, async () => { + const create = await milvusClient.createCollection(createCollectionParams); + + expect(create.error_code).toEqual(ErrorCode.SUCCESS); + + // describe + const describe = await milvusClient.describeCollection({ + collection_name: COLLECTION, + }); + // expect the 'vector' field to be created + expect(describe.schema.fields.length).toEqual( + createCollectionParams.fields.length + ); + + // find varchar field + const text = describe.schema.fields.find(field => field.name === 'text'); + + const enableMatch = text?.type_params?.find( + param => param.key === 'enable_match' + ); + + const enableAnalyzer = text?.type_params?.find( + param => param.key === 'enable_analyzer' + ); + + const analyzerParams = text?.type_params?.find( + param => param.key === 'analyzer_params' + ); + + expect(enableMatch?.value).toEqual('true'); + expect(enableAnalyzer?.value).toEqual('true'); + expect(JSON.parse(analyzerParams?.value as any)).toEqual({ + tokenizer: 'jieba', + }); + }); + + it(`Insert data with function field should success`, async () => { + const data = generateInsertData( + [...createCollectionParams.fields, ...dynamicFields], + 10 + ); + + const insert = await milvusClient.insert({ + collection_name: COLLECTION, + fields_data: data, + }); + + expect(insert.status.error_code).toEqual(ErrorCode.SUCCESS); + }); + + it(`Create index on function output field should success`, async () => { + const createIndex = await milvusClient.createIndex({ + collection_name: COLLECTION, + index_name: 't2', + field_name: 'vector', + index_type: IndexType.AUTOINDEX, + metric_type: MetricType.COSINE, + }); + + expect(createIndex.error_code).toEqual(ErrorCode.SUCCESS); + + // load + const load = await milvusClient.loadCollection({ + collection_name: COLLECTION, + }); + + expect(load.error_code).toEqual(ErrorCode.SUCCESS); + }); + + it(`query with function output field should success`, async () => { + // query + const query = await milvusClient.query({ + collection_name: COLLECTION, + limit: 10, + expr: 'id > 0', + output_fields: ['text'], + filter: "TEXT_MATCH(text, 'apple')", + consistency_level: ConsistencyLevelEnum.Strong, + }); + + expect(query.status.error_code).toEqual(ErrorCode.SUCCESS); + // every text value should be 'apple' + query.data.forEach(item => { + expect(item.text).toEqual('apple'); + }); + }); + + it(`search with text should success`, async () => { + // search nq = 1 + const search = await milvusClient.search({ + collection_name: COLLECTION, + limit: 10, + data: [1, 2, 3, 4], + output_fields: ['text'], + filter: "TEXT_MATCH(text, 'apple')", + params: { drop_ratio_search: 0.6 }, + consistency_level: ConsistencyLevelEnum.Strong, + }); + + expect(search.status.error_code).toEqual(ErrorCode.SUCCESS); + // expect text value to be 'apple' + expect(search.results[0].text).toEqual('apple'); + + // nq > 1 + const search2 = await milvusClient.search({ + collection_name: COLLECTION, + limit: 10, + data: [ + [1, 2, 3, 4], + [5, 6, 7, 8], + ], + output_fields: ['*'], + filter: "TEXT_MATCH(text, 'apple')", + params: { drop_ratio_search: 0.6 }, + consistency_level: ConsistencyLevelEnum.Strong, + }); + + expect(search2.status.error_code).toEqual(ErrorCode.SUCCESS); + // expect text value to be 'apple' + expect(search2.results[0][0].text).toEqual('apple'); + + // multiple search + const search3 = await milvusClient.search({ + collection_name: COLLECTION, + limit: 10, + data: [ + { + data: [1, 2, 3, 4], + anns_field: 'vector', + params: { nprobe: 2 }, + }, + { + data: [5, 6, 7, 8], + anns_field: 'vector', + }, + ], + filter: "TEXT_MATCH(text, 'apple')", + output_fields: ['text'], + consistency_level: ConsistencyLevelEnum.Strong, + }); + + expect(search3.status.error_code).toEqual(ErrorCode.SUCCESS); + // expect text value to be 'apple' + expect(search3.results[0].text).toEqual('apple'); + }); +}); diff --git a/test/grpc/Functions.spec.ts b/test/grpc/Functions.spec.ts index d9ee95f2..fcf554a7 100644 --- a/test/grpc/Functions.spec.ts +++ b/test/grpc/Functions.spec.ts @@ -14,7 +14,7 @@ import { dynamicFields, } from '../tools'; -const milvusClient = new MilvusClient({ address: IP, logLevel: 'info' }); +const milvusClient = new MilvusClient({ address: IP, logLevel: 'debug' }); const COLLECTION = GENERATE_NAME(); const dbParam = { db_name: 'Functions', @@ -37,7 +37,7 @@ const createCollectionParams = genCollectionParams({ data_type: DataType.VarChar, max_length: 20, is_partition_key: false, - enable_tokenizer: true, + enable_analyzer: true, }, { name: 'sparse', diff --git a/test/tools/collection.ts b/test/tools/collection.ts index a44ef4a0..a77d733a 100644 --- a/test/tools/collection.ts +++ b/test/tools/collection.ts @@ -132,7 +132,7 @@ export const genCollectionParams = (data: { default_value: DEFAULT_STRING_VALUE, max_length: MAX_LENGTH, is_partition_key: partitionKeyEnabled, - enable_tokenizer: true, + enable_analyzer: true, }, { name: 'json', diff --git a/test/utils/Format.spec.ts b/test/utils/Format.spec.ts index 4a00d05a..66c794c4 100644 --- a/test/utils/Format.spec.ts +++ b/test/utils/Format.spec.ts @@ -29,6 +29,7 @@ import { formatSearchData, buildSearchRequest, FieldSchema, + CreateCollectionReq, buildSearchParams, SearchSimpleReq, } from '../../milvus'; @@ -178,15 +179,15 @@ describe('utils/format', () => { expect(methodName).toBe('123'); }); - it('should assign properties with keys `dim` or `max_length` to the `type_params`, `enable_match`, `tokenizer_params`, `enable_tokenizer` object and delete them from the `field` object', () => { + it('should assign properties with keys `dim` or `max_length` to the `type_params`, `enable_match`, `analyzer_params`, `enable_analyzer` object and delete them from the `field` object', () => { const field = { name: 'vector', data_type: 'BinaryVector', dim: 128, max_length: 100, enable_match: true, - tokenizer_params: { key: 'value' }, - enable_tokenizer: true, + analyzer_params: { key: 'value' }, + enable_analyzer: true, } as FieldType; const expectedOutput = { name: 'vector', @@ -195,8 +196,8 @@ describe('utils/format', () => { dim: '128', max_length: '100', enable_match: 'true', - tokenizer_params: { key: 'value' }, - enable_tokenizer: 'true', + analyzer_params: JSON.stringify({ key: 'value' }), + enable_analyzer: 'true', }, }; expect(assignTypeParams(field)).toEqual(expectedOutput); @@ -291,8 +292,23 @@ describe('utils/format', () => { max_capacity: 64, element_type: DataType.Int64, }, + { + name: 'sparse', + data_type: DataType.SparseFloatVector, + description: 'sparse field', + }, ], - } as any; + functions: [ + { + name: 'bm25f1', + description: 'bm25 function', + type: 1, + input_field_names: ['testField1'], + output_field_names: ['sparse'], + params: { a: 1 }, + }, + ], + } as CreateCollectionReq; const schemaProtoPath = path.resolve( __dirname, @@ -346,18 +362,42 @@ describe('utils/format', () => { isPrimaryKey: false, isPartitionKey: false, isFunctionOutput: false, + isClusteringKey: false, elementType: 5, element_type: 5, + }, + { + typeParams: [], + indexParams: [], + name: 'sparse', + description: 'sparse field', + data_type: 104, + dataType: 104, + isPrimaryKey: false, + isPartitionKey: false, + isFunctionOutput: true, isClusteringKey: false, }, ], - functions: [], + functions: [ + { + inputFieldNames: ['testField1'], + inputFieldIds: [], + outputFieldNames: ['sparse'], + outputFieldIds: [], + params: [{ key: 'a', value: '1' }], + name: 'bm25f1', + description: 'bm25 function', + type: 1, + }, + ], }; const payload = formatCollectionSchema(data, { fieldSchemaType, functionSchemaType, }); + expect(payload).toEqual(expectedResult); });