From 205231b9c77c9eb067e064d8182904e208c11ad1 Mon Sep 17 00:00:00 2001
From: "cai.zhang"
Date: Mon, 23 Dec 2024 13:40:49 +0800
Subject: [PATCH] fix: Decode unicode for json key in expression (#38651)

issue: #38626

Signed-off-by: Cai Zhang
---
Review note: decodeUnicode in utils.go was amended relative to the original
commit (pattern compiled once at package scope, parse error handled), so the
utils.go blob id on its index line and the commit id above predate this change.

 .../parser/planparserv2/parser_visitor.go     |  2 +
 .../parser/planparserv2/plan_parser_v2.go     | 32 -----------
 .../planparserv2/plan_parser_v2_test.go       | 33 +++++++++++
 internal/parser/planparserv2/utils.go         | 56 +++++++++++++++++++
 internal/parser/planparserv2/utils_test.go    |  7 +++
 5 files changed, 98 insertions(+), 32 deletions(-)

diff --git a/internal/parser/planparserv2/parser_visitor.go b/internal/parser/planparserv2/parser_visitor.go
index ffc4f13623459..f9f3c0f7be784 100644
--- a/internal/parser/planparserv2/parser_visitor.go
+++ b/internal/parser/planparserv2/parser_visitor.go
@@ -28,6 +28,7 @@ func (v *ParserVisitor) VisitParens(ctx *parser.ParensContext) interface{} {
 }
 
 func (v *ParserVisitor) translateIdentifier(identifier string) (*ExprWithType, error) {
+	identifier = decodeUnicode(identifier)
 	field, err := v.schema.GetFieldFromNameDefaultJSON(identifier)
 	if err != nil {
 		return nil, err
@@ -1005,6 +1006,7 @@ func (v *ParserVisitor) VisitBitOr(ctx *parser.BitOrContext) interface{} {
  */
 // More tests refer to plan_parser_v2_test.go::Test_JSONExpr
 func (v *ParserVisitor) getColumnInfoFromJSONIdentifier(identifier string) (*planpb.ColumnInfo, error) {
+	identifier = decodeUnicode(identifier)
 	fieldName := strings.Split(identifier, "[")[0]
 	nestedPath := make([]string, 0)
 	field, err := v.schema.GetFieldFromNameDefaultJSON(fieldName)
diff --git a/internal/parser/planparserv2/plan_parser_v2.go b/internal/parser/planparserv2/plan_parser_v2.go
index e298bcaa53a24..0c8e9f73561f4 100644
--- a/internal/parser/planparserv2/plan_parser_v2.go
+++ b/internal/parser/planparserv2/plan_parser_v2.go
@@ -2,9 +2,7 @@ package planparserv2
 
 import (
 	"fmt"
-	"strings"
 	"time"
-	"unicode"
 
 	"github.com/antlr4-go/antlr/v4"
 	"github.com/hashicorp/golang-lru/v2/expirable"
@@ -153,36 +151,6 @@ func CreateRetrievePlan(schema *typeutil.SchemaHelper, exprStr string, exprTempl
 	return planNode, nil
 }
 
-func convertHanToASCII(s string) string {
-	var builder strings.Builder
-	builder.Grow(len(s) * 6)
-	skipCur := false
-	n := len(s)
-	for i, r := range s {
-		if skipCur {
-			builder.WriteRune(r)
-			skipCur = false
-			continue
-		}
-		if r == '\\' {
-			if i+1 < n && !isEscapeCh(s[i+1]) {
-				return s
-			}
-			skipCur = true
-			builder.WriteRune(r)
-			continue
-		}
-
-		if unicode.Is(unicode.Han, r) {
-			builder.WriteString(formatUnicode(uint32(r)))
-		} else {
-			builder.WriteRune(r)
-		}
-	}
-
-	return builder.String()
-}
-
 func CreateSearchPlan(schema *typeutil.SchemaHelper, exprStr string, vectorFieldName string, queryInfo *planpb.QueryInfo, exprTemplateValues map[string]*schemapb.TemplateValue) (*planpb.PlanNode, error) {
 	parse := func() (*planpb.Expr, error) {
 		if len(exprStr) <= 0 {
diff --git a/internal/parser/planparserv2/plan_parser_v2_test.go b/internal/parser/planparserv2/plan_parser_v2_test.go
index 68b2943a2fd9c..b3f7056ff80bc 100644
--- a/internal/parser/planparserv2/plan_parser_v2_test.go
+++ b/internal/parser/planparserv2/plan_parser_v2_test.go
@@ -1473,3 +1473,36 @@ func BenchmarkTemplateWithString(b *testing.B) {
 		assert.NotNil(b, plan)
 	}
 }
+
+func TestNestedPathWithChinese(t *testing.T) {
+	schema := newTestSchemaHelper(t)
+
+	expr := `A["姓名"] == "小明"`
+	plan, err := CreateSearchPlan(schema, expr, "FloatVectorField", &planpb.QueryInfo{
+		Topk:         0,
+		MetricType:   "",
+		SearchParams: "",
+		RoundDecimal: 0,
+	}, nil)
+	assert.NoError(t, err, expr)
+	paths := plan.GetVectorAnns().GetPredicates().GetUnaryRangeExpr().GetColumnInfo().GetNestedPath()
+	assert.NotNil(t, paths)
+	assert.Equal(t, 2, len(paths))
+	assert.Equal(t, "A", paths[0])
+	assert.Equal(t, "姓名", paths[1])
+
+	expr = `A["年份"]["月份"] == "九月"`
+	plan, err = CreateSearchPlan(schema, expr, "FloatVectorField", &planpb.QueryInfo{
+		Topk:         0,
+		MetricType:   "",
+		SearchParams: "",
+		RoundDecimal: 0,
+	}, nil)
+	assert.NoError(t, err, expr)
+	paths = plan.GetVectorAnns().GetPredicates().GetUnaryRangeExpr().GetColumnInfo().GetNestedPath()
+	assert.NotNil(t, paths)
+	assert.Equal(t, 3, len(paths))
+	assert.Equal(t, "A", paths[0])
+	assert.Equal(t, "年份", paths[1])
+	assert.Equal(t, "月份", paths[2])
+}
diff --git a/internal/parser/planparserv2/utils.go b/internal/parser/planparserv2/utils.go
index e6e49b07a6bf1..4cb1e60dbe0df 100644
--- a/internal/parser/planparserv2/utils.go
+++ b/internal/parser/planparserv2/utils.go
@@ -2,8 +2,10 @@ package planparserv2
 
 import (
 	"fmt"
+	"regexp"
 	"strconv"
 	"strings"
+	"unicode"
 
 	"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
 	"github.com/milvus-io/milvus/internal/json"
@@ -730,3 +732,57 @@ func parseJSONValue(value interface{}) (*planpb.GenericValue, schemapb.DataType,
 		return nil, schemapb.DataType_None, fmt.Errorf("%v is of unknown type: %T\n", value, v)
 	}
 }
+
+// convertHanToASCII returns s with every Han rune replaced by the text that
+// formatUnicode produces for its code point. A rune that directly follows a
+// backslash is copied through unchanged, and s is returned as-is when a
+// backslash is followed by a byte that isEscapeCh rejects.
+func convertHanToASCII(s string) string {
+	var builder strings.Builder
+	builder.Grow(len(s) * 6)
+	skipCur := false
+	n := len(s)
+	for i, r := range s {
+		if skipCur {
+			builder.WriteRune(r)
+			skipCur = false
+			continue
+		}
+		if r == '\\' {
+			if i+1 < n && !isEscapeCh(s[i+1]) {
+				return s
+			}
+			skipCur = true
+			builder.WriteRune(r)
+			continue
+		}
+
+		if unicode.Is(unicode.Han, r) {
+			builder.WriteString(formatUnicode(uint32(r)))
+		} else {
+			builder.WriteRune(r)
+		}
+	}
+
+	return builder.String()
+}
+
+// unicodeEscapePattern matches one `\uXXXX` escape (exactly four hex digits).
+// It is compiled once at package scope because decodeUnicode is called for
+// every identifier handled by translateIdentifier and
+// getColumnInfoFromJSONIdentifier.
+var unicodeEscapePattern = regexp.MustCompile(`\\u[0-9a-fA-F]{4}`)
+
+// decodeUnicode returns input with every `\uXXXX` escape replaced by the rune
+// it encodes; all other text is copied through unchanged.
+func decodeUnicode(input string) string {
+	return unicodeEscapePattern.ReplaceAllStringFunc(input, func(match string) string {
+		// The pattern guarantees four hex digits after `\u`, so this parse is not
+		// expected to fail; if it ever does, keep the escape text as written.
+		code, err := strconv.ParseUint(match[2:], 16, 32)
+		if err != nil {
+			return match
+		}
+		return string(rune(code))
+	})
+}
diff --git a/internal/parser/planparserv2/utils_test.go b/internal/parser/planparserv2/utils_test.go
index 4648c4dca0bad..91bfaeba098e2 100644
--- a/internal/parser/planparserv2/utils_test.go
+++ b/internal/parser/planparserv2/utils_test.go
@@ -328,3 +328,10 @@ func Test_getArrayElementType(t *testing.T) {
 		assert.Equal(t, schemapb.DataType_None, getArrayElementType(expr))
 	})
 }
+
+func Test_decodeUnicode(t *testing.T) {
+	s1 := "A[\"\\u5e74\\u4efd\"][\"\\u6708\\u4efd\"]"
+
+	assert.NotEqual(t, `A["年份"]["月份"]`, s1)
+	assert.Equal(t, `A["年份"]["月份"]`, decodeUnicode(s1))
+}