Skip to content

Commit

Permalink
fix: Decode unicode for json key in expression (#38651)
Browse files Browse the repository at this point in the history
issue: #38626

Signed-off-by: Cai Zhang <[email protected]>
  • Loading branch information
xiaocai2333 authored Dec 23, 2024
1 parent 9e1bb3b commit 205231b
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 32 deletions.
2 changes: 2 additions & 0 deletions internal/parser/planparserv2/parser_visitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ func (v *ParserVisitor) VisitParens(ctx *parser.ParensContext) interface{} {
}

func (v *ParserVisitor) translateIdentifier(identifier string) (*ExprWithType, error) {
identifier = decodeUnicode(identifier)
field, err := v.schema.GetFieldFromNameDefaultJSON(identifier)
if err != nil {
return nil, err
Expand Down Expand Up @@ -1005,6 +1006,7 @@ func (v *ParserVisitor) VisitBitOr(ctx *parser.BitOrContext) interface{} {
*/
// For more tests, refer to plan_parser_v2_test.go::Test_JSONExpr
func (v *ParserVisitor) getColumnInfoFromJSONIdentifier(identifier string) (*planpb.ColumnInfo, error) {
identifier = decodeUnicode(identifier)
fieldName := strings.Split(identifier, "[")[0]
nestedPath := make([]string, 0)
field, err := v.schema.GetFieldFromNameDefaultJSON(fieldName)
Expand Down
32 changes: 0 additions & 32 deletions internal/parser/planparserv2/plan_parser_v2.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@ package planparserv2

import (
"fmt"
"strings"
"time"
"unicode"

"github.com/antlr4-go/antlr/v4"
"github.com/hashicorp/golang-lru/v2/expirable"
Expand Down Expand Up @@ -153,36 +151,6 @@ func CreateRetrievePlan(schema *typeutil.SchemaHelper, exprStr string, exprTempl
return planNode, nil
}

// convertHanToASCII rewrites s so that every rune in the unicode.Han range is
// replaced by the output of formatUnicode, leaving all other runes untouched.
//
// A backslash and the rune right after it are copied through verbatim. If a
// backslash is followed by a byte that isEscapeCh rejects, the conversion is
// abandoned and the original string is returned unchanged.
func convertHanToASCII(s string) string {
	var out strings.Builder
	// Presumably sized for a six-byte escape per input byte — TODO confirm
	// against formatUnicode's output width.
	out.Grow(6 * len(s))

	escaped := false // true when the previous rune was an unconsumed backslash
	for pos, ch := range s {
		switch {
		case escaped:
			// The rune after a backslash is never converted.
			out.WriteRune(ch)
			escaped = false
		case ch == '\\':
			if next := pos + 1; next < len(s) && !isEscapeCh(s[next]) {
				// Not an escape sequence isEscapeCh accepts: give up and
				// hand the input back as-is.
				return s
			}
			escaped = true
			out.WriteRune(ch)
		case unicode.Is(unicode.Han, ch):
			out.WriteString(formatUnicode(uint32(ch)))
		default:
			out.WriteRune(ch)
		}
	}

	return out.String()
}

func CreateSearchPlan(schema *typeutil.SchemaHelper, exprStr string, vectorFieldName string, queryInfo *planpb.QueryInfo, exprTemplateValues map[string]*schemapb.TemplateValue) (*planpb.PlanNode, error) {
parse := func() (*planpb.Expr, error) {
if len(exprStr) <= 0 {
Expand Down
33 changes: 33 additions & 0 deletions internal/parser/planparserv2/plan_parser_v2_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1473,3 +1473,36 @@ func BenchmarkTemplateWithString(b *testing.B) {
assert.NotNil(b, plan)
}
}

// TestNestedPathWithChinese builds search plans from expressions whose JSON
// keys are Chinese and checks that the nested path recorded in the resulting
// column info contains the keys in their original (non-escaped) form.
func TestNestedPathWithChinese(t *testing.T) {
	schema := newTestSchemaHelper(t)

	cases := []struct {
		expr string
		want []string
	}{
		{expr: `A["姓名"] == "小明"`, want: []string{"A", "姓名"}},
		{expr: `A["年份"]["月份"] == "九月"`, want: []string{"A", "年份", "月份"}},
	}

	for _, tc := range cases {
		plan, err := CreateSearchPlan(schema, tc.expr, "FloatVectorField", &planpb.QueryInfo{
			Topk:         0,
			MetricType:   "",
			SearchParams: "",
			RoundDecimal: 0,
		}, nil)
		if !assert.NoError(t, err, tc.expr) {
			// No usable plan for this expression; move on to the next case.
			continue
		}

		paths := plan.GetVectorAnns().GetPredicates().GetUnaryRangeExpr().GetColumnInfo().GetNestedPath()
		// Compare the whole slice at once: a wrong length or a nil path is
		// reported as an assertion failure rather than an index-out-of-range
		// panic from element-by-element access.
		assert.Equal(t, tc.want, paths, tc.expr)
	}
}
40 changes: 40 additions & 0 deletions internal/parser/planparserv2/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@ package planparserv2

import (
"fmt"
"regexp"
"strconv"
"strings"
"unicode"

"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/internal/json"
Expand Down Expand Up @@ -730,3 +732,41 @@ func parseJSONValue(value interface{}) (*planpb.GenericValue, schemapb.DataType,
return nil, schemapb.DataType_None, fmt.Errorf("%v is of unknown type: %T\n", value, v)
}
}

func convertHanToASCII(s string) string {
var builder strings.Builder
builder.Grow(len(s) * 6)
skipCur := false
n := len(s)
for i, r := range s {
if skipCur {
builder.WriteRune(r)
skipCur = false
continue
}
if r == '\\' {
if i+1 < n && !isEscapeCh(s[i+1]) {
return s
}
skipCur = true
builder.WriteRune(r)
continue
}

if unicode.Is(unicode.Han, r) {
builder.WriteString(formatUnicode(uint32(r)))
} else {
builder.WriteRune(r)
}
}

return builder.String()
}

// unicodeEscapePattern matches a backslash, the letter 'u', and exactly four
// hexadecimal digits. It is compiled once at package scope so decodeUnicode
// does not pay for a regexp compilation on every call.
var unicodeEscapePattern = regexp.MustCompile(`\\u[0-9a-fA-F]{4}`)

// decodeUnicode replaces every `\uXXXX` sequence (four hex digits, either
// case) in input with the character for that code point and returns the
// result. Text that does not match the pattern is left unchanged.
//
// NOTE(review): each escape is converted on its own, so a UTF-16 surrogate
// pair written as two escapes is not combined into a single character.
func decodeUnicode(input string) string {
	return unicodeEscapePattern.ReplaceAllStringFunc(input, func(match string) string {
		// match[2:] skips the leading `\u`. The pattern guarantees exactly
		// four hex digits, so parsing is not expected to fail; if it ever
		// does, keep the original text instead of emitting a bogus rune.
		code, err := strconv.ParseUint(match[2:], 16, 16)
		if err != nil {
			return match
		}
		return string(rune(code))
	})
}
7 changes: 7 additions & 0 deletions internal/parser/planparserv2/utils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -328,3 +328,10 @@ func Test_getArrayElementType(t *testing.T) {
assert.Equal(t, schemapb.DataType_None, getArrayElementType(expr))
})
}

// Test_decodeUnicode checks that decodeUnicode turns backslash-u escapes
// inside a JSON path expression back into the characters they encode.
func Test_decodeUnicode(t *testing.T) {
	const want = `A["年份"]["月份"]`
	escaped := "A[\"\\u5e74\\u4efd\"][\"\\u6708\\u4efd\"]"

	// Sanity check: the input really is in escaped form before decoding.
	assert.NotEqual(t, want, escaped)
	assert.Equal(t, want, decodeUnicode(escaped))
}

0 comments on commit 205231b

Please sign in to comment.