Skip to content

Commit

Permalink
enhance: Add max length check for csv import (milvus-io#37077)
Browse files Browse the repository at this point in the history
1. Add max length check for csv import.
2. Tidy import options.
3. Tidy common import util functions.

issue: milvus-io#34150

---------

Signed-off-by: bigsheeper <[email protected]>
  • Loading branch information
bigsheeper authored Oct 25, 2024
1 parent 088d5d7 commit b45cf2d
Show file tree
Hide file tree
Showing 16 changed files with 277 additions and 78 deletions.
17 changes: 5 additions & 12 deletions internal/datacoord/services.go
Original file line number Diff line number Diff line change
Expand Up @@ -1681,19 +1681,12 @@ func (s *Server) ImportV2(ctx context.Context, in *internalpb.ImportRequestInter
zap.Int64("collection", in.GetCollectionID()),
zap.Int64s("partitions", in.GetPartitionIDs()),
zap.Strings("channels", in.GetChannelNames()))
log.Info("receive import request", zap.Any("files", in.GetFiles()))
log.Info("receive import request", zap.Any("files", in.GetFiles()), zap.Any("options", in.GetOptions()))

var timeoutTs uint64 = math.MaxUint64
timeoutStr, err := funcutil.GetAttrByKeyFromRepeatedKV("timeout", in.GetOptions())
if err == nil {
// Specifies the timeout duration for import, such as "300s", "1.5h" or "1h45m".
dur, err := time.ParseDuration(timeoutStr)
if err != nil {
resp.Status = merr.Status(merr.WrapErrImportFailed(fmt.Sprint("parse import timeout failed, err=%w", err)))
return resp, nil
}
curTs := tsoutil.GetCurrentTime()
timeoutTs = tsoutil.AddPhysicalDurationOnTs(curTs, dur)
timeoutTs, err := importutilv2.GetTimeoutTs(in.GetOptions())
if err != nil {
resp.Status = merr.Status(merr.WrapErrImportFailed(err.Error()))
return resp, nil
}

files := in.GetFiles()
Expand Down
1 change: 1 addition & 0 deletions internal/proxy/impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -6247,6 +6247,7 @@ func (node *Proxy) ImportV2(ctx context.Context, req *internalpb.ImportRequest)
zap.String("partition name", req.GetPartitionName()),
zap.Any("files", req.GetFiles()),
zap.String("role", typeutil.ProxyRole),
zap.Any("options", req.GetOptions()),
)

resp := &internalpb.ImportResponse{
Expand Down
11 changes: 11 additions & 0 deletions internal/util/importutilv2/common/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,14 @@ func CheckArrayCapacity(arrLength int, maxCapacity int64) error {
}
return nil
}

// EstimateReadCountPerBatch returns the number of rows to read per batch for a
// buffer of bufferSize bytes, based on the per-row size reported by
// typeutil.EstimateMaxSizePerRecord for schema.
//
// The result is capped at 1000 rows and is never smaller than 1, so callers
// never receive a zero or negative batch size even when a single row is
// estimated to be larger than the whole buffer. If the size estimation fails,
// its error is returned unchanged together with a zero count.
func EstimateReadCountPerBatch(bufferSize int, schema *schemapb.CollectionSchema) (int64, error) {
	sizePerRecord, err := typeutil.EstimateMaxSizePerRecord(schema)
	if err != nil {
		return 0, err
	}
	// Do the arithmetic in int64 so that 1000*recordSize cannot wrap around
	// when int is 32 bits wide.
	recordSize := int64(sizePerRecord)
	bufferBytes := int64(bufferSize)
	// A non-positive estimate would make the division below panic (or go
	// negative); any buffer can hold the full cap of such rows.
	if recordSize <= 0 || 1000*recordSize <= bufferBytes {
		return 1000, nil
	}
	count := bufferBytes / recordSize
	if count < 1 {
		// One row does not fit in the buffer; still read one row at a time
		// instead of reporting an unusable batch size of zero.
		count = 1
	}
	return count, nil
}
68 changes: 68 additions & 0 deletions internal/util/importutilv2/common/util_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package common

import (
"testing"

"github.com/stretchr/testify/assert"

"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/pkg/common"
)

// TestUtil_EstimateReadCountPerBatch checks the estimated batch row count for a
// schema with an int64 primary key and a 128-dim float vector, then checks
// that an error is reported once a vector field with an unparsable dim is
// appended to the schema.
func TestUtil_EstimateReadCountPerBatch(t *testing.T) {
	const bufferSize = 16 * 1024 * 1024

	// floatVector builds a FloatVector field whose dim type param is dim.
	floatVector := func(id int64, name string, dim string) *schemapb.FieldSchema {
		return &schemapb.FieldSchema{
			FieldID:  id,
			Name:     name,
			DataType: schemapb.DataType_FloatVector,
			TypeParams: []*commonpb.KeyValuePair{
				{Key: common.DimKey, Value: dim},
			},
		}
	}

	primaryKey := &schemapb.FieldSchema{
		FieldID:      100,
		Name:         "pk",
		IsPrimaryKey: true,
		DataType:     schemapb.DataType_Int64,
	}
	schema := &schemapb.CollectionSchema{
		Fields: []*schemapb.FieldSchema{primaryKey, floatVector(101, "vec", "128")},
	}

	// Valid schema: the estimate succeeds and yields 1000 rows per batch.
	got, err := EstimateReadCountPerBatch(bufferSize, schema)
	assert.NoError(t, err)
	assert.Equal(t, int64(1000), got)

	// Invalid dim: the estimate must surface an error.
	schema.Fields = append(schema.Fields, floatVector(102, "vec2", "invalidDim"))
	_, err = EstimateReadCountPerBatch(bufferSize, schema)
	assert.Error(t, err)
}
39 changes: 21 additions & 18 deletions internal/util/importutilv2/csv/reader.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package csv

import (
Expand All @@ -11,6 +27,7 @@ import (

"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/internal/storage"
"github.com/milvus-io/milvus/internal/util/importutilv2/common"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/util/merr"
)
Expand All @@ -36,13 +53,10 @@ func NewReader(ctx context.Context, cm storage.ChunkManager, schema *schemapb.Co
if err != nil {
return nil, merr.WrapErrImportFailed(fmt.Sprintf("read csv file failed, path=%s, err=%s", path, err.Error()))
}
// count, err := estimateReadCountPerBatch(bufferSize, schema)
// if err != nil {
// return nil, err
// }

// set the interval for determining if the buffer is exceeded
var count int64 = 1000
count, err := common.EstimateReadCountPerBatch(bufferSize, schema)
if err != nil {
return nil, err
}

csvReader := csv.NewReader(cmReader)
csvReader.Comma = sep
Expand Down Expand Up @@ -119,14 +133,3 @@ func (r *reader) Size() (int64, error) {
r.fileSize.Store(size)
return size, nil
}

// func estimateReadCountPerBatch(bufferSize int, schema *schemapb.CollectionSchema) (int64, error) {
// sizePerRecord, err := typeutil.EstimateMaxSizePerRecord(schema)
// if err != nil {
// return 0, err
// }
// if 1000*sizePerRecord <= bufferSize {
// return 1000, nil
// }
// return int64(bufferSize) / int64(sizePerRecord), nil
// }
20 changes: 20 additions & 0 deletions internal/util/importutilv2/csv/reader_test.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package csv

import (
Expand Down Expand Up @@ -72,6 +88,10 @@ func (suite *ReaderSuite) run(dataType schemapb.DataType, elemType schemapb.Data
Key: common.MaxLengthKey,
Value: "128",
},
{
Key: common.MaxCapacityKey,
Value: "256",
},
},
Nullable: nullable,
},
Expand Down
32 changes: 32 additions & 0 deletions internal/util/importutilv2/csv/row_parser.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package csv

import (
Expand All @@ -9,7 +25,9 @@ import (
"github.com/samber/lo"

"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/internal/util/importutilv2/common"
"github.com/milvus-io/milvus/pkg/util/merr"
"github.com/milvus-io/milvus/pkg/util/parameterutil"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)

Expand Down Expand Up @@ -224,6 +242,13 @@ func (r *rowParser) parseEntity(field *schemapb.FieldSchema, obj string) (any, e
if nullable && obj == r.nullkey {
return nil, nil
}
maxLength, err := parameterutil.GetMaxLength(field)
if err != nil {
return nil, err
}
if err = common.CheckVarcharLength(obj, maxLength); err != nil {
return nil, err
}
return obj, nil
case schemapb.DataType_BinaryVector:
if nullable && obj == r.nullkey {
Expand Down Expand Up @@ -323,6 +348,13 @@ func (r *rowParser) parseEntity(field *schemapb.FieldSchema, obj string) (any, e
if err != nil {
return nil, r.wrapTypeError(obj, field)
}
maxCapacity, err := parameterutil.GetMaxCapacity(field)
if err != nil {
return nil, err
}
if err = common.CheckArrayCapacity(len(vec), maxCapacity); err != nil {
return nil, err
}
// elements in array not support null value
scalarFieldData, err := r.arrayToFieldData(vec, field.GetElementType())
if err != nil {
Expand Down
16 changes: 16 additions & 0 deletions internal/util/importutilv2/csv/row_parser_test.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package csv

import (
Expand Down
15 changes: 2 additions & 13 deletions internal/util/importutilv2/json/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ import (

"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/internal/storage"
"github.com/milvus-io/milvus/internal/util/importutilv2/common"
"github.com/milvus-io/milvus/pkg/util/merr"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)

const (
Expand Down Expand Up @@ -58,7 +58,7 @@ func NewReader(ctx context.Context, cm storage.ChunkManager, schema *schemapb.Co
if err != nil {
return nil, merr.WrapErrImportFailed(fmt.Sprintf("read json file failed, path=%s, err=%s", path, err.Error()))
}
count, err := estimateReadCountPerBatch(bufferSize, schema)
count, err := common.EstimateReadCountPerBatch(bufferSize, schema)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -181,14 +181,3 @@ func (j *reader) Size() (int64, error) {
}

// Close is a no-op: the method body is empty, so calling it does nothing.
func (j *reader) Close() {}

// estimateReadCountPerBatch returns the number of rows to read per batch for a
// buffer of bufferSize bytes, based on the per-row size reported by
// typeutil.EstimateMaxSizePerRecord for schema.
//
// The result is capped at 1000 rows and is never smaller than 1, so the caller
// never gets a zero or negative batch size even when a single row is estimated
// to be larger than the whole buffer. If the size estimation fails, its error
// is returned unchanged together with a zero count.
func estimateReadCountPerBatch(bufferSize int, schema *schemapb.CollectionSchema) (int64, error) {
	sizePerRecord, err := typeutil.EstimateMaxSizePerRecord(schema)
	if err != nil {
		return 0, err
	}
	// Do the arithmetic in int64 so that 1000*recordSize cannot wrap around
	// when int is 32 bits wide.
	recordSize := int64(sizePerRecord)
	bufferBytes := int64(bufferSize)
	// A non-positive estimate would make the division below panic (or go
	// negative); any buffer can hold the full cap of such rows.
	if recordSize <= 0 || 1000*recordSize <= bufferBytes {
		return 1000, nil
	}
	count := bufferBytes / recordSize
	if count < 1 {
		// One row does not fit in the buffer; fall back to one row per batch
		// rather than an unusable batch size of zero.
		count = 1
	}
	return count, nil
}
7 changes: 3 additions & 4 deletions internal/util/importutilv2/json/row_parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -399,17 +399,16 @@ func (r *rowParser) parseEntity(fieldID int64, obj any) (any, error) {
}
case schemapb.DataType_Array:
arr, ok := obj.([]interface{})

if !ok {
return nil, r.wrapTypeError(obj, fieldID)
}
maxCapacity, err := parameterutil.GetMaxCapacity(r.id2Field[fieldID])
if err != nil {
return nil, err
}
if err = common.CheckArrayCapacity(len(arr), maxCapacity); err != nil {
return nil, err
}
if !ok {
return nil, r.wrapTypeError(obj, fieldID)
}
scalarFieldData, err := r.arrayToFieldData(arr, r.id2Field[fieldID].GetElementType())
if err != nil {
return nil, err
Expand Down
2 changes: 1 addition & 1 deletion internal/util/importutilv2/numpy/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ func NewReader(ctx context.Context, cm storage.ChunkManager, schema *schemapb.Co
fields := lo.KeyBy(schema.GetFields(), func(field *schemapb.FieldSchema) int64 {
return field.GetFieldID()
})
count, err := calcRowCount(bufferSize, schema)
count, err := common.EstimateReadCountPerBatch(bufferSize, schema)
if err != nil {
return nil, err
}
Expand Down
10 changes: 0 additions & 10 deletions internal/util/importutilv2/numpy/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ import (

"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/pkg/util/merr"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)

var (
Expand Down Expand Up @@ -241,12 +240,3 @@ func validateHeader(npyReader *npy.Reader, field *schemapb.FieldSchema, dim int)
}
return nil
}

// calcRowCount returns how many rows of schema are estimated to fit into a
// buffer of bufferSize bytes, using the per-row size reported by
// typeutil.EstimateMaxSizePerRecord.
//
// The result is never smaller than 1, so the caller never gets a zero or
// negative row count when a single row is estimated to be larger than the
// buffer. If the size estimation fails, its error is returned unchanged
// together with a zero count.
func calcRowCount(bufferSize int, schema *schemapb.CollectionSchema) (int64, error) {
	sizePerRecord, err := typeutil.EstimateMaxSizePerRecord(schema)
	if err != nil {
		return 0, err
	}
	recordSize := int64(sizePerRecord)
	if recordSize < 1 {
		// Treat a non-positive estimate as one byte so the division below is
		// defined instead of panicking with an integer divide by zero.
		recordSize = 1
	}
	rowCount := int64(bufferSize) / recordSize
	if rowCount < 1 {
		// One row does not fit in the buffer; still process one row at a time.
		rowCount = 1
	}
	return rowCount, nil
}
Loading

0 comments on commit b45cf2d

Please sign in to comment.