change data format

Signed-off-by: lixinguo <[email protected]>
milvus-io · Jul 17, 2024 · ab9f49e · ab9f49e
1 parent dca95cf
commit ab9f49e
Show file tree

Hide file tree

Showing 19 changed files with 418 additions and 486 deletions.
diff --git a/configs/milvus.yaml b/configs/milvus.yaml
@@ -45,11 +45,11 @@ etcd:
     dir: default.etcd # Embedded Etcd only. please adjust in embedded Milvus: /tmp/milvus/etcdData/
   auth:
     enabled: false # Whether to enable authentication
-    userName:  # username for etcd authentication
-    password:  # password for etcd authentication
+    userName: # username for etcd authentication
+    password: # password for etcd authentication
 
 metastore:
-  type: etcd # Default value: etcd, Valid values: [etcd, tikv] 
+  type: etcd # Default value: etcd, Valid values: [etcd, tikv]
 
 # Related configuration of tikv, used to store Milvus metadata.
 # Notice that when TiKV is enabled for metastore, you still need to have etcd for service discovery.
@@ -63,9 +63,9 @@ tikv:
   snapshotScanSize: 256 # batch size of tikv snapshot scan
   ssl:
     enabled: false # Whether to support TiKV secure connection mode
-    tlsCert:  # path to your cert file
-    tlsKey:  # path to your key file
-    tlsCACert:  # path to your CACert file
+    tlsCert: # path to your cert file
+    tlsKey: # path to your key file
+    tlsCACert: # path to your CACert file
 
 localStorage:
   path: /var/lib/milvus/data/ # please adjust in embedded Milvus: /tmp/milvus/data/
@@ -97,12 +97,12 @@ minio:
   cloudProvider: aws
   # Custom endpoint for fetch IAM role credentials. when useIAM is true & cloudProvider is "aws".
   # Leave it empty if you want to use AWS default endpoint
-  iamEndpoint: 
+  iamEndpoint:
   logLevel: fatal # Log level for aws sdk log. Supported level:  off, fatal, error, warn, info, debug, trace
-  region:  # Specify minio storage system location region
+  region: # Specify minio storage system location region
   useVirtualHost: false # Whether use virtual host mode for bucket
   requestTimeoutMs: 10000 # minio timeout for request time in milliseconds
-  # The maximum number of objects requested per batch in minio ListObjects rpc, 
+  # The maximum number of objects requested per batch in minio ListObjects rpc,
   # 0 means using oss client by default, decrease this configuration if ListObjects times out
   listObjectsMaxKeys: 0
 
@@ -137,11 +137,11 @@ pulsar:
 
 # If you want to enable kafka, you need to comment out the pulsar configs
 # kafka:
-#   brokerList: 
-#   saslUsername: 
-#   saslPassword: 
-#   saslMechanisms: 
-#   securityProtocol: 
+#   brokerList:
+#   saslUsername:
+#   saslPassword:
+#   saslMechanisms:
+#   securityProtocol:
 #   ssl:
 #     enabled: false # whether to enable ssl mode
 #     tlsCert:  # path to client's public key (PEM) used for authentication
@@ -179,8 +179,8 @@ natsmq:
       logSizeLimit: 536870912 # Size in bytes after the log file rolls over to a new one
     retention:
       maxAge: 4320 # Maximum age of any message in the P-channel
-      maxBytes:  # How many bytes the single P-channel may contain. Removing oldest messages if the P-channel exceeds this size
-      maxMsgs:  # How many message the single P-channel may contain. Removing oldest messages if the P-channel exceeds this limit
+      maxBytes: # How many bytes the single P-channel may contain. Removing oldest messages if the P-channel exceeds this size
+      maxMsgs: # How many message the single P-channel may contain. Removing oldest messages if the P-channel exceeds this limit
 
 # Related configuration of rootCoord, used to handle data definition language (DDL) and data control language (DCL) requests
 rootCoord:
@@ -191,7 +191,7 @@ rootCoord:
   maxDatabaseNum: 64 # Maximum number of database
   maxGeneralCapacity: 65536 # upper limit for the sum of the product of partitionNumber and shardNumber
   gracefulStopTimeout: 5 # seconds. force stop node without graceful stop
-  ip:  # if not specified, use the first unicastable address
+  ip: # if not specified, use the first unicastable address
   port: 53100
   grpc:
     serverMaxSendSize: 536870912
@@ -224,7 +224,7 @@ proxy:
     enable: false # if use access log
     minioEnable: false # if upload sealed access log file to minio
     localPath: /tmp/milvus_access
-    filename:  # Log filename, leave empty to use stdout.
+    filename: # Log filename, leave empty to use stdout.
     maxSize: 64 # Max size for a single file, in MB.
     cacheSize: 0 # Size of log write cache, in B
     cacheFlushInterval: 3 # time interval of auto flush write cache, in Seconds. (Close auto flush if interval was 0)
@@ -245,10 +245,10 @@ proxy:
   http:
     enabled: true # Whether to enable the http server
     debug_mode: false # Whether to enable http server debug mode
-    port:  # high-level restful api
+    port: # high-level restful api
     acceptTypeAllowInt64: true # high-level restful api, whether http client can deal with int64
     enablePprof: true # Whether to enable pprof middleware on the metrics port
-  ip:  # if not specified, use the first unicastable address
+  ip: # if not specified, use the first unicastable address
   port: 19530
   internalPort: 19529
   grpc:
@@ -306,7 +306,7 @@ queryCoord:
   enableStoppingBalance: true # whether enable stopping balance
   channelExclusiveNodeFactor: 4 # the least node number for enable channel's exclusive mode
   cleanExcludeSegmentInterval: 60 # the time duration of clean pipeline exclude segment which used for filter invalid data, in seconds
-  ip:  # if not specified, use the first unicastable address
+  ip: # if not specified, use the first unicastable address
   port: 19531
   grpc:
     serverMaxSendSize: 536870912
@@ -335,9 +335,9 @@ queryNode:
     enabled: true
     memoryLimit: 2147483648 # 2 GB, 2 * 1024 *1024 *1024
     readAheadPolicy: willneed # The read ahead policy of chunk cache, options: `normal, random, sequential, willneed, dontneed`
-    # options: async, sync, disable. 
-    # Specifies the necessity for warming up the chunk cache. 
-    # 1. If set to "sync" or "async" the original vector data will be synchronously/asynchronously loaded into the 
+    # options: async, sync, disable.
+    # Specifies the necessity for warming up the chunk cache.
+    # 1. If set to "sync" or "async" the original vector data will be synchronously/asynchronously loaded into the
     # chunk cache during the load process. This approach has the potential to substantially reduce query/search latency
     # for a specific duration post-load, albeit accompanied by a concurrent increase in disk usage;
     # 2. If set to "disable" original vector data will only be loaded into the chunk cache during search/query.
@@ -387,7 +387,7 @@ queryNode:
       maxParallelism: 1024 # Maximum number of tasks executed in parallel in the flowgraph
   enableSegmentPrune: false # use partition prune function on shard delegator
   queryStreamBatchSize: 4194304 # return batch size of stream query
-  ip:  # if not specified, use the first unicastable address
+  ip: # if not specified, use the first unicastable address
   port: 21123
   grpc:
     serverMaxSendSize: 536870912
@@ -409,7 +409,7 @@ indexNode:
     buildParallel: 1
   enableDisk: true # enable index node build disk vector index
   maxDiskUsagePercentage: 95
-  ip:  # if not specified, use the first unicastable address
+  ip: # if not specified, use the first unicastable address
   port: 21121
   grpc:
     serverMaxSendSize: 536870912
@@ -449,7 +449,7 @@ dataCoord:
     compactableProportion: 0.85
     # over (compactableProportion * segment max # of rows) rows.
     # MUST BE GREATER THAN OR EQUAL TO <smallProportion>!!!
-    # During compaction, the size of segment # of rows is able to exceed segment max # of rows by (expansionRate-1) * 100%. 
+    # During compaction, the size of segment # of rows is able to exceed segment max # of rows by (expansionRate-1) * 100%.
     expansionRate: 1.25
     segmentFlushInterval: 2 # the minimal interval duration(unit: Seconds) between flushing operations on the same segment
   autoUpgradeSegmentIndex: false # whether auto upgrade segment index to index engine's version
@@ -475,7 +475,7 @@ dataCoord:
       # clustering compaction will try best to distribute data into segments with size range in [preferSegmentSize, maxSegmentSize].
       # data will be clustered by preferSegmentSize, if a cluster is larger than maxSegmentSize, will split it into multiple segments
       # buffer between (preferSegmentSize, maxSegmentSize) is left for new data in the same cluster(range), to avoid globally redistribute too often
-      preferSegmentSize: 512m       
+      preferSegmentSize: 512m
       maxSegmentSize: 1024m
       maxTrainSizeRatio: 0.8 # max data size ratio in analyze, if data is larger than it, will down sampling to meet this limit
       maxCentroidsNum: 10240
@@ -511,7 +511,7 @@ dataCoord:
     maxImportFileNumPerReq: 1024 # The maximum number of files allowed per single import request.
     waitForIndex: true # Indicates whether the import operation waits for the completion of index building.
   gracefulStopTimeout: 5 # seconds. force stop node without graceful stop
-  ip:  # if not specified, use the first unicastable address
+  ip: # if not specified, use the first unicastable address
   port: 13333
   grpc:
     serverMaxSendSize: 536870912
@@ -559,7 +559,7 @@ dataNode:
     levelZeroBatchMemoryRatio: 0.05 # The minimal memory ratio of free memory for level zero compaction executing in batch mode
     levelZeroMaxBatchSize: -1 # Max batch size refers to the max number of L1/L2 segments in a batch when executing L0 compaction. Default to -1, any value that is less than 1 means no limit. Valid range: >= 1.
   gracefulStopTimeout: 1800 # seconds. force stop node without graceful stop
-  ip:  # if not specified, use the first unicastable address
+  ip: # if not specified, use the first unicastable address
   port: 21124
   grpc:
     serverMaxSendSize: 536870912
@@ -576,7 +576,7 @@ dataNode:
 log:
   level: info # Only supports debug, info, warn, error, panic, or fatal. Default 'info'.
   file:
-    rootPath:  # root dir path to put logs, default "" means no log file will print. please adjust in embedded Milvus: /tmp/milvus/logs
+    rootPath: # root dir path to put logs, default "" means no log file will print. please adjust in embedded Milvus: /tmp/milvus/logs
     maxSize: 300 # MB
     maxAge: 10 # Maximum time for log retention in day.
     maxBackups: 20
@@ -634,7 +634,7 @@ common:
     authorizationEnabled: false
     # The superusers will ignore some system check processes,
     # like the old password verification when updating the credential
-    superUsers: 
+    superUsers:
     tlsMode: 0
   session:
     ttl: 30 # ttl value when session granting a lease to register service
@@ -835,9 +835,9 @@ trace:
   # Fractions >= 1 will always sample. Fractions < 0 are treated as zero.
   sampleFraction: 0
   jaeger:
-    url:  # when exporter is jaeger should set the jaeger's URL
+    url: # when exporter is jaeger should set the jaeger's URL
   otlp:
-    endpoint:  # example: "127.0.0.1:4318"
+    endpoint: # example: "127.0.0.1:4318"
     secure: true
 
 #when using GPU indexing, Milvus will utilize a memory pool to avoid frequent memory allocation and deallocation.
@@ -847,5 +847,5 @@ trace:
 #milvus will automatically initialize half of the available GPU memory,
 #maxMemSize will be the whole available GPU memory.
 gpu:
-  initMemSize:  # Gpu Memory Pool init size
-  maxMemSize:  # Gpu Memory Pool Max size
+  initMemSize: # Gpu Memory Pool init size
+  maxMemSize: # Gpu Memory Pool Max size
diff --git a/internal/core/src/common/FieldData.cpp b/internal/core/src/common/FieldData.cpp
@@ -43,7 +43,7 @@ FieldDataImpl<Type, is_type_entire_row>::FillFieldData(const void* source,
     }
     std::copy_n(static_cast<const Type*>(source),
                 element_count * dim_,
-                field_data_.data() + length_ * dim_);
+                data_.data() + length_ * dim_);
     length_ += element_count;
 }
 
@@ -64,15 +64,15 @@ FieldDataImpl<Type, is_type_entire_row>::FillFieldData(
     }
     std::copy_n(static_cast<const Type*>(field_data),
                 element_count * dim_,
-                field_data_.data() + length_ * dim_);
+                data_.data() + length_ * dim_);
 
     ssize_t byte_count = (element_count + 7) / 8;
     // Note: if 'nullable == true` and valid_data is nullptr
     // means null_count == 0, will fill it with 0xFF
     if (valid_data == nullptr) {
-        std::fill_n(valid_data_.get(), byte_count, 0xFF);
+        valid_data_.resize(byte_count, 0xFF);
     } else {
-        std::copy_n(valid_data, byte_count, valid_data_.get());
+        std::copy_n(valid_data, byte_count, valid_data_.data());
     }
 
     length_ += element_count;

diff --git a/internal/core/src/common/FieldData.h b/internal/core/src/common/FieldData.h
@@ -37,9 +37,11 @@ class FieldData : public FieldDataImpl<Type, true> {
               1, data_type, nullable, buffered_num_rows) {
     }
     static_assert(IsScalar<Type> || std::is_same_v<Type, PkType>);
-    explicit FieldData(DataType data_type, FixedVector<Type>&& inner_data)
+    explicit FieldData(DataType data_type,
+                       bool nullable,
+                       FixedVector<Type>&& inner_data)
         : FieldDataImpl<Type, true>::FieldDataImpl(
-              1, data_type, std::move(inner_data)) {
+              1, data_type, nullable, std::move(inner_data)) {
     }
 };
 
@@ -125,7 +127,7 @@ class FieldData<BFloat16Vector> : public FieldDataImpl<bfloat16, false> {
                        DataType data_type,
                        int64_t buffered_num_rows = 0)
         : FieldDataImpl<bfloat16, false>::FieldDataImpl(
-              dim,data_type, false,buffered_num_rows) {
+              dim, data_type, false, buffered_num_rows) {
     }
 };