Skip to content

Commit

Permalink
feat: support json index
Browse files Browse the repository at this point in the history
This PR adds JSON index support for JSON and dynamic fields. For now, only unary queries such as 'a["b"] > 1' can use this index; support for more filter types will be added later.

basic usage:
```
collection.create_index("json_field", {"index_type": "INVERTED",
    "params": {"json_cast_type": DataType.STRING, "json_path":
'json_field["a"]["b"]'}})
```

There are some limitations when using this index:
1. If a record does not have the json path you specify, it will be ignored and there will not be an error.
2. If the value at the json path cannot be cast to the type you specify, it will be ignored and there will not be an error.
3. A specific json path can have only one json index.
4. If you try to create more than one JSON index on a single JSON field, the SDK (pymilvus<=2.4.7) may return immediately because of its internal implementation. This will be fixed in a later version.

Signed-off-by: sunby <[email protected]>
  • Loading branch information
sunby committed Dec 25, 2024
1 parent 636e107 commit 8f79960
Show file tree
Hide file tree
Showing 49 changed files with 1,318 additions and 293 deletions.
3 changes: 3 additions & 0 deletions internal/core/src/common/Consts.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,6 @@ const int64_t DEFAULT_BITMAP_INDEX_BUILD_MODE_BOUND = 500;
const int64_t DEFAULT_HYBRID_INDEX_BITMAP_CARDINALITY_LIMIT = 100;

const size_t MARISA_NULL_KEY_ID = -1;

const std::string JSON_CAST_TYPE = "json_cast_type";
const std::string JSON_PATH = "json_path";
14 changes: 14 additions & 0 deletions internal/core/src/common/FieldDataInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -641,6 +641,20 @@ class FieldDataJsonImpl : public FieldDataImpl<Json, true> {
}
length_ += n;
}

// only for test
void
add_json_data(const std::vector<Json>& json) {
std::lock_guard lck(tell_mutex_);
if (length_ + json.size() > get_num_rows()) {
resize_field_data(length_ + json.size());
}

for (size_t i = 0; i < json.size(); ++i) {
data_[length_ + i] = json[i];
}
length_ += json.size();
}
};

class FieldDataSparseVectorImpl
Expand Down
5 changes: 5 additions & 0 deletions internal/core/src/common/FieldMeta.h
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,11 @@ class FieldMeta {
return IsVectorDataType(type_);
}

// Returns true when this field's data type is DataType::JSON.
bool
is_json() const {
    const bool json_typed = (type_ == DataType::JSON);
    return json_typed;
}

bool
is_string() const {
return IsStringDataType(type_);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,7 @@ class PhyBinaryArithOpEvalRangeExpr : public SegmentExpr {
name,
segment,
expr->column_.field_id_,
expr->column_.nested_path_,
active_count,
batch_size),
expr_(expr) {
Expand Down
1 change: 1 addition & 0 deletions internal/core/src/exec/expression/BinaryRangeExpr.h
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,7 @@ class PhyBinaryRangeFilterExpr : public SegmentExpr {
name,
segment,
expr->column_.field_id_,
expr->column_.nested_path_,
active_count,
batch_size),
expr_(expr) {
Expand Down
1 change: 1 addition & 0 deletions internal/core/src/exec/expression/ExistsExpr.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ class PhyExistsFilterExpr : public SegmentExpr {
name,
segment,
expr->column_.field_id_,
expr->column_.nested_path_,
active_count,
batch_size),
expr_(expr) {
Expand Down
43 changes: 36 additions & 7 deletions internal/core/src/exec/expression/Expr.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,15 @@
#include <memory>
#include <string>

#include "common/FieldDataInterface.h"
#include "common/Json.h"
#include "common/Types.h"
#include "exec/expression/EvalCtx.h"
#include "exec/expression/VectorFunction.h"
#include "exec/expression/Utils.h"
#include "exec/QueryContext.h"
#include "expr/ITypeExpr.h"
#include "log/Log.h"
#include "query/PlanProto.h"

namespace milvus {
Expand Down Expand Up @@ -109,12 +112,15 @@ class SegmentExpr : public Expr {
SegmentExpr(const std::vector<ExprPtr>&& input,
const std::string& name,
const segcore::SegmentInternalInterface* segment,
const FieldId& field_id,
const FieldId field_id,
const std::vector<std::string> nested_path,
int64_t active_count,
int64_t batch_size)
: Expr(DataType::BOOL, std::move(input), name),
segment_(segment),
field_id_(field_id),
nested_path_(nested_path),

active_count_(active_count),
batch_size_(batch_size) {
size_per_chunk_ = segment_->size_per_chunk();
Expand All @@ -129,6 +135,7 @@ class SegmentExpr : public Expr {
InitSegmentExpr() {
auto& schema = segment_->get_schema();
auto& field_meta = schema[field_id_];
field_type_ = field_meta.get_data_type();

if (schema.get_primary_field_id().has_value() &&
schema.get_primary_field_id().value() == field_id_ &&
Expand All @@ -137,9 +144,16 @@ class SegmentExpr : public Expr {
pk_type_ = field_meta.get_data_type();
}

is_index_mode_ = segment_->HasIndex(field_id_);
if (is_index_mode_) {
num_index_chunk_ = segment_->num_chunk_index(field_id_);
if (field_meta.get_data_type() == DataType::JSON) {
auto pointer = milvus::Json::pointer(nested_path_);
if (is_index_mode_ = segment_->HasIndex(field_id_, pointer)) {
num_index_chunk_ = 1;
}
} else {
is_index_mode_ = segment_->HasIndex(field_id_);
if (is_index_mode_) {
num_index_chunk_ = segment_->num_chunk_index(field_id_);
}
}
// if index not include raw data, also need load data
if (segment_->HasFieldData(field_id_)) {
Expand Down Expand Up @@ -767,9 +781,21 @@ class SegmentExpr : public Expr {
// It avoids indexing execute for every batch because indexing
// executing costs quite much time.
if (cached_index_chunk_id_ != i) {
const Index& index =
segment_->chunk_scalar_index<IndexInnerType>(field_id_, i);
auto* index_ptr = const_cast<Index*>(&index);
Index* index_ptr = nullptr;

if (field_type_ == DataType::JSON) {
auto pointer = milvus::Json::pointer(nested_path_);

const Index& index =
segment_->chunk_scalar_index<IndexInnerType>(
field_id_, pointer, i);
index_ptr = const_cast<Index*>(&index);
} else {
const Index& index =
segment_->chunk_scalar_index<IndexInnerType>(field_id_,
i);
index_ptr = const_cast<Index*>(&index);
}
cached_index_chunk_res_ = std::move(func(index_ptr, values...));
auto valid_result = index_ptr->IsNotNull();
cached_index_chunk_valid_res_ = std::move(valid_result);
Expand Down Expand Up @@ -1067,6 +1093,9 @@ class SegmentExpr : public Expr {
DataType pk_type_;
int64_t batch_size_;

std::vector<std::string> nested_path_;
DataType field_type_;

bool is_index_mode_{false};
bool is_data_mode_{false};
// sometimes need to skip index and using raw data
Expand Down
1 change: 1 addition & 0 deletions internal/core/src/exec/expression/JsonContainsExpr.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ class PhyJsonContainsFilterExpr : public SegmentExpr {
name,
segment,
expr->column_.field_id_,
expr->column_.nested_path_,
active_count,
batch_size),
expr_(expr) {
Expand Down
1 change: 1 addition & 0 deletions internal/core/src/exec/expression/TermExpr.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ class PhyTermFilterExpr : public SegmentExpr {
name,
segment,
expr->column_.field_id_,
expr->column_.nested_path_,
active_count,
batch_size),
expr_(expr),
Expand Down
Loading

0 comments on commit 8f79960

Please sign in to comment.