Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Wrap init segcore tracing with golang timeout #33494

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions internal/core/src/common/Tracer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,13 @@
opts.transport_format = jaeger::TransportFormat::kThriftHttp;
opts.endpoint = cfg.jaegerURL;
exporter = jaeger::JaegerExporterFactory::Create(opts);
LOG_INFO("init jaeger exporter, endpoint:", opts.endpoint);
LOG_INFO("init jaeger exporter, endpoint: {}", opts.endpoint);
} else if (cfg.exporter == "otlp") {
auto opts = otlp::OtlpGrpcExporterOptions{};
opts.endpoint = cfg.otlpEndpoint;
opts.use_ssl_credentials = cfg.oltpSecure;
exporter = otlp::OtlpGrpcExporterFactory::Create(opts);
LOG_INFO("init otlp exporter, endpoint:", opts.endpoint);
LOG_INFO("init otlp exporter, endpoint: {}", opts.endpoint);

Check warning on line 64 in internal/core/src/common/Tracer.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/common/Tracer.cpp#L64

Added line #L64 was not covered by tests
} else {
LOG_INFO("Empty Trace");
enable_trace = false;
Expand Down
35 changes: 33 additions & 2 deletions internal/util/initcore/init_core.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

import (
"fmt"
"time"
"unsafe"

"github.com/cockroachdb/errors"
Expand Down Expand Up @@ -61,7 +62,13 @@
otlpEndpoint: endpoint,
nodeID: nodeID,
}
C.InitTrace(&config)
// oltp grpc may hangs forever, add timeout logic at go side
timeout := params.TraceCfg.InitTimeoutSeconds.GetAsDuration(time.Second)
callWithTimeout(func() {
C.InitTrace(&config)
}, func() {
panic("init segcore tracing timeout, See issue #33483")

Check warning on line 70 in internal/util/initcore/init_core.go

View check run for this annotation

Codecov / codecov/patch

internal/util/initcore/init_core.go#L70

Added line #L70 was not covered by tests
}, timeout)
}

func ResetTraceConfig(params *paramtable.ComponentParam) {
Expand All @@ -81,7 +88,31 @@
otlpEndpoint: endpoint,
nodeID: nodeID,
}
C.SetTrace(&config)

// oltp grpc may hangs forever, add timeout logic at go side
timeout := params.TraceCfg.InitTimeoutSeconds.GetAsDuration(time.Second)
callWithTimeout(func() {
C.SetTrace(&config)
}, func() {
panic("set segcore tracing timeout, See issue #33483")
}, timeout)
}

func callWithTimeout(fn func(), timeoutHandler func(), timeout time.Duration) {
if timeout > 0 {
ch := make(chan struct{})
go func() {
defer close(ch)
fn()
}()
select {
case <-ch:
case <-time.After(timeout):
timeoutHandler()
}
} else {
fn()

Check warning on line 114 in internal/util/initcore/init_core.go

View check run for this annotation

Codecov / codecov/patch

internal/util/initcore/init_core.go#L113-L114

Added lines #L113 - L114 were not covered by tests
}
}

func InitRemoteChunkManager(params *paramtable.ComponentParam) error {
Expand Down
16 changes: 16 additions & 0 deletions internal/util/initcore/init_core_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ package initcore
import (
"testing"

"github.com/stretchr/testify/assert"

"github.com/milvus-io/milvus/pkg/util/paramtable"
)

Expand All @@ -29,3 +31,17 @@ func TestTracer(t *testing.T) {
paramtable.Get().Save(paramtable.Get().TraceCfg.Exporter.Key, "stdout")
ResetTraceConfig(paramtable.Get())
}

func TestOtlpHang(t *testing.T) {
paramtable.Init()
InitTraceConfig(paramtable.Get())

paramtable.Get().Save(paramtable.Get().TraceCfg.Exporter.Key, "otlp")
paramtable.Get().Save(paramtable.Get().TraceCfg.InitTimeoutSeconds.Key, "1")
defer paramtable.Get().Reset(paramtable.Get().TraceCfg.Exporter.Key)
defer paramtable.Get().Reset(paramtable.Get().TraceCfg.InitTimeoutSeconds.Key)

assert.Panics(t, func() {
ResetTraceConfig(paramtable.Get())
})
}
20 changes: 15 additions & 5 deletions pkg/util/paramtable/component_param.go
Original file line number Diff line number Diff line change
Expand Up @@ -767,11 +767,12 @@ func (t *gpuConfig) init(base *BaseTable) {
}

type traceConfig struct {
Exporter ParamItem `refreshable:"false"`
SampleFraction ParamItem `refreshable:"false"`
JaegerURL ParamItem `refreshable:"false"`
OtlpEndpoint ParamItem `refreshable:"false"`
OtlpSecure ParamItem `refreshable:"false"`
Exporter ParamItem `refreshable:"false"`
SampleFraction ParamItem `refreshable:"false"`
JaegerURL ParamItem `refreshable:"false"`
OtlpEndpoint ParamItem `refreshable:"false"`
OtlpSecure ParamItem `refreshable:"false"`
InitTimeoutSeconds ParamItem `refreshable:"false"`
}

func (t *traceConfig) init(base *BaseTable) {
Expand Down Expand Up @@ -819,6 +820,15 @@ Fractions >= 1 will always sample. Fractions < 0 are treated as zero.`,
Export: true,
}
t.OtlpSecure.Init(base.mgr)

t.InitTimeoutSeconds = ParamItem{
Key: "trace.initTimeoutSeconds",
Version: "2.4.4",
DefaultValue: "10",
Export: true,
Doc: "segcore initialization timeout in seconds, preventing otlp grpc hangs forever",
}
t.InitTimeoutSeconds.Init(base.mgr)
}

type logConfig struct {
Expand Down
Loading