From 07a05db51b03144452b1443383242528a0c48c5d Mon Sep 17 00:00:00 2001 From: congqixia Date: Wed, 26 Jun 2024 19:44:05 +0800 Subject: [PATCH] fix: [2.4] Wrap init segcore tracing with golang timeout (#33494) (#34191) Cherry-pick from master pr: #33494 See also #33483 Wrap `C.InitTrace` & `C.SetTrace` with a timeout, preventing otlp initialization from hanging forever when the endpoint is not set correctly Signed-off-by: Congqi Xia --- internal/core/src/common/Tracer.cpp | 4 +-- internal/util/initcore/init_core.go | 35 ++++++++++++++++++++++-- internal/util/initcore/init_core_test.go | 16 +++++++++++ pkg/util/paramtable/component_param.go | 20 ++++++++++---- 4 files changed, 66 insertions(+), 9 deletions(-) diff --git a/internal/core/src/common/Tracer.cpp b/internal/core/src/common/Tracer.cpp index 4711ef76ae3ef..d80dd301215e9 100644 --- a/internal/core/src/common/Tracer.cpp +++ b/internal/core/src/common/Tracer.cpp @@ -55,13 +55,13 @@ initTelemetry(const TraceConfig& cfg) { opts.transport_format = jaeger::TransportFormat::kThriftHttp; opts.endpoint = cfg.jaegerURL; exporter = jaeger::JaegerExporterFactory::Create(opts); - LOG_INFO("init jaeger exporter, endpoint:", opts.endpoint); + LOG_INFO("init jaeger exporter, endpoint: {}", opts.endpoint); } else if (cfg.exporter == "otlp") { auto opts = otlp::OtlpGrpcExporterOptions{}; opts.endpoint = cfg.otlpEndpoint; opts.use_ssl_credentials = cfg.oltpSecure; exporter = otlp::OtlpGrpcExporterFactory::Create(opts); - LOG_INFO("init otlp exporter, endpoint:", opts.endpoint); + LOG_INFO("init otlp exporter, endpoint: {}", opts.endpoint); } else { LOG_INFO("Empty Trace"); enable_trace = false; diff --git a/internal/util/initcore/init_core.go b/internal/util/initcore/init_core.go index 1f3ccaf533dba..356a81d4bf79a 100644 --- a/internal/util/initcore/init_core.go +++ b/internal/util/initcore/init_core.go @@ -30,6 +30,7 @@ import "C" import ( "fmt" "path" + "time" "unsafe" "github.com/cockroachdb/errors" @@ -62,7 +63,13 @@ func
InitTraceConfig(params *paramtable.ComponentParam) { otlpEndpoint: endpoint, nodeID: nodeID, } - C.InitTrace(&config) + // oltp grpc may hangs forever, add timeout logic at go side + timeout := params.TraceCfg.InitTimeoutSeconds.GetAsDuration(time.Second) + callWithTimeout(func() { + C.InitTrace(&config) + }, func() { + panic("init segcore tracing timeout, See issue #33483") + }, timeout) } func ResetTraceConfig(params *paramtable.ComponentParam) { @@ -82,7 +89,31 @@ func ResetTraceConfig(params *paramtable.ComponentParam) { otlpEndpoint: endpoint, nodeID: nodeID, } - C.SetTrace(&config) + + // oltp grpc may hangs forever, add timeout logic at go side + timeout := params.TraceCfg.InitTimeoutSeconds.GetAsDuration(time.Second) + callWithTimeout(func() { + C.SetTrace(&config) + }, func() { + panic("set segcore tracing timeout, See issue #33483") + }, timeout) +} + +func callWithTimeout(fn func(), timeoutHandler func(), timeout time.Duration) { + if timeout > 0 { + ch := make(chan struct{}) + go func() { + defer close(ch) + fn() + }() + select { + case <-ch: + case <-time.After(timeout): + timeoutHandler() + } + } else { + fn() + } } func InitRemoteChunkManager(params *paramtable.ComponentParam) error { diff --git a/internal/util/initcore/init_core_test.go b/internal/util/initcore/init_core_test.go index fadc061042a8e..15d1b089a8989 100644 --- a/internal/util/initcore/init_core_test.go +++ b/internal/util/initcore/init_core_test.go @@ -19,6 +19,8 @@ package initcore import ( "testing" + "github.com/stretchr/testify/assert" + "github.com/milvus-io/milvus/pkg/util/paramtable" ) @@ -29,3 +31,17 @@ func TestTracer(t *testing.T) { paramtable.Get().Save(paramtable.Get().TraceCfg.Exporter.Key, "stdout") ResetTraceConfig(paramtable.Get()) } + +func TestOtlpHang(t *testing.T) { + paramtable.Init() + InitTraceConfig(paramtable.Get()) + + paramtable.Get().Save(paramtable.Get().TraceCfg.Exporter.Key, "otlp") + paramtable.Get().Save(paramtable.Get().TraceCfg.InitTimeoutSeconds.Key, 
"1") + defer paramtable.Get().Reset(paramtable.Get().TraceCfg.Exporter.Key) + defer paramtable.Get().Reset(paramtable.Get().TraceCfg.InitTimeoutSeconds.Key) + + assert.Panics(t, func() { + ResetTraceConfig(paramtable.Get()) + }) +} diff --git a/pkg/util/paramtable/component_param.go b/pkg/util/paramtable/component_param.go index c207a160d3898..9a842867ca103 100644 --- a/pkg/util/paramtable/component_param.go +++ b/pkg/util/paramtable/component_param.go @@ -777,11 +777,12 @@ func (t *gpuConfig) init(base *BaseTable) { } type traceConfig struct { - Exporter ParamItem `refreshable:"false"` - SampleFraction ParamItem `refreshable:"false"` - JaegerURL ParamItem `refreshable:"false"` - OtlpEndpoint ParamItem `refreshable:"false"` - OtlpSecure ParamItem `refreshable:"false"` + Exporter ParamItem `refreshable:"false"` + SampleFraction ParamItem `refreshable:"false"` + JaegerURL ParamItem `refreshable:"false"` + OtlpEndpoint ParamItem `refreshable:"false"` + OtlpSecure ParamItem `refreshable:"false"` + InitTimeoutSeconds ParamItem `refreshable:"false"` } func (t *traceConfig) init(base *BaseTable) { @@ -829,6 +830,15 @@ Fractions >= 1 will always sample. Fractions < 0 are treated as zero.`, Export: true, } t.OtlpSecure.Init(base.mgr) + + t.InitTimeoutSeconds = ParamItem{ + Key: "trace.initTimeoutSeconds", + Version: "2.4.4", + DefaultValue: "10", + Export: true, + Doc: "segcore initialization timeout in seconds, preventing otlp grpc hangs forever", + } + t.InitTimeoutSeconds.Init(base.mgr) } type logConfig struct {