diff --git a/Makefile b/Makefile
index a86dc825c3b8d..47052fb52d2c6 100644
--- a/Makefile
+++ b/Makefile
@@ -95,7 +95,7 @@ milvus-gpu: build-cpp-gpu print-gpu-build-info
 	@source $(PWD)/scripts/setenv.sh && \
 	mkdir -p $(INSTALL_PATH) && go env -w CGO_ENABLED="1" && \
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" CGO_CFLAGS="$(CGO_CFLAGS)" GO111MODULE=on $(GO) build -pgo=$(PGO_PATH)/default.pgo -ldflags="-r $${RPATH} -X '$(OBJPREFIX).BuildTags=$(BUILD_TAGS_GPU)' -X '$(OBJPREFIX).BuildTime=$(BUILD_TIME)' -X '$(OBJPREFIX).GitCommit=$(GIT_COMMIT)' -X '$(OBJPREFIX).GoVersion=$(GO_VERSION)'" \
-		-tags $(MILVUS_GO_BUILD_TAGS) -o $(INSTALL_PATH)/milvus $(PWD)/cmd/main.go 1>/dev/null
+		-tags "$(MILVUS_GO_BUILD_TAGS),cuda" -o $(INSTALL_PATH)/milvus $(PWD)/cmd/main.go 1>/dev/null
 
 get-build-deps:
 	@(env bash $(PWD)/scripts/install_deps.sh)
diff --git a/configs/milvus.yaml b/configs/milvus.yaml
index e508238fbeb2d..b52b9e8b2ed20 100644
--- a/configs/milvus.yaml
+++ b/configs/milvus.yaml
@@ -1098,6 +1098,7 @@ trace:
 gpu:
   initMemSize: 2048 # Gpu Memory Pool init size
   maxMemSize: 4096 # Gpu Memory Pool Max size
+  overloadedMemoryThresholdPercentage: 95 # The threshold of GPU memory usage (in percentage) on a query node; loading a sealed segment is rejected if it would push GPU memory usage above this limit.
 
 # Any configuration related to the streaming node server.
 streamingNode:
diff --git a/internal/querynodev2/segments/segment_interface.go b/internal/querynodev2/segments/segment_interface.go
index 400886ccd5edf..2210021d5269d 100644
--- a/internal/querynodev2/segments/segment_interface.go
+++ b/internal/querynodev2/segments/segment_interface.go
@@ -31,9 +31,10 @@ import (
 
 // ResourceUsage is used to estimate the resource usage of a sealed segment.
 type ResourceUsage struct {
-	MemorySize     uint64
-	DiskSize       uint64
-	MmapFieldCount int
+	MemorySize         uint64
+	DiskSize           uint64
+	MmapFieldCount     int
+	FieldGpuMemorySize []uint64 // estimated GPU memory cost, one entry per GPU-indexed field
 }
 
 // Segment is the interface of a segment implementation.
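
FieldGpuMemorySize keeps one estimated cost per GPU-indexed field rather than a single total, which lets checkSegmentGpuMemSize (in segment_loader.go below) place each index on whichever GPU has room. A minimal sketch of how a caller could aggregate the estimate; the totalGpuMemory helper is hypothetical and not part of the patch:

// totalGpuMemory sums the per-field GPU costs of a ResourceUsage estimate,
// e.g. to log the total GPU memory a segment would need if placed on one GPU.
func totalGpuMemory(usage ResourceUsage) uint64 {
	var total uint64
	for _, cost := range usage.FieldGpuMemorySize {
		total += cost
	}
	return total
}
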
diff --git a/internal/querynodev2/segments/segment_loader.go b/internal/querynodev2/segments/segment_loader.go
index 6e883ab30781a..2bf883136afb9 100644
--- a/internal/querynodev2/segments/segment_loader.go
+++ b/internal/querynodev2/segments/segment_loader.go
@@ -27,6 +27,7 @@ import (
 	"context"
 	"fmt"
 	"io"
+	"math"
 	"path"
 	"runtime/debug"
 	"strconv"
@@ -47,6 +48,7 @@ import (
 	"github.com/milvus-io/milvus/internal/querynodev2/pkoracle"
 	"github.com/milvus-io/milvus/internal/storage"
 	"github.com/milvus-io/milvus/internal/util/segcore"
+	"github.com/milvus-io/milvus/internal/util/vecindexmgr"
 	"github.com/milvus-io/milvus/pkg/common"
 	"github.com/milvus-io/milvus/pkg/log"
 	"github.com/milvus-io/milvus/pkg/metrics"
@@ -1384,6 +1386,7 @@ func (loader *segmentLoader) checkSegmentSize(ctx context.Context, segmentLoadIn
 	maxSegmentSize := uint64(0)
 	predictMemUsage := memUsage
 	predictDiskUsage := diskUsage
+	var predictGpuMemUsage []uint64
 	mmapFieldCount := 0
 	for _, loadInfo := range segmentLoadInfos {
 		collection := loader.manager.Collection.Get(loadInfo.GetCollectionID())
@@ -1406,6 +1409,7 @@ func (loader *segmentLoader) checkSegmentSize(ctx context.Context, segmentLoadIn
 		mmapFieldCount += usage.MmapFieldCount
 		predictDiskUsage += usage.DiskSize
 		predictMemUsage += usage.MemorySize
+		predictGpuMemUsage = append(predictGpuMemUsage, usage.FieldGpuMemorySize...) // accumulate across segments instead of overwriting
 		if usage.MemorySize > maxSegmentSize {
 			maxSegmentSize = usage.MemorySize
 		}
@@ -1440,6 +1444,10 @@ func (loader *segmentLoader) checkSegmentSize(ctx context.Context, segmentLoadIn
 			paramtable.Get().QueryNodeCfg.MaxDiskUsagePercentage.GetAsFloat()))
 	}
 
+	err = checkSegmentGpuMemSize(predictGpuMemUsage, float32(paramtable.Get().GpuConfig.OverloadedMemoryThresholdPercentage.GetAsFloat()))
+	if err != nil {
+		return 0, 0, err
+	}
 	return predictMemUsage - memUsage, predictDiskUsage - diskUsage, nil
 }
 
@@ -1448,6 +1456,7 @@ func getResourceUsageEstimateOfSegment(schema *schemapb.CollectionSchema, loadIn
 	var segmentMemorySize, segmentDiskSize uint64
 	var indexMemorySize uint64
 	var mmapFieldCount int
+	var fieldGpuMemorySize []uint64
 
 	fieldID2IndexInfo := make(map[int64]*querypb.FieldIndexInfo)
 	for _, fieldIndexInfo := range loadInfo.IndexInfos {
@@ -1492,9 +1501,11 @@ func getResourceUsageEstimateOfSegment(schema *schemapb.CollectionSchema, loadIn
 					loadInfo.GetSegmentID(),
 					fieldIndexInfo.GetBuildID())
 			}
-
 			indexMemorySize += estimateResult.MaxMemoryCost
 			segmentDiskSize += estimateResult.MaxDiskCost
+			if vecindexmgr.GetVecIndexMgrInstance().IsGPUVecIndex(common.GetIndexType(fieldIndexInfo.IndexParams)) {
+				fieldGpuMemorySize = append(fieldGpuMemorySize, estimateResult.MaxMemoryCost)
+			}
 			if !estimateResult.HasRawData && !isVectorType {
 				shouldCalculateDataSize = true
 			}
@@ -1555,9 +1566,10 @@ func getResourceUsageEstimateOfSegment(schema *schemapb.CollectionSchema, loadIn
 		segmentMemorySize += uint64(float64(memSize) * expansionFactor)
 	}
 	return &ResourceUsage{
-		MemorySize:     segmentMemorySize + indexMemorySize,
-		DiskSize:       segmentDiskSize,
-		MmapFieldCount: mmapFieldCount,
+		MemorySize:         segmentMemorySize + indexMemorySize,
+		DiskSize:           segmentDiskSize,
+		MmapFieldCount:     mmapFieldCount,
+		FieldGpuMemorySize: fieldGpuMemorySize,
 	}, nil
 }
 
@@ -1680,3 +1692,40 @@ func getBinlogDataMemorySize(fieldBinlog *datapb.FieldBinlog) int64 {
 
 	return fieldSize
 }
+
+// checkSegmentGpuMemSize checks whether the GPU indexes about to be loaded fit under the configured GPU memory threshold.
+func checkSegmentGpuMemSize(fieldGpuMemSizeList []uint64, overloadedMemoryThresholdPercentage float32) error {
+	gpuInfos, err := hardware.GetAllGPUMemoryInfo()
+	if err != nil {
+		if len(fieldGpuMemSizeList) == 0 {
+			return nil
+		}
+		return err
+	}
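+	// Greedy placement: assign each per-field GPU cost to the GPU whose projected usage
+	// stays lowest while remaining under the threshold; reject the load if none fits.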
+	var usedGpuMem []uint64
+	var maxGpuMemSize []uint64
+	for _, gpuInfo := range gpuInfos {
+		// TotalMemory - FreeMemory is the memory already in use on this GPU.
+		usedGpuMem = append(usedGpuMem, gpuInfo.TotalMemory-gpuInfo.FreeMemory)
+		maxGpuMemSize = append(maxGpuMemSize, uint64(float32(gpuInfo.TotalMemory)*overloadedMemoryThresholdPercentage))
+	}
+	// Work on a copy so the error below still reports the real current usage.
+	currentGpuMem := make([]uint64, len(usedGpuMem))
+	copy(currentGpuMem, usedGpuMem)
+	for _, fieldGpuMem := range fieldGpuMemSizeList {
+		minID := -1
+		minGpuMem := uint64(math.MaxUint64)
+		for i := 0; i < len(gpuInfos); i++ {
+			predictedMem := currentGpuMem[i] + fieldGpuMem
+			if predictedMem < maxGpuMemSize[i] && predictedMem < minGpuMem {
+				minID = i
+				minGpuMem = predictedMem
+			}
+		}
+		if minID == -1 {
+			return fmt.Errorf("load segment failed, GPU OOM if loaded, GpuMemUsage(bytes) = %v, usedGpuMem(bytes) = %v, maxGPUMem(bytes) = %v",
+				fieldGpuMem,
+				usedGpuMem,
+				maxGpuMemSize)
+		}
+		// Record the projected usage of the chosen GPU; += here would double count.
+		currentGpuMem[minID] = minGpuMem
+	}
+	return nil
+}
diff --git a/pkg/util/hardware/gpu_mem_info.go b/pkg/util/hardware/gpu_mem_info.go
new file mode 100644
index 0000000000000..5fd83cd49d448
--- /dev/null
+++ b/pkg/util/hardware/gpu_mem_info.go
@@ -0,0 +1,18 @@
+//go:build !cuda
+// +build !cuda
+
+package hardware
+
+import "errors"
+
+// GPUMemoryInfo holds information about a GPU's memory
+type GPUMemoryInfo struct {
+	TotalMemory uint64 // Total memory available on the GPU
+	FreeMemory  uint64 // Free memory available on the GPU
+}
+
+// GetAllGPUMemoryInfo always returns an error in non-CUDA builds, where no GPU information is available
+func GetAllGPUMemoryInfo() ([]GPUMemoryInfo, error) {
+	// No CUDA support compiled in, so callers must skip GPU memory checks
+	return nil, errors.New("CUDA not supported: failed to retrieve GPU memory info or no GPUs found")
+}
diff --git a/pkg/util/hardware/gpu_mem_info_cuda.go b/pkg/util/hardware/gpu_mem_info_cuda.go
new file mode 100644
index 0000000000000..ab77a8f793303
--- /dev/null
+++ b/pkg/util/hardware/gpu_mem_info_cuda.go
@@ -0,0 +1,90 @@
+//go:build cuda
+// +build cuda
+
+package hardware
+
+/*
+#cgo CFLAGS: -I/usr/local/cuda/include
+#cgo LDFLAGS: -L/usr/local/cuda/lib64 -lcudart
+#include <cuda_runtime.h>
+#include <stdlib.h>
+
+// Structure to store GPU memory info
+typedef struct {
+	size_t totalMemory;
+	size_t freeMemory;
+} GPUMemoryInfo;
+
+// Function to get memory info for all GPUs
+int getAllGPUMemoryInfo(GPUMemoryInfo** infos) {
+	int deviceCount = 0;
+	cudaError_t err = cudaGetDeviceCount(&deviceCount);
+	if (err != cudaSuccess || deviceCount == 0) {
+		return 0; // No GPUs found or error occurred
+	}
+
+	// Allocate memory for the output array
+	*infos = (GPUMemoryInfo*)malloc(deviceCount * sizeof(GPUMemoryInfo));
+	if (*infos == NULL) {
+		return 0; // Memory allocation failed
+	}
+
+	for (int i = 0; i < deviceCount; ++i) {
+		if (cudaSetDevice(i) != cudaSuccess) {
+			(*infos)[i].totalMemory = 0;
+			(*infos)[i].freeMemory = 0;
+			continue; // Skip if the device cannot be set
+		}
+
+		size_t freeMem = 0, totalMem = 0;
+		if (cudaMemGetInfo(&freeMem, &totalMem) != cudaSuccess) {
+			(*infos)[i].totalMemory = 0;
+			(*infos)[i].freeMemory = 0;
+			continue; // Skip if memory info cannot be fetched
+		}
+
+		(*infos)[i].totalMemory = totalMem;
+		(*infos)[i].freeMemory = freeMem;
+	}
+
+	return deviceCount; // Return the number of devices processed
+}
+*/
+import "C"
+import (
+	"errors"
+	"unsafe"
+)
+
+// GPUMemoryInfo represents a single GPU's memory information.
+type GPUMemoryInfo struct {
+	TotalMemory uint64 // Total memory in bytes
+	FreeMemory  uint64 // Free memory in bytes
+}
+
+// GetAllGPUMemoryInfo retrieves the memory information for all available GPUs.
+// It returns a slice of GPUMemoryInfo and an error if no GPUs are found or retrieval fails.
+func GetAllGPUMemoryInfo() ([]GPUMemoryInfo, error) {
+	var infos *C.GPUMemoryInfo
+
+	// Call the C function to retrieve GPU memory info
+	deviceCount := int(C.getAllGPUMemoryInfo(&infos))
+	if deviceCount == 0 {
+		return nil, errors.New("failed to retrieve GPU memory info or no GPUs found")
+	}
+	defer C.free(unsafe.Pointer(infos)) // Free the allocated memory
+
+	// Convert C array to Go slice
+	gpuInfos := make([]GPUMemoryInfo, 0, deviceCount)
+	infoArray := (*[1 << 30]C.GPUMemoryInfo)(unsafe.Pointer(infos))[:deviceCount:deviceCount]
+
+	for i := 0; i < deviceCount; i++ {
+		info := infoArray[i]
+		gpuInfos = append(gpuInfos, GPUMemoryInfo{
+			TotalMemory: uint64(info.totalMemory),
+			FreeMemory:  uint64(info.freeMemory),
+		})
+	}
+
+	return gpuInfos, nil
+}
diff --git a/pkg/util/paramtable/component_param.go b/pkg/util/paramtable/component_param.go
index 3f8723482311c..12b10f8a9e117 100644
--- a/pkg/util/paramtable/component_param.go
+++ b/pkg/util/paramtable/component_param.go
@@ -960,8 +960,9 @@ This helps Milvus-CDC synchronize incremental data`,
 }
 
 type gpuConfig struct {
-	InitSize ParamItem `refreshable:"false"`
-	MaxSize  ParamItem `refreshable:"false"`
+	InitSize                            ParamItem `refreshable:"false"`
+	MaxSize                             ParamItem `refreshable:"false"`
+	OverloadedMemoryThresholdPercentage ParamItem `refreshable:"false"`
 }
 
 func (t *gpuConfig) init(base *BaseTable) {
@@ -982,6 +983,16 @@ func (t *gpuConfig) init(base *BaseTable) {
 		DefaultValue: "4096",
 	}
 	t.MaxSize.Init(base.mgr)
+	t.OverloadedMemoryThresholdPercentage = ParamItem{
+		Key:          "gpu.overloadedMemoryThresholdPercentage",
+		Version:      "2.5.0",
+		DefaultValue: "95",
+		PanicIfEmpty: true,
+		Formatter: func(v string) string {
+			return fmt.Sprintf("%f", getAsFloat(v)/100)
+		},
+	}
+	t.OverloadedMemoryThresholdPercentage.Init(base.mgr)
 }
 
 type traceConfig struct {
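
For a quick sanity check of the new plumbing, a minimal sketch, assuming a binary built with the cuda tag and the usual github.com/milvus-io/milvus/pkg module path; the 0.95 ratio mirrors the default gpu.overloadedMemoryThresholdPercentage of 95 after the Formatter above divides it by 100:

package main

import (
	"fmt"

	"github.com/milvus-io/milvus/pkg/util/hardware"
)

func main() {
	gpus, err := hardware.GetAllGPUMemoryInfo()
	if err != nil {
		// Non-CUDA builds (or hosts without GPUs) land here.
		fmt.Println("no usable GPUs:", err)
		return
	}
	const threshold = 0.95 // default gpu.overloadedMemoryThresholdPercentage as a ratio
	for i, g := range gpus {
		used := g.TotalMemory - g.FreeMemory
		limit := uint64(float32(g.TotalMemory) * threshold)
		// Headroom is what checkSegmentGpuMemSize effectively compares against the
		// estimated index sizes; negative means the GPU is already over the threshold.
		headroom := int64(limit) - int64(used)
		fmt.Printf("GPU %d: used=%d limit=%d headroom=%d (bytes)\n", i, used, limit, headroom)
	}
}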