Hello, we are facing an issue where collections are gone after all pods are restarted in our EKS cluster, e.g. due to monthly node rollouts. All Milvus-related pods start up correctly afterwards, however all collections are gone, even though we have persistence enabled. Apart from the defaults, here are the Helm values we are passing to the chart.
```yaml
milvus:
  cluster:
    enabled: true
  image:
    all:
      repository: 1111111111111111.dkr.ecr.us-west-2.amazonaws.com/docker-hub/milvusdb/milvus
      tag: v2.4.13
    tools:
      repository: 1111111111111111.dkr.ecr.us-west-2.amazonaws.com/docker-hub/milvusdb/milvus-config-tool
      tag: v0.1.2
  heaptrack:
    image:
      repository: 1111111111111111.dkr.ecr.us-west-2.amazonaws.com/docker-hub/milvusdb/heaptrack
  serviceAccount:
    create: true
    name: milvus-prd
    annotations:
      eks.amazonaws.com/role-arn: "arn:aws:iam::1111111111111111:role/milvus-prd-milvus-prd-api-ol-us-west-2-eks-ts-k8s-local"
  extraConfigFiles:
    user.yaml: |+
      proxy:
        http:
          enabled: true
        maxUserNum: 100
        maxRoleNum: 10
      common:
        security:
          authorizationEnabled: true
  ingress:
    enabled: true
    ingressClassName: "nginx-internal"
    annotations:
      kubernetes.io/ingress.class: nginx-internal
      nginx.ingress.kubernetes.io/backend-protocol: GRPC
      nginx.ingress.kubernetes.io/listen-ports-ssl: '[19530]'
      nginx.ingress.kubernetes.io/proxy-body-size: 4m
      nginx.ingress.kubernetes.io/ssl-passthrough: "true"
      nginx.org/proxy-buffer-size: 128k
      nginx.org/proxy-buffers: "4 256k"
      nginx.org/proxy-busy-buffers-size: 256k
      alb.ingress.kubernetes.io/certificate-arn: "arn:aws:acm:us-west-2:1111111111111111:certificate/38a4163f-4229-4943-9b78-d1ca7ec0e32d"
    rules:
      - host: "milvus-api-prd.us-west-2.orchestration.lan.ts.net"
        path: "/"
        pathType: "Prefix"
  service:
    type: NodePort
    nodePort: "32009"
  metrics:
    enabled: true
    serviceMonitor:
      enabled: true
      additionalLabels:
        release: kube-prometheus-stack
  log:
    level: "info"
  persistence:
    persistentVolumeClaim:
      storageClass: ebs-sc
  ### API ###
  proxy:
    enabled: true
    http:
      enabled: true
      debugMode:
        enabled: true
    replicas: 2
    resources:
      limits:
        cpu: 1
        memory: 2Gi
    nodeSelector:
      ts.net/workload: production-milvus
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: ts.net/workload
                  operator: In
                  values:
                    - production-milvus
      # To prevent spawning pods in same AZ
      podAntiAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchLabels:
                app.kubernetes.io/name: milvus
                component: proxy
            topologyKey: topology.kubernetes.io/zone
    tolerations:
      - key: "dedicated"
        operator: "Equal"
        value: "milvus-prd"
        effect: "NoSchedule"
  # Not recommended for HA use cases: the standby mixcoord only takes over after the active one has been down for 60 seconds
  mixCoordinator:
    enabled: false
  rootCoordinator:
    enabled: true
    replicas: 2
    activeStandby:
      enabled: true
    nodeSelector:
      ts.net/workload: production-milvus
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: ts.net/workload
                  operator: In
                  values:
                    - production-milvus
      # To prevent spawning pods in same AZ
      podAntiAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchLabels:
                app.kubernetes.io/name: milvus
                component: rootcoord
            topologyKey: topology.kubernetes.io/zone
    tolerations:
      - key: "dedicated"
        operator: "Equal"
        value: "milvus-prd"
        effect: "NoSchedule"
  queryCoordinator:
    enabled: true
    replicas: 2
    activeStandby:
      enabled: true
    nodeSelector:
      ts.net/workload: production-milvus
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: ts.net/workload
                  operator: In
                  values:
                    - production-milvus
      # To prevent spawning pods in same AZ
      podAntiAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchLabels:
                app.kubernetes.io/name: milvus
                component: querycoord
            topologyKey: topology.kubernetes.io/zone
    tolerations:
      - key: "dedicated"
        operator: "Equal"
        value: "milvus-prd"
        effect: "NoSchedule"
  indexCoordinator:
    enabled: true
    replicas: 2
    activeStandby:
      enabled: true
    nodeSelector:
      ts.net/workload: production-milvus
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: ts.net/workload
                  operator: In
                  values:
                    - production-milvus
      # To prevent spawning pods in same AZ
      podAntiAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchLabels:
                app.kubernetes.io/name: milvus
                component: indexcoord
            topologyKey: topology.kubernetes.io/zone
    tolerations:
      - key: "dedicated"
        operator: "Equal"
        value: "milvus-prd"
        effect: "NoSchedule"
  dataCoordinator:
    enabled: true
    replicas: 2
    activeStandby:
      enabled: true
    nodeSelector:
      ts.net/workload: production-milvus
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: ts.net/workload
                  operator: In
                  values:
                    - production-milvus
      # To prevent spawning pods in same AZ
      podAntiAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchLabels:
                app.kubernetes.io/name: milvus
                component: datacoord
            topologyKey: topology.kubernetes.io/zone
    tolerations:
      - key: "dedicated"
        operator: "Equal"
        value: "milvus-prd"
        effect: "NoSchedule"
  ### Nodes ###
  ###########################
  queryNode:
    enabled: true
    # You can set the number of replicas to -1 to remove the replicas field in case you want to use HPA
    replicas: -1
    resources:
      limits:
        cpu: 2
        memory: 4Gi
    nodeSelector:
      ts.net/workload: production-milvus
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: ts.net/workload
                  operator: In
                  values:
                    - production-milvus
    tolerations:
      - key: "dedicated"
        operator: "Equal"
        value: "milvus-prd"
        effect: "NoSchedule"
  ###########################
  indexNode:
    enabled: true
    # You can set the number of replicas to -1 to remove the replicas field in case you want to use HPA
    replicas: -1
    resources:
      limits:
        cpu: 2
        memory: 4Gi
    nodeSelector:
      ts.net/workload: production-milvus
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: ts.net/workload
                  operator: In
                  values:
                    - production-milvus
    tolerations:
      - key: "dedicated"
        operator: "Equal"
        value: "milvus-prd"
        effect: "NoSchedule"
  ###########################
  dataNode:
    enabled: true
    # You can set the number of replicas to -1 to remove the replicas field in case you want to use HPA
    replicas: -1
    resources:
      limits:
        cpu: 1
        memory: 4Gi
    nodeSelector:
      ts.net/workload: production-milvus
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: ts.net/workload
                  operator: In
                  values:
                    - production-milvus
    tolerations:
      - key: "dedicated"
        operator: "Equal"
        value: "milvus-prd"
        effect: "NoSchedule"
  ###########################
  ### UI ###
  attu:
    enabled: true
    image:
      repository: 1111111111111111.dkr.ecr.us-west-2.amazonaws.com/docker-hub/zilliz/attu
      tag: v2.4.7
      pullPolicy: Always
    ingress:
      enabled: true
      ingressClassName: nginx-internal
      annotations:
        kubernetes.io/ingress.class: nginx-internal
      hosts:
        - ui-milvus-prd.us-west-2.orchestration.lan.ts.net
  ### Common components ###
  minio:
    enabled: false
    # # BUG in helm chart: it doesn't render without these values
    # tls:
    #   enabled: false
  externalS3:
    enabled: true
    host: "s3.us-west-2.amazonaws.com"
    port: "443"
    useSSL: true
    bucketName: "milvus-prd-milvus-prd-api-ol-us-west-2-eks-ts-k8s-local"
    rootPath: ""
    useIAM: true
    cloudProvider: "aws"
    iamEndpoint: "https://iam.amazonaws.com"
    region: "us-west-2"
    useVirtualHost: false
  etcd:
    image:
      registry: 1111111111111111.dkr.ecr.us-west-2.amazonaws.com/docker-hub
      repository: milvusdb/etcd
      tag: "3.5.14-r1"
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: ts.net/workload
                  operator: In
                  values:
                    - production-milvus
      # To prevent spawning pods in same AZ
      podAntiAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchLabels:
                app.kubernetes.io/name: etcd
            topologyKey: topology.kubernetes.io/zone
    tolerations:
      - key: "dedicated"
        operator: "Equal"
        value: "milvus-prd"
        effect: "NoSchedule"
  ###########################
  # Messaging
  ###########################
  pulsar:
    affinity:
      anti_affinity: false
    images:
      broker:
        repository: 1111111111111111.dkr.ecr.us-west-2.amazonaws.com/docker-hub/apachepulsar/pulsar
        tag: 2.9.5
      autorecovery:
        repository: 1111111111111111.dkr.ecr.us-west-2.amazonaws.com/docker-hub/apachepulsar/pulsar
        tag: 2.9.5
      zookeeper:
        repository: 1111111111111111.dkr.ecr.us-west-2.amazonaws.com/docker-hub/apachepulsar/pulsar
        tag: 2.9.5
      bookie:
        repository: 1111111111111111.dkr.ecr.us-west-2.amazonaws.com/docker-hub/apachepulsar/pulsar
        tag: 2.9.5
      proxy:
        repository: 1111111111111111.dkr.ecr.us-west-2.amazonaws.com/docker-hub/apachepulsar/pulsar
        tag: 2.9.5
      pulsar_manager:
        repository: 1111111111111111.dkr.ecr.us-west-2.amazonaws.com/docker-hub/apachepulsar/pulsar-manager
        tag: v0.1.0
    pulsar_metadata:
      component: pulsar-init
      image:
        # the image used for running `pulsar-cluster-initialize` job
        repository: 1111111111111111.dkr.ecr.us-west-2.amazonaws.com/docker-hub/apachepulsar/pulsar
        tag: 2.9.5
    zookeeper:
      volumes:
        data:
          size: 20Gi
          storageClassName: ebs-sc
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: ts.net/workload
                    operator: In
                    values:
                      - production-milvus-queue
      tolerations:
        - key: "dedicated"
          operator: "Equal"
          value: "milvus-queue-prd"
          effect: "NoSchedule"
      nodeSelector:
        ts.net/workload: production-milvus-queue
    bookkeeper:
      volumes:
        journal:
          size: 10Gi
          storageClassName: ebs-sc
        ledgers:
          size: 30Gi
          storageClassName: ebs-sc
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: ts.net/workload
                    operator: In
                    values:
                      - production-milvus-queue
      tolerations:
        - key: "dedicated"
          operator: "Equal"
          value: "milvus-queue-prd"
          effect: "NoSchedule"
      nodeSelector:
        ts.net/workload: production-milvus-queue
    broker:
      replicaCount: 2
      resources:
        requests:
          cpu: 1
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: ts.net/workload
                    operator: In
                    values:
                      - production-milvus-queue
      tolerations:
        - key: "dedicated"
          operator: "Equal"
          value: "milvus-queue-prd"
          effect: "NoSchedule"
      nodeSelector:
        ts.net/workload: production-milvus-queue
    autorecovery:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: ts.net/workload
                    operator: In
                    values:
                      - production-milvus-queue
      tolerations:
        - key: "dedicated"
          operator: "Equal"
          value: "milvus-queue-prd"
          effect: "NoSchedule"
      nodeSelector:
        ts.net/workload: production-milvus-queue
    proxy:
      replicaCount: 2
      resources:
        requests:
          cpu: 0.5
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: ts.net/workload
                    operator: In
                    values:
                      - production-milvus-queue
      tolerations:
        - key: "dedicated"
          operator: "Equal"
          value: "milvus-queue-prd"
          effect: "NoSchedule"
      nodeSelector:
        ts.net/workload: production-milvus-queue
```
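For completeness, this is roughly how we observe the problem from the client side (a minimal sketch using pymilvus's MilvusClient; the URI reuses our ingress host from the values above, while the port and token are placeholders rather than our real credentials):

```python
# Minimal sketch: list collections through the Milvus proxy before and after a node rollout.
# The port and token below are placeholders, not our real values.
from pymilvus import MilvusClient

client = MilvusClient(
    uri="https://milvus-api-prd.us-west-2.orchestration.lan.ts.net:443",
    token="user:password",  # needed because common.security.authorizationEnabled is true
)

# Before the rollout this prints all of our collections;
# after all pods are restarted it comes back empty.
print(client.list_collections())
```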