# cognigy-monitoring/charts/cognigy-alerts/values.yaml

# Target namespace for alerts
# NOTE(review): the default ".*" looks like a regular expression that matches every
# namespace — confirm how the rule templates consume this value.
appNamespacesTarget: ".*"

# Default Prometheus Runbook URL
# NOTE(review): presumably the base URL for per-alert runbook links — confirm against the templates.
runbookUrl: "https://runbooks.prometheus-operator.dev/runbooks"

# Enable/disable alert rule groups and individual alerts
rules:
  # Generic Kubernetes alert rules for Cognigy products
  kubernetes:
    # Toggle for the whole group; every alert below additionally has its own `enabled` flag
    enabled: true
    kubePodCrashLooping:
      enabled: true
      # Severity for this alert (`critical` here, `warning` for the other alerts in this group)
      severity: critical
    kubeDeploymentReplicasMismatch:
      enabled: true
      severity: warning
      # NOTE(review): `timeout` is presumably how long the condition must hold before the
      # alert fires (Prometheus `for:` duration) — confirm against the rule template
      timeout: 15m
    kubeStatefulSetReplicasMismatch:
      enabled: true
      severity: warning
      timeout: 15m
    kubePodNotReady:
      enabled: true
      severity: warning
      timeout: 30m
    podManyRestarts:
      enabled: true
      severity: warning
      timeout: 5m
      # NOTE(review): presumably restart-count thresholds (per hour / in total) — confirm
      restartsPerHour: 3
      restartsTotal: 20
    containerOOMKilled:
      enabled: true
      severity: warning
      timeout: 5m
      # NOTE(review): presumably the number of OOM kills that triggers the alert, counted
      # over the `offset` window below — confirm against the rule template
      threshold: 3
      offset: 3h
    cpuThrottlingBackendServiceHigh:
      enabled: true
      severity: warning
      timeout: 15m
    hpaMaxReplicasReached:
      enabled: true
      severity: warning
      timeout: 15m
  # Traefik alert rules
  traefik:
    # Toggle for the whole group; every alert below additionally has its own `enabled` flag
    enabled: true
    traefikHighHttp5xxErrorRateService:
      enabled: true
      # Threshold of 5xx error rate (%) for traefikHighHttp5xxErrorRateService to start firing
      threshold: 5
    # The 4xx error rate alert is disabled by default
    traefikHighHttp4xxErrorRateService:
      enabled: false
    traefikOpenConnectionsHigh:
      enabled: true
    traefikTlsCertExpireSoon:
      enabled: true
  # NGINX Ingress alert rules (disabled by default)
  nginx:
    # The group itself is switched off; the per-alert flags below are preset defaults
    enabled: false
    nginxHighHttp5xxErrorRate:
      enabled: true
    # The 4xx error rate alert is disabled by default
    nginxHighHttp4xxErrorRate:
      enabled: false
    nginxLatencyHigh:
      enabled: true
    nginxTlsCertExpireSoon:
      enabled: true
  # RabbitMQ alert rules
  # NOTE(review): the units of the thresholds below are not stated in this file
  # (message counts? percentages?) — confirm against the rule templates.
  rabbitmq:
    # Toggle for the whole group; every alert below additionally has its own `enabled` flag
    enabled: true
    rabbitmqMemoryHigh:
      enabled: true
    rabbitmqHighWatermarkCrossed:
      enabled: true
    rabbitmqReadyMessageGettingHigh:
      enabled: true
      threshold: 20
    rabbitmqReadyQueueHigh:
      enabled: true
      # Two separate thresholds; presumably one per severity level (warning / critical) — confirm
      warnThreshold: 100
      criticalThreshold: 500
    rabbitmqUnackedMessageGettingHigh:
      enabled: true
      threshold: 20
    rabbitmqUnackedQueueHigh:
      enabled: true
      # Two separate thresholds; presumably one per severity level (warning / critical) — confirm
      warnThreshold: 300
      criticalThreshold: 700

  # Redis alert rules
  redis:
    # Toggle for the whole group
    enabled: true
    redisBlockedClients:
      enabled: true
      # NOTE(review): presumably the number of blocked clients that triggers the alert — confirm
      threshold: 50
  # PostgreSQL alert rules (disabled by default)
  # NOTE(review): the units of the thresholds below are not stated in this file — confirm
  # against the rule templates before tuning them.
  postgresql:
    # The group itself is switched off; the per-alert flags below are preset defaults
    enabled: false
    tooManyConnections:
      enabled: true
      threshold: 85
    highRollbackRate:
      enabled: true
      threshold: 5
    deadLocks:
      enabled: true
      threshold: 1
    tooManyLocks:
      enabled: true
      threshold: 0.2
    pgReplicationLag:
      enabled: true
      # Two separate thresholds; presumably one per severity level (warning / critical) — confirm
      warnThreshold: 15
      criticalThreshold: 60
    unusedReplicationSlot:
      enabled: true
    # Patroni-related rules: a sub-group with its own `enabled` flag and nested alerts
    patroni:
      enabled: true
      replicationLag:
        enabled: true
        threshold: 500
  # MongoDB alert rules
  mongodb:
    # Toggle for the whole group; every alert below additionally has its own `enabled` flag
    enabled: true
    mongodbDown:
      enabled: true
    mongodbReplicaMemberUnhealthy:
      enabled: true
    mongodbReplicationLag:
      enabled: true
      # Threshold of MongoDB replica lagging behind (in seconds)
      threshold: 900
    mongodbNumberCursorsOpen:
      enabled: true
    mongodbCursorsTimeouts:
      enabled: true
    mongodbTooManyConnections:
      enabled: true
    # The virtual memory usage alert is disabled by default
    mongodbVirtualMemoryUsage:
      enabled: false
  # Velero alert rules (disabled by default; no per-alert settings are defined here)
  velero:
    enabled: false

  # Cognigy.AI alert rules
  ai:
    # Toggle for the whole group; every entry below additionally has its own `enabled` flag
    enabled: true
    conversationVolume:
      enabled: true
      # NOTE(review): unit and evaluation window of this threshold are not stated here — confirm
      threshold: 1000
    nlpMatcher:
      enabled: true
    functionExecution:
      enabled: true
    serviceExecution:
      enabled: true
    serviceHandover:
      enabled: true
      # Threshold on the number of service handover errors per 5 min
      threshold: 2
      # NOTE(review): presumably the latency quantile that is compared against
      # requestLatencyHighThreshold — confirm against the rule template
      requestLatencyHighQuantile: 0.95
      # Threshold (seconds) for high request latency to external providers
      requestLatencyHighThreshold: 2
      # Threshold for 5xx errors to external providers per minute
      requestErrorThreshold: 2

    serviceEndpoint:
      enabled: true
      # Message processing time quantile to analyze
      messageProcessingTimeQuantile: 0.8
      # Threshold (sec) of message processing time for EndpointMessageProcessingTimeHigh to start firing
      messageProcessingTimeThreshold: 50
      # Threshold of increase (%) of message processing time for EndpointMessageProcessingTimeIncreasing to start firing
      messageProcessingTimeIncreasingThreshold: 300
      # Baseline for message processing time (sec) to avoid false positives of the
      # EndpointMessageProcessingTimeIncreasing alert: increases above
      # messageProcessingTimeIncreasingThreshold with the baseline below
      # messageProcessingTimeBaseline are ignored
      messageProcessingTimeBaseline: 10

  # Cognigy.AI runtime services rules
  # Only on/off switches are defined here: one for the group and one per service entry.
  runtime:
    enabled: true
    serviceAI:
      enabled: true
    serviceAppSessionManager:
      enabled: true
    serviceExecution:
      enabled: true
    serviceFunctionExecution:
      enabled: true
    serviceHttp:
      enabled: true
    serviceNlpMatcher:
      enabled: true
    servicePlaybookExecution:
      enabled: true
    serviceSessionStateManager:
      enabled: true

  # `vg` alert rules (disabled by default)
  # NOTE(review): `vg` presumably stands for Cognigy Voice Gateway — confirm.
  vg:
    # The group itself is switched off; the nested flags below are preset defaults
    enabled: false
    # Rules for the feature server: a sub-group with its own `enabled` flag and per-alert thresholds
    featureServer:
      enabled: true
      uncaughtErrors:
        enabled: true
        threshold: 5
      taskErrors:
        enabled: true
        threshold: 25
      handoverErrors:
        enabled: true
        threshold: 5
      analyticsDataErrors:
        enabled: true
        threshold: 5
    # Rules for the billing app; all thresholds are 0
    # NOTE(review): presumably this means any single error fires the alert — confirm
    billingApp:
      enabled: true
      uncaughtErrors:
        enabled: true
        threshold: 0
      updateCallHistoryErrors:
        enabled: true
        threshold: 0
      routeUpdateCallHistoryErrors:
        enabled: true
        threshold: 0
  # `la` alert rules (disabled by default)
  # NOTE(review): `la` presumably stands for Cognigy Live Agent — confirm.
  la:
    # The group itself is switched off; the per-alert flags below are preset defaults
    enabled: false
    # NOTE(review): the two alert keys below are PascalCase while the rest of this file uses
    # camelCase; do not rename them without updating the templates that read them.
    HighHttpErrorRate:
      enabled: true
      threshold: 5
    EndpointSlowRequest:
      enabled: true
      threshold: 10

  # Insights alert rules
  insights:
    # Toggle for the whole group
    enabled: true
    analyticsCollector:
      enabled: true
      # NOTE(review): unlike the other alert entries in this file, `processedVsStored` has no
      # `enabled` flag — confirm whether the template expects one.
      processedVsStored:
        threshold: 100