-
Notifications
You must be signed in to change notification settings - Fork 1
/
values.yaml
243 lines (235 loc) · 5.97 KB
/
values.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
# Namespaces targeted by the alerts
# NOTE(review): the value looks like a regular expression (".*" would match every namespace) — confirm how the rule templates consume it
appNamespacesTarget: ".*"
# Default Prometheus runbook URL
runbookUrl: "https://runbooks.prometheus-operator.dev/runbooks"
# Enable/disable whole alert rule groups as well as individual alerts
rules:
# Generic Kubernetes alert rules for Cognigy products
# NOTE(review): the `timeout` values below look like Prometheus durations
# (presumably each alert's `for:` period) — confirm against the rule templates
kubernetes:
enabled: true
kubePodCrashLooping:
enabled: true
severity: critical
kubeDeploymentReplicasMismatch:
enabled: true
severity: warning
timeout: 15m
kubeStatefulSetReplicasMismatch:
enabled: true
severity: warning
timeout: 15m
kubePodNotReady:
enabled: true
severity: warning
timeout: 30m
podManyRestarts:
enabled: true
severity: warning
timeout: 5m
# NOTE(review): presumably restart-count limits (per hour / in total) — confirm against the rule expression
restartsPerHour: 3
restartsTotal: 20
containerOOMKilled:
enabled: true
severity: warning
timeout: 5m
threshold: 3
# NOTE(review): presumably the look-back window used by the rule expression — confirm
offset: 3h
cpuThrottlingBackendServiceHigh:
enabled: true
severity: warning
timeout: 15m
hpaMaxReplicasReached:
enabled: true
severity: warning
timeout: 15m
# Traefik alert rules
traefik:
enabled: true
traefikHighHttp5xxErrorRateService:
enabled: true
# Threshold of 5xx error rate (%) for traefikHighHttp5xxErrorRateService to start firing
threshold: 5
# 4xx error rate alert is disabled by default
traefikHighHttp4xxErrorRateService:
enabled: false
traefikOpenConnectionsHigh:
enabled: true
traefikTlsCertExpireSoon:
enabled: true
# NGINX Ingress alert rules (disabled by default)
# NOTE(review): the per-alert switches below presumably only take effect once the nginx group itself is enabled — confirm in the templates
nginx:
enabled: false
nginxHighHttp5xxErrorRate:
enabled: true
# 4xx error rate alert is disabled by default
nginxHighHttp4xxErrorRate:
enabled: false
nginxLatencyHigh:
enabled: true
nginxTlsCertExpireSoon:
enabled: true
# RabbitMQ alert rules
# NOTE(review): the units of the thresholds below (message counts vs. rates) are not stated here — confirm against the rule expressions
rabbitmq:
enabled: true
rabbitmqMemoryHigh:
enabled: true
rabbitmqHighWatermarkCrossed:
enabled: true
rabbitmqReadyMessageGettingHigh:
enabled: true
threshold: 20
# Separate thresholds for the warning and critical levels
rabbitmqReadyQueueHigh:
enabled: true
warnThreshold: 100
criticalThreshold: 500
rabbitmqUnackedMessageGettingHigh:
enabled: true
threshold: 20
# Separate thresholds for the warning and critical levels
rabbitmqUnackedQueueHigh:
enabled: true
warnThreshold: 300
criticalThreshold: 700
# Redis alert rules
redis:
enabled: true
redisBlockedClients:
enabled: true
# NOTE(review): presumably the number of blocked clients above which the alert fires — confirm
threshold: 50
# PostgreSQL alert rules (disabled by default)
# NOTE(review): threshold units (percent, ratio, seconds, ...) are not stated here — confirm against the rule expressions
postgresql:
enabled: false
tooManyConnections:
enabled: true
threshold: 85
highRollbackRate:
enabled: true
threshold: 5
deadLocks:
enabled: true
threshold: 1
tooManyLocks:
enabled: true
threshold: 0.2
# Separate thresholds for the warning and critical levels
pgReplicationLag:
enabled: true
warnThreshold: 15
criticalThreshold: 60
unusedReplicationSlot:
enabled: true
# Patroni alert rules
patroni:
enabled: true
replicationLag:
enabled: true
threshold: 500
# MongoDB alert rules
mongodb:
enabled: true
mongodbDown:
enabled: true
mongodbReplicaMemberUnhealthy:
enabled: true
mongodbReplicationLag:
enabled: true
# Threshold (in seconds) of a MongoDB replica lagging behind
threshold: 900
mongodbNumberCursorsOpen:
enabled: true
mongodbCursorsTimeouts:
enabled: true
mongodbTooManyConnections:
enabled: true
# Virtual memory usage alert is disabled by default
mongodbVirtualMemoryUsage:
enabled: false
# Velero alert rules (disabled by default)
velero:
enabled: false
# Cognigy.AI alert rules
ai:
enabled: true
conversationVolume:
enabled: true
threshold: 1000
nlpMatcher:
enabled: true
functionExecution:
enabled: true
serviceExecution:
enabled: true
serviceHandover:
enabled: true
# Threshold on the number of service handover errors per 5 min
threshold: 2
# Latency quantile to analyze for requests to external providers
# NOTE(review): inferred from the key name and the neighbouring latency threshold — confirm
requestLatencyHighQuantile: 0.95
# Threshold (seconds) for high request latency to external providers
requestLatencyHighThreshold: 2
# Threshold for 5xx errors to external providers per minute
requestErrorThreshold: 2
serviceEndpoint:
enabled: true
# Message Processing Time quantile to analyze
messageProcessingTimeQuantile: 0.8
# Threshold (sec) of Message Processing Time for EndpointMessageProcessingTimeHigh to start firing
messageProcessingTimeThreshold: 50
# Threshold of increase (%) of Message Processing Time for EndpointMessageProcessingTimeIncreasing to start firing
messageProcessingTimeIncreasingThreshold: 300
# Baseline (sec) for Message Processing Time, used to avoid false positives of the
# EndpointMessageProcessingTimeIncreasing alert: increases above messageProcessingTimeIncreasingThreshold
# are ignored while the baseline is below messageProcessingTimeBaseline
messageProcessingTimeBaseline: 10
# Cognigy.AI runtime services alert rules (one enable switch per service)
runtime:
enabled: true
serviceAI:
enabled: true
serviceAppSessionManager:
enabled: true
serviceExecution:
enabled: true
serviceFunctionExecution:
enabled: true
serviceHttp:
enabled: true
serviceNlpMatcher:
enabled: true
servicePlaybookExecution:
enabled: true
serviceSessionStateManager:
enabled: true
# vg alert rules (disabled by default)
# NOTE(review): `vg` presumably stands for Cognigy Voice Gateway — confirm
vg:
enabled: false
featureServer:
enabled: true
uncaughtErrors:
enabled: true
threshold: 5
taskErrors:
enabled: true
threshold: 25
handoverErrors:
enabled: true
threshold: 5
analyticsDataErrors:
enabled: true
threshold: 5
# NOTE(review): the billingApp thresholds are all 0 — presumably any single error should fire the alert; confirm
billingApp:
enabled: true
uncaughtErrors:
enabled: true
threshold: 0
updateCallHistoryErrors:
enabled: true
threshold: 0
routeUpdateCallHistoryErrors:
enabled: true
threshold: 0
# la alert rules (disabled by default)
# NOTE(review): `la` presumably stands for Cognigy Live Agent — confirm
la:
enabled: false
HighHttpErrorRate:
enabled: true
threshold: 5
EndpointSlowRequest:
enabled: true
threshold: 10
# Insights alert rules
insights:
enabled: true
analyticsCollector:
enabled: true
# NOTE(review): unlike the other alerts in this file, processedVsStored has no `enabled`
# switch here — confirm whether the rule template expects one
processedVsStored:
threshold: 100