From 9903bcc87d6ed2b2451ed073735974d991b8d769 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexandre=20de=20S=C3=A1?= Date: Thu, 23 Mar 2023 13:56:50 -0300 Subject: [PATCH 1/2] add runbook KubeJobNotCompleted MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Alexandre de Sá --- .../kubernetes/KubeJobNotCompleted.md | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 content/runbooks/kubernetes/KubeJobNotCompleted.md diff --git a/content/runbooks/kubernetes/KubeJobNotCompleted.md b/content/runbooks/kubernetes/KubeJobNotCompleted.md new file mode 100644 index 0000000..78450e0 --- /dev/null +++ b/content/runbooks/kubernetes/KubeJobNotCompleted.md @@ -0,0 +1,25 @@ +--- +title: Kube Job Not Completed +weight: 20 +--- + +# KubeJobNotCompleted + +## Meaning + +Job is taking more than 12h to complete. + +## Impact + +- Long processing of batch jobs. +- Possible issues with scheduling the next Job. + +## Diagnosis + +- Check job via `kubectl -n $NAMESPACE describe jobs $JOB`. +- Check the pod logs using `kubectl -n $NAMESPACE logs $POD_FROM_JOB` for further information. + +## Mitigation + +- Give it more resources so it finishes faster, if applicable. 
+- See [Job patterns](https://kubernetes.io/docs/tasks/job/). From d0bc3979823e27e4121d30ffaf3aea79cac47950 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexandre=20de=20S=C3=A1?= Date: Thu, 23 Mar 2023 13:58:49 -0300 Subject: [PATCH 2/2] removed renamed alertname MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Alexandre de Sá --- .../runbooks/kubernetes/KubeJobCompletion.md | 25 ------------------- 1 file changed, 25 deletions(-) delete mode 100644 content/runbooks/kubernetes/KubeJobCompletion.md diff --git a/content/runbooks/kubernetes/KubeJobCompletion.md b/content/runbooks/kubernetes/KubeJobCompletion.md deleted file mode 100644 index 47eea38..0000000 --- a/content/runbooks/kubernetes/KubeJobCompletion.md +++ /dev/null @@ -1,25 +0,0 @@ ---- -title: Kube Job Completion -weight: 20 ---- - -# KubeJobCompletion - -## Meaning - -Job is taking more than 1h to complete. - -## Impact - -- Long processing of batch jobs. -- Possible issues with scheduling next Job - -## Diagnosis - -- Check job via `kubectl -n $NAMESPACE describe jobs $JOB`. -- Check pod events via `kubectl -n $NAMESPACE describe job $JOB`. - -## Mitigation - -- Give it more resources so it finishes faster, if applicable. -- See [Job patterns](https://kubernetes.io/docs/tasks/job/)