From df368ab19ae4484e77f7d7653b654b0576be5b6f Mon Sep 17 00:00:00 2001
From: moraxy
Date: Fri, 25 Nov 2022 03:43:31 +0100
Subject: [PATCH] Added new retention policy: onJobFailure

This new retention policy prevents the deletion of a pod in case of a
failed build result by the job that was using the pod. This is useful for
debugging failed jobs because you are still able to open a terminal in
any pod and investigate log files or similar, without having to set the
retention policy to 'Always' or using workarounds like long timeouts.
---
 .../pod/retention/OnJobFailure.java | 243 ++++++++++++++++++
 .../KubernetesCloud/help-podRetention.html | 1 +
 .../pod/retention/Messages.properties | 1 +
 .../pod/retention/PodRetentionTest.java | 17 ++
 4 files changed, 262 insertions(+)
 create mode 100644 src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/OnJobFailure.java

diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/OnJobFailure.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/OnJobFailure.java
new file mode 100644
index 0000000000..010a6b8ee6
--- /dev/null
+++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/OnJobFailure.java
@@ -0,0 +1,243 @@
+package org.csanchez.jenkins.plugins.kubernetes.pod.retention;
+
+import hudson.Extension;
+import hudson.model.*;
+import io.fabric8.kubernetes.api.model.Pod;
+import jenkins.model.Jenkins;
+
+import org.csanchez.jenkins.plugins.kubernetes.KubernetesCloud;
+import org.jenkinsci.Symbol;
+import org.kohsuke.stapler.DataBoundConstructor;
+
+
+import java.io.Serializable;
+import java.time.Duration;
+import java.util.List;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import java.util.stream.Collectors;
+
+/**
+ * Pod retention policy that keeps the agent pod unless the Jenkins run that
+ * used it finished with {@link Result#SUCCESS}, so that failed builds can
+ * still be inspected (open a terminal in the pod, read log files, ...).
+ *
+ * The run is looked up through the pod's "runUrl" annotation. In case of any
+ * error in determining the result, it will default to deleting the pod.
+ */
+public class OnJobFailure extends PodRetention implements Serializable {
+
+    private static final long serialVersionUID = -6422177946264212816L;
+
+    private static final Logger LOGGER = Logger.getLogger(OnJobFailure.class.getName());
+
+    private static final String MODULENAME = "OnJobFailure";
+
+    // splits a runUrl into the job url (group 1) and the trailing numeric run id
+    private static final Pattern RUN_URL_PATTERN = Pattern.compile("(^job.+/)[0-9]+/?$");
+
+    // how often the result of a run is queried again before giving up
+    private static final int MAX_RESULT_POLLS = 30;
+
+    // pause between two attempts to read the result of a run
+    private static final long RESULT_POLL_MILLIS = Duration.ofSeconds(1).toMillis();
+
+    // small convenience function, prefixes every message with the module name
+    private void log(Level level, String message) {
+        LOGGER.log(level, () -> MODULENAME + ": " + message);
+    }
+
+    @DataBoundConstructor
+    public OnJobFailure() {
+    }
+
+    /**
+     * Decide whether the pod can be deleted, based on the result of the run
+     * referenced by the pod's "runUrl" annotation.
+     *
+     * @param cloud the cloud the pod belongs to
+     * @param pod   the agent pod in question
+     * @return false if a result other than SUCCESS was found for the run,
+     *         true if the run succeeded or its result can't be determined
+     */
+    @Override
+    public boolean shouldDeletePod(KubernetesCloud cloud, Pod pod) {
+        if (cloud == null || pod == null) {
+            log(Level.INFO, "shouldDeletePod called without actual cloud and pod");
+            return true;
+        }
+
+        // Get the current Jenkins instance to access a list of all jobs
+        Jenkins jenkins = Jenkins.getInstanceOrNull();
+        if (jenkins == null) {
+            log(Level.INFO, "Couldn't get the current Jenkins reference");
+            return true;
+        }
+
+        // All known jobs of the current Jenkins instance
+        List<Job> jobs = jenkins.getAllItems(Job.class);
+        if (jobs.isEmpty()) {
+            log(Level.INFO, "Jenkins doesn't have any jobs?");
+            return true;
+        }
+
+        // runUrl will be something like "job/NAME/ID/" or
+        // "job/FOLDER/job/NAME/ID/" if nested
+        // this is the trick how we get our job name and run id
+        // (guarded, so a pod without metadata or annotations doesn't cause an NPE)
+        String runUrl = null;
+        if (pod.getMetadata() != null && pod.getMetadata().getAnnotations() != null) {
+            runUrl = pod.getMetadata().getAnnotations().get("runUrl");
+        }
+        if (runUrl == null) {
+            log(Level.INFO, "The pod has no required 'runUrl' annotation");
+            return true;
+        }
+
+        // everything is in place, get the result
+        Result result = getResultForJob(runUrl, jobs);
+        if (result == null) {
+            // we couldn't get the result for some reason
+            log(Level.INFO, "Couldn't find the result for runUrl: " + runUrl);
+            return true;
+        }
+
+        // finally, delete only if successful
+        boolean delete = Result.SUCCESS.equals(result);
+        log(Level.FINE, "delete = " + delete);
+        return delete;
+    }
+
+    /**
+     * Split up the runUrl string and return the run id
+     *
+     * @param runUrl the "runUrl" annotation of the kubernetes pod
+     * @return the run id as a string, or null if runUrl is null or malformed
+     */
+    public String getRunId(String runUrl) {
+        // extract the relevant parts (split drops the empty part after a trailing "/")
+        String[] parts = runUrl == null ? new String[0] : runUrl.split("/");
+
+        if (parts.length < 3) {
+            log(Level.INFO, "runUrl has unknown format: " + runUrl);
+            return null;
+        }
+
+        return parts[parts.length - 1].trim();
+    }
+
+    /**
+     * Filter the entire job list down to the one job that we're looking for
+     *
+     * @param runUrl the "runUrl" annotation of the kubernetes pod
+     * @param jobs   the list of all Jenkins jobs
+     * @return the matching job, if successful, or null on error
+     */
+    public Job getJob(String runUrl, List<Job> jobs) {
+        // strip the runId to enable matching by jobUrl; group() may only be
+        // called after a successful match, otherwise it throws
+        Matcher matcher = runUrl == null ? null : RUN_URL_PATTERN.matcher(runUrl);
+        if (matcher == null || !matcher.matches()) {
+            log(Level.INFO, "runUrl has unknown format: " + runUrl);
+            return null;
+        }
+        String jobUrl = matcher.group(1);
+
+        // find the jobs that match the shortened runUrl annotation
+        // it should be only one
+        List<Job> matchingJobs = jobs.stream().filter(t -> jobUrl.equals(t.getUrl())).collect(Collectors.toList());
+
+        // we expect to find exactly one job
+        if (matchingJobs.size() != 1) {
+            log(Level.INFO, "Expected exactly one job for " + jobUrl + " but found " + matchingJobs.size());
+            return null;
+        }
+
+        return matchingJobs.get(0);
+    }
+
+    /**
+     * Get the result for a particular Jenkins job
+     *
+     * @param runUrl the "runUrl" annotation of the kubernetes pod
+     * @param jobs   the list of all Jenkins jobs
+     * @return the job results, if successful, or null on error
+     */
+    public Result getResultForJob(String runUrl, List<Job> jobs) {
+        // get the id of this particular run
+        String runId = getRunId(runUrl);
+        if (runId == null) {
+            log(Level.INFO, "Couldn't get the runId");
+            return null;
+        }
+
+        // get a reference to the job that started the pod
+        Job job = getJob(runUrl, jobs);
+        if (job == null) {
+            log(Level.INFO, "Can't find the job for runUrl: " + runUrl);
+            return null;
+        }
+
+        // use job and runId to find the particular run
+        Run run = job.getBuild(runId);
+        if (run == null) {
+            log(Level.INFO, "Couldn't find the run for runUrl: " + runUrl);
+            return null;
+        }
+
+        // get the result
+        Result result = run.getResult();
+
+        // and then this sometimes happens: the run has finished and
+        // Jenkins asks if the pod should be deleted, but the result
+        // is actually still null. We just repeat querying for 30
+        // seconds and then abort if it's still not available
+        for (int round = 0; result == null && round < MAX_RESULT_POLLS; round++) {
+            log(Level.FINE, "result == null, waiting...");
+
+            try {
+                Thread.sleep(RESULT_POLL_MILLIS);
+            } catch (InterruptedException e) {
+                // restore the interrupt flag for the caller and stop waiting
+                Thread.currentThread().interrupt();
+                log(Level.INFO, "Thread.sleep failed: " + e.getMessage());
+                return run.getResult();
+            }
+
+            // retry getting the result
+            result = run.getResult();
+        }
+
+        // done
+        return result;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        // the policy has no state, so all instances are equal
+        return obj instanceof OnJobFailure;
+    }
+
+    @Override
+    public int hashCode() {
+        // constant, so it doesn't depend on the display name text
+        return MODULENAME.hashCode();
+    }
+
+    @Override
+    public String toString() {
+        return Messages.on_Job_Failure();
+    }
+
+    @Extension
+    @Symbol("onJobFailure")
+    public static class DescriptorImpl extends PodRetentionDescriptor {
+        @Override
+        public String getDisplayName() {
+            return Messages.on_Job_Failure();
+        }
+    }
+}
diff --git a/src/main/resources/org/csanchez/jenkins/plugins/kubernetes/KubernetesCloud/help-podRetention.html b/src/main/resources/org/csanchez/jenkins/plugins/kubernetes/KubernetesCloud/help-podRetention.html
index 9fa0fd5ff3..3dfae9c785 100644
--- a/src/main/resources/org/csanchez/jenkins/plugins/kubernetes/KubernetesCloud/help-podRetention.html
+++ b/src/main/resources/org/csanchez/jenkins/plugins/kubernetes/KubernetesCloud/help-podRetention.html
@@ -6,6 +6,7 @@
  1. Never - always delete the agent pod.
  2. On Failure - keep the agent pod if it fails during the build.
+ 3. On Job Failure - keep the agent pod if the build itself fails.
  4. Always - always keep the agent pod.

diff --git a/src/main/resources/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Messages.properties b/src/main/resources/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Messages.properties
index 1d8c59d46a..c240b7d547 100644
--- a/src/main/resources/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Messages.properties
+++ b/src/main/resources/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Messages.properties
@@ -24,3 +24,4 @@ always=Always
 _default=Default
 never=Never
 on_Failure=On Failure
+on_Job_Failure=On Job Failure
diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/PodRetentionTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/PodRetentionTest.java
index e5a30157a3..bee2a2eb57 100644
--- a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/PodRetentionTest.java
+++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/PodRetentionTest.java
@@ -59,6 +59,23 @@ public void testOnFailurePodRetention() {
         assertTrue(subject.shouldDeletePod(cloud, pod));
     }
 
+    @Test
+    public void testOnJobFailurePodRetention() {
+        final OnJobFailure policy = new OnJobFailure();
+        final String expectedRunId = "42";
+
+        // run of a top-level job
+        String topLevelUrl = "job/jobname/42/";
+        assertEquals(expectedRunId, policy.getRunId(topLevelUrl));
+
+        // run of a job nested one level deeper
+        String nestedUrl = "job/jobname1/job/jobname2/42/";
+        assertEquals(expectedRunId, policy.getRunId(nestedUrl));
+
+        // the name segment is numeric too, only the last segment is the run id
+        assertEquals(expectedRunId, policy.getRunId("job/22/42/"));
+    }
+
     private PodStatus buildStatus(String phase) {
         return new PodStatusBuilder().withPhase(phase).build();
     }