From 94cf30c7c23090f528787dddff52e81c4cc0eabe Mon Sep 17 00:00:00 2001
From: Sanjay Kumar Srikakulam
Date: Tue, 31 Oct 2023 15:55:40 +0000
Subject: [PATCH] Add telegraf script to monitor condor queue jobs including compute resources requested by job and the dates

---
 group_vars/maintenance.yml                    |  7 ++++
 .../files/cluster_queue-condor-jobs.sh        | 34 +++++++++++++++++++
 roles/hxr.monitor-cluster/tasks/condor.yml    | 16 +++++++++
 3 files changed, 57 insertions(+)
 create mode 100644 roles/hxr.monitor-cluster/files/cluster_queue-condor-jobs.sh

diff --git a/group_vars/maintenance.yml b/group_vars/maintenance.yml
index ebf79138d..702294db5 100644
--- a/group_vars/maintenance.yml
+++ b/group_vars/maintenance.yml
@@ -254,6 +254,13 @@ telegraf_plugins_extra:
       - timeout = "10s"
       - data_format = "influx"
       - interval = "1m"
+  monitor_condor_queue_jobs:
+    plugin: "exec"
+    config:
+      - commands = ["sudo /usr/bin/monitor-condor-queue-jobs"]
+      - timeout = "10s"
+      - data_format = "influx"
+      - interval = "1m"
   postgres_extra:
     plugin: "exec"
     config:
diff --git a/roles/hxr.monitor-cluster/files/cluster_queue-condor-jobs.sh b/roles/hxr.monitor-cluster/files/cluster_queue-condor-jobs.sh
new file mode 100644
index 000000000..85ffd4dd7
--- /dev/null
+++ b/roles/hxr.monitor-cluster/files/cluster_queue-condor-jobs.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# Emit one InfluxDB line-protocol record per job in the global HTCondor queue:
+# status, command, remote host, requested CPUs and memory, queue date, start
+# date and description. Needs gawk, because strftime() is a gawk extension.
+condor_q -global -autoformat ClusterId JobStatus Cmd RemoteHost RequestCpus RequestMemory QDate JobStartDate JobDescription | awk '
+# Backslash-escape backslashes and double quotes for line-protocol string fields.
+function esc(s) { gsub(/["\\]/, "\\\\&", s); return s }
+
+# Unquoted fields must be numeric; anything else (e.g. "undefined") becomes 0.
+function num(v) { return (v ~ /^[0-9]+(\.[0-9]+)?$/) ? v : 0 }
+
+# Render an epoch value as a date; pass non-numeric values through unchanged.
+function ts(v) { return (v ~ /^[0-9]+$/) ? strftime("%Y-%m-%d %H:%M:%S", v) : v }
+
+BEGIN {
+    # JobStatus code -> label, built once instead of once per record.
+    status["0"]="Unexpanded"; status["1"]="Idle"; status["2"]="Running"; status["3"]="Removed"; status["4"]="Completed"; status["5"]="Held"; status["6"]="Submission_err";
+}
+
+# Skip blank or truncated rows; an empty JobDescription would leave only eight fields.
+NF >= 8 {
+    # Codes missing from the table are reported as the raw code, not as "".
+    state = ($2 in status) ? status[$2] : $2;
+
+    # A description containing spaces spans fields 9..NF; rejoin it with "_".
+    jobdesc = $9;
+    for (i = 10; i <= NF; i++) {
+        jobdesc = jobdesc "_" $i;
+    }
+
+    # The quotes around the clusterid tag are kept so the emitted tag value is unchanged.
+    printf "condor_queued_jobs_status,clusterid=\"%s\" jobstatus=\"%s\",cmd=\"%s\",remotehost=\"%s\",requestcpus=%s,requestmemory=%s,qdate=\"%s\",jobstartdate=\"%s\",jobdescription=\"%s\"\n", $1, state, esc($3), esc($4), num($5), num($6), ts($7), ts($8), esc(jobdesc)
+}'
diff --git a/roles/hxr.monitor-cluster/tasks/condor.yml b/roles/hxr.monitor-cluster/tasks/condor.yml
index 996d0bcb6..a831a0a61 100644
--- a/roles/hxr.monitor-cluster/tasks/condor.yml
+++ b/roles/hxr.monitor-cluster/tasks/condor.yml
@@ -46,3 +46,19 @@
     insertafter: EOF
     line: 'telegraf ALL=(ALL) NOPASSWD: /usr/bin/monitor-condor-queue'
     validate: 'visudo -cf %s'
+
+- name: "Add condor queue jobs status script"
+  copy:
+    src: "cluster_queue-condor-jobs.sh"
+    dest: "/usr/bin/monitor-condor-queue-jobs"
+    owner: root
+    group: root
+    mode: 0755
+
+- name: Allow telegraf to run monitor-condor-queue-jobs
+  lineinfile:
+    path: /etc/sudoers
+    state: present
+    insertafter: EOF
+    line: 'telegraf ALL=(ALL) NOPASSWD: /usr/bin/monitor-condor-queue-jobs'
+    validate: 'visudo -cf %s'