From d50c83d732605375ce7e9c77a3b38b519ede87ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Manuel=20Dom=C3=ADnguez?= Date: Thu, 25 Jul 2024 14:47:11 +0200 Subject: [PATCH] Parallelize CVMFS monitoring to bring back the CVMFS Grafana dashboard The script `/usr/bin/check_cvmfs_repos`, installed by the CVMFS monitoring role `hxr.monitor-cvmfs`, takes longer than 2 minutes to run (the Telegraf timeout for this script) because some CVMFS servers misbehave and the checks run serially. As a result, no measurements are registered. ``` Jul 25 13:52:00 cvmfs1-ufr0.internal.galaxyproject.eu telegraf[2616631]: 2024-07-25T11:52:00Z E! [inputs.exec] Error in plugin: exec: command timed out for command "/usr/bin/check_cvmfs_repos": /usr/bin/check_cvmfs_repos: line 9: [: : integer expression expected... ``` Add a timeout to the `curl` calls in the `check_cvmfs_repos` script from the CVMFS monitoring role `hxr.monitor-cvmfs` and run all `check_repo` calls in parallel. Each check now takes at most about 40 seconds (two `curl` calls capped at 20 seconds each), so the script is guaranteed to exit before the Telegraf timeout expires. 
--- roles/hxr.monitor-cvmfs/templates/main.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/roles/hxr.monitor-cvmfs/templates/main.sh b/roles/hxr.monitor-cvmfs/templates/main.sh index 91b753a34..a3e08f223 100644 --- a/roles/hxr.monitor-cvmfs/templates/main.sh +++ b/roles/hxr.monitor-cvmfs/templates/main.sh @@ -3,8 +3,8 @@ check_repo() { host="$1" repo="$2" - http_code="$(curl http://$host/cvmfs/$repo/.cvmfspublished -I --silent | head -n 1 | cut -f2 -d' ')" - header="$(curl http://$host/cvmfs/$repo/.cvmfspublished --silent | head -n 12)" + http_code="$(curl --max-time 20 http://$host/cvmfs/$repo/.cvmfspublished -I --silent | head -n 1 | cut -f2 -d' ')" + header="$(curl --max-time 20 http://$host/cvmfs/$repo/.cvmfspublished --silent | head -n 12)" if [ "$http_code" -eq "200" ]; then # https://cvmfs.readthedocs.io/en/stable/cpt-details.html#repository-manifest-cvmfspublished @@ -21,6 +21,8 @@ check_repo() { {% for host in cvmfs_check_servers.hosts %} {% for repo in cvmfs_check_servers.repos %} -check_repo {{ host }} {{ repo }} +check_repo {{ host }} {{ repo }} & {% endfor %} {% endfor %} + +wait