Skip to content

Commit

Permalink
Role for cleaning up /mnt/tmpdisk on workers
Browse files Browse the repository at this point in the history
  • Loading branch information
cat-bro committed Jan 22, 2024
1 parent c9e3880 commit df825f9
Show file tree
Hide file tree
Showing 12 changed files with 127 additions and 0 deletions.
1 change: 1 addition & 0 deletions galaxy-workers_playbook.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
- usegalaxy_eu.apptainer
- geerlingguy.docker
- acl-on-startup
- clean-tmpdisk
post_tasks:
- name: restart munge
systemd:
Expand Down
2 changes: 2 additions & 0 deletions group_vars/pulsar_paw/pulsar-paw_workers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ slurm_roles: ['exec']

add_hosts_head: yes

clean_tmpdisk_tmp_dir: /tmp # no mounted tmpdisk

shared_mounts:
- path: /mnt/custom-indices
src: pulsar-paw:/mnt/custom-indices
Expand Down
1 change: 1 addition & 0 deletions pulsar-high-mem1_playbook.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
- pulsar-post-tasks
- slurm-post-tasks
- slg.galaxy_stats
- clean-tmpdisk
post_tasks:
- name: Create worker tmpdir on /mnt
file:
Expand Down
1 change: 1 addition & 0 deletions pulsar-high-mem2_playbook.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
- pulsar-post-tasks
- slurm-post-tasks
- slg.galaxy_stats
- clean-tmpdisk
post_tasks:
- name: Create worker tmpdir on /mnt
file:
Expand Down
1 change: 1 addition & 0 deletions pulsar-mel2_worker_playbook.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
- geerlingguy.docker
- acl-on-startup
- dj-wasabi.telegraf
- clean-tmpdisk
post_tasks:
- name: Restart munge service
service:
Expand Down
1 change: 1 addition & 0 deletions pulsar-mel3_worker_playbook.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
- geerlingguy.docker
- acl-on-startup
- dj-wasabi.telegraf
- clean-tmpdisk
post_tasks:
- name: Restart munge service
service:
Expand Down
1 change: 1 addition & 0 deletions pulsar-nci-training_workers_playbook.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
- geerlingguy.docker
- acl-on-startup
- dj-wasabi.telegraf
- clean-tmpdisk
post_tasks:
- name: Restart munge service
service:
Expand Down
1 change: 1 addition & 0 deletions pulsar-paw_worker_playbook.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
- geerlingguy.docker
- acl-on-startup
- dj-wasabi.telegraf
- clean-tmpdisk
post_tasks:
- name: Restart munge service
service:
Expand Down
1 change: 1 addition & 0 deletions pulsar-qld-high-mem_playbook.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
- pulsar-post-tasks
- slurm-post-tasks
- slg.galaxy_stats
- clean-tmpdisk
post_tasks:
- name: Create worker tmpdir on /mnt
file:
Expand Down
10 changes: 10 additions & 0 deletions roles/clean-tmpdisk/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Defaults for the clean-tmpdisk role, which installs a script (and a cron
# entry for it) that removes old files from a worker's temporary directory.

# Node name rendered into the cleanup script; the script compares it with the
# node column of the squeue output.
clean_tmpdisk_worker_node_name: "{{ ansible_hostname }}"
# Directory to clean. The script refuses to run unless this is /tmp or /mnt/tmpdisk.
clean_tmpdisk_tmp_dir: /mnt/tmpdisk

# Where the script is installed, and the user that owns it and its cron entry.
clean_tmpdisk_script_dir: "/home/{{ clean_tmpdisk_user }}/clean_tmpdisk"
clean_tmpdisk_user: ubuntu

# Time of day at which cron runs the script.
clean_tmpdisk_cron_hour: "19"
clean_tmpdisk_cron_minute: "00"

# While false the cron entry is created in a disabled state.
clean_tmpdisk_enable_cron_job: false # keep false for the testing phase, then change the default to true
17 changes: 17 additions & 0 deletions roles/clean-tmpdisk/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
---
# Install the tmpdisk cleanup script for clean_tmpdisk_user and schedule it with cron.
- block:
    - name: Make the script directory
      file:
        state: directory
        path: "{{ clean_tmpdisk_script_dir }}"
    - name: Template cleanup script
      template:
        src: clean_tmpdisk.py.j2
        dest: "{{ clean_tmpdisk_script_dir }}/clean_tmpdisk.py"
    - name: Schedule the cleanup script
      cron:
        # A name lets the cron module find and update this entry on later
        # runs instead of adding a new line to the crontab each time.
        name: clean_tmpdisk
        # These must be templated; bare words would be written to the
        # crontab as the literal variable names.
        hour: "{{ clean_tmpdisk_cron_hour }}"
        minute: "{{ clean_tmpdisk_cron_minute }}"
        # The script uses f-strings, so it needs a Python 3 interpreter.
        job: "python3 {{ clean_tmpdisk_script_dir }}/clean_tmpdisk.py"
        disabled: "{{ not clean_tmpdisk_enable_cron_job }}" # set clean_tmpdisk_enable_cron_job to false at first
  become: true
  become_user: "{{ clean_tmpdisk_user }}"
90 changes: 90 additions & 0 deletions roles/clean-tmpdisk/templates/clean_tmpdisk.py.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import argparse
import re
import shlex
import subprocess

MINUTES_IN_DAY = 1440
MINUTES_IN_HOUR = 60

# Values filled in by Ansible when this template is rendered.
worker_node = "{{ clean_tmpdisk_worker_node_name }}"
tmp_dir = "{{ clean_tmpdisk_tmp_dir }}"

# squeue output columns: slurm_id, name, state, runtime, node
squeue_format = "%8i %.50j %.9T %.12M %.40N"

# Safety feature: refuse to run for any directory outside this allow-list, so
# the variable cannot accidentally be set to something we don't want cleaned.
accepted_tmp_dirs = ['/tmp', '/mnt/tmpdisk']
if tmp_dir not in accepted_tmp_dirs:
    raise Exception(f'specified clean_tmpdisk_tmp_dir must be one of {accepted_tmp_dirs}')

# Accepted elapsed-time layouts, tried in order: D-H:M:S first, then H:M:S / M:S.
_SLURM_TIME_PATTERNS = (
    re.compile(r'(?P<days>\d+)-(?P<hours>\d+):(?P<minutes>\d+):(?P<seconds>\d+)'),
    re.compile(r'(?:(?P<hours>\d+):)?(?P<minutes>\d+):(?P<seconds>\d+)'),
)


def to_minutes(slurm_time):
    """Convert a Slurm elapsed-time string to whole minutes, rounded up.

    Accepts ``D-H:M:S``, ``H:M:S`` and ``M:S``. The seconds field is not
    counted; one minute is always added instead, so the result is never an
    underestimate of the elapsed time.

    Args:
        slurm_time: elapsed time as printed by squeue, e.g. ``'1-02:03:04'``.

    Returns:
        The elapsed time in minutes as an int.

    Raises:
        ValueError: if ``slurm_time`` matches none of the accepted layouts.
    """
    for pattern in _SLURM_TIME_PATTERNS:
        match = pattern.match(slurm_time)
        if match:
            break
    else:
        raise ValueError(f'unrecognised slurm time: {slurm_time!r}')

    parts = match.groupdict()
    # Groups that are absent from the matched layout (or optional and unmatched) count as 0.
    days, hours, minutes = (int(parts.get(unit) or 0) for unit in ('days', 'hours', 'minutes'))
    return MINUTES_IN_DAY * days + MINUTES_IN_HOUR * hours + minutes + 1

def _parse_jobs_on_worker(squeue_output):
    """Return one dict per non-pending job that squeue reports on ``worker_node``."""
    jobs = []
    for row in squeue_output.split('\n')[1:]:  # the first row is the column header
        fields = row.split()
        # A blank node column (job not allocated to any node, e.g. pending)
        # leaves fewer than five fields; such a job cannot be on this worker.
        if len(fields) < 5:
            continue
        # Job names may contain whitespace, so take the fixed columns from
        # both ends of the row and treat whatever is left as the name.
        slurm_id = fields[0]
        state, runtime, node = fields[-3:]
        name = ' '.join(fields[1:-3])
        if node != worker_node or state == 'PENDING':
            continue
        jobs.append({
            'slurm_id': slurm_id,
            'name': name,
            'state': state,
            'runtime_minutes': to_minutes(runtime),
            'node': node,
        })
    return jobs


def _shell_text(command):
    """Render an argument list as a shell command line that can be copied and pasted."""
    return ' '.join(shlex.quote(arg) for arg in command)


def main():
    """Delete files in ``tmp_dir`` that are older than every job on this worker.

    The age threshold is the runtime of the longest job squeue reports on
    ``worker_node`` plus 30 minutes (just 30 minutes when there is no such
    job), so files that a current job could have written are kept.

    Command-line flags:
        -d/--dry_run    print the delete command instead of running it
        -l/--list_only  list the files that would be removed (implies dry run)
        -v/--verbose    print the squeue output and the parsed jobs

    Raises:
        subprocess.CalledProcessError: if squeue or find exits non-zero.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dry_run', action='store_true', help="Print but do not execute command")
    parser.add_argument('-l', '--list_only', action='store_true', help="List files to remove (but do not remove them)")
    parser.add_argument('-v', '--verbose', action='store_true', help="Be verbose")
    args = parser.parse_args()

    list_only = args.list_only
    dry_run = args.dry_run or args.list_only  # enforce dry run in the case of printing the list
    verbose = args.verbose

    squeue_details = subprocess.check_output(['squeue', f'--format={squeue_format}']).decode('utf-8')

    if verbose:
        print("Squeue output:\n")
        print(squeue_details)

    jobs = _parse_jobs_on_worker(squeue_details)

    if verbose:
        print('\nJobs dict:\n')
        for job in jobs:
            print(job)

    max_job_time = max((job['runtime_minutes'] for job in jobs), default=0)

    # Commands are argument lists rather than shell strings: nothing needs
    # shell escaping, and this file is a Jinja2 template, so find's brace
    # placeholder must not be written as a doubled brace pair in an f-string
    # (Jinja2 would try to parse that as an expression when rendering).
    find_old_files = ['sudo', 'find', tmp_dir, '-type', 'f', '-mmin', f'+{max_job_time + 30}']
    delete_files_command = find_old_files + ['-exec', 'rm', '{}', ';']

    if list_only:
        # find itself expands the backslash-n escape in the -printf format.
        list_command = find_old_files + ['-printf', '%p %TY-%Tm-%Td\\n']
        print('\nFiles to delete:\n')
        print(subprocess.check_output(list_command).decode('utf-8'))

    if dry_run:
        print(f'\nCommands to run to remove files from dir {tmp_dir} last modified more than {max_job_time + 30} minutes ago')
        print(_shell_text(delete_files_command))
    else:
        subprocess.run(delete_files_command, check=True)


if __name__ == '__main__':
    main()

0 comments on commit df825f9

Please sign in to comment.