diff --git a/galaxy-workers_playbook.yml b/galaxy-workers_playbook.yml index ea2001409..8028f54b3 100644 --- a/galaxy-workers_playbook.yml +++ b/galaxy-workers_playbook.yml @@ -31,6 +31,7 @@ - usegalaxy_eu.apptainer - geerlingguy.docker - acl-on-startup + - clean-tmpdisk post_tasks: - name: restart munge systemd: diff --git a/group_vars/pulsar_paw/pulsar-paw_workers.yml b/group_vars/pulsar_paw/pulsar-paw_workers.yml index 47912d404..f4698e4d2 100644 --- a/group_vars/pulsar_paw/pulsar-paw_workers.yml +++ b/group_vars/pulsar_paw/pulsar-paw_workers.yml @@ -2,6 +2,8 @@ slurm_roles: ['exec'] add_hosts_head: yes +clean_tmpdisk_tmp_dir: /tmp # no mounted tmpdisk + shared_mounts: - path: /mnt/custom-indices src: pulsar-paw:/mnt/custom-indices diff --git a/pulsar-high-mem1_playbook.yml b/pulsar-high-mem1_playbook.yml index f1b6c0dad..e4e1c725d 100644 --- a/pulsar-high-mem1_playbook.yml +++ b/pulsar-high-mem1_playbook.yml @@ -34,6 +34,7 @@ - pulsar-post-tasks - slurm-post-tasks - slg.galaxy_stats + - clean-tmpdisk post_tasks: - name: Create worker tmpdir on /mnt file: diff --git a/pulsar-high-mem2_playbook.yml b/pulsar-high-mem2_playbook.yml index df9eaf376..b1ecf1338 100644 --- a/pulsar-high-mem2_playbook.yml +++ b/pulsar-high-mem2_playbook.yml @@ -34,6 +34,7 @@ - pulsar-post-tasks - slurm-post-tasks - slg.galaxy_stats + - clean-tmpdisk post_tasks: - name: Create worker tmpdir on /mnt file: diff --git a/pulsar-mel2_worker_playbook.yml b/pulsar-mel2_worker_playbook.yml index 070f028d0..68f95ae77 100644 --- a/pulsar-mel2_worker_playbook.yml +++ b/pulsar-mel2_worker_playbook.yml @@ -25,6 +25,7 @@ - geerlingguy.docker - acl-on-startup - dj-wasabi.telegraf + - clean-tmpdisk post_tasks: - name: Restart munge service service: diff --git a/pulsar-mel3_worker_playbook.yml b/pulsar-mel3_worker_playbook.yml index 4a1ab05b2..66f1f4051 100644 --- a/pulsar-mel3_worker_playbook.yml +++ b/pulsar-mel3_worker_playbook.yml @@ -25,6 +25,7 @@ - geerlingguy.docker - acl-on-startup - 
dj-wasabi.telegraf + - clean-tmpdisk post_tasks: - name: Restart munge service service: diff --git a/pulsar-nci-training_workers_playbook.yml b/pulsar-nci-training_workers_playbook.yml index 24713b460..4a5456e73 100644 --- a/pulsar-nci-training_workers_playbook.yml +++ b/pulsar-nci-training_workers_playbook.yml @@ -36,6 +36,7 @@ - geerlingguy.docker - acl-on-startup - dj-wasabi.telegraf + - clean-tmpdisk post_tasks: - name: Restart munge service service: diff --git a/pulsar-paw_worker_playbook.yml b/pulsar-paw_worker_playbook.yml index 864e32cb2..332629107 100644 --- a/pulsar-paw_worker_playbook.yml +++ b/pulsar-paw_worker_playbook.yml @@ -21,6 +21,7 @@ - geerlingguy.docker - acl-on-startup - dj-wasabi.telegraf + - clean-tmpdisk post_tasks: - name: Restart munge service service: diff --git a/pulsar-qld-high-mem_playbook.yml b/pulsar-qld-high-mem_playbook.yml index ce21a5979..c912a227f 100644 --- a/pulsar-qld-high-mem_playbook.yml +++ b/pulsar-qld-high-mem_playbook.yml @@ -37,6 +37,7 @@ - pulsar-post-tasks - slurm-post-tasks - slg.galaxy_stats + - clean-tmpdisk post_tasks: - name: Create worker tmpdir on /mnt file: diff --git a/roles/clean-tmpdisk/defaults/main.yml b/roles/clean-tmpdisk/defaults/main.yml new file mode 100644 index 000000000..c31ec863c --- /dev/null +++ b/roles/clean-tmpdisk/defaults/main.yml @@ -0,0 +1,10 @@ +clean_tmpdisk_worker_node_name: "{{ ansible_hostname }}" +clean_tmpdisk_tmp_dir: /mnt/tmpdisk + +clean_tmpdisk_script_dir: "/home/{{ clean_tmpdisk_user }}/clean_tmpdisk" +clean_tmpdisk_user: ubuntu + +clean_tmpdisk_cron_hour: "19" +clean_tmpdisk_cron_minute: "00" + +clean_tmpdisk_enable_cron_job: false # set this to false for testing phase, then default to true \ No newline at end of file diff --git a/roles/clean-tmpdisk/tasks/main.yml b/roles/clean-tmpdisk/tasks/main.yml new file mode 100644 index 000000000..192131a01 --- /dev/null +++ b/roles/clean-tmpdisk/tasks/main.yml @@ -0,0 +1,17 @@ +--- +- block: + - name: Make the script directory + 
file: + state: directory + path: "{{ clean_tmpdisk_script_dir }}" + - name: Template cleanup script + template: + src: clean_tmpdisk.py.j2 + dest: "{{ clean_tmpdisk_script_dir }}/clean_tmpdisk.py" + - cron: + hour: "{{ clean_tmpdisk_cron_hour }}" + minute: "{{ clean_tmpdisk_cron_minute }}" + job: "python {{ clean_tmpdisk_script_dir }}/clean_tmpdisk.py" + disabled: "{{ not clean_tmpdisk_enable_cron_job }}" # set clean_tmpdisk_enable_cron_job to false at first + become: true + become_user: "{{ clean_tmpdisk_user }}" diff --git a/roles/clean-tmpdisk/templates/clean_tmpdisk.py.j2 b/roles/clean-tmpdisk/templates/clean_tmpdisk.py.j2 new file mode 100644 index 000000000..99d5c0f05 --- /dev/null +++ b/roles/clean-tmpdisk/templates/clean_tmpdisk.py.j2 @@ -0,0 +1,90 @@ +import subprocess +import argparse +import re + +MINUTES_IN_DAY=1440 +MINUTES_IN_HOUR=60 + +worker_node = "{{ clean_tmpdisk_worker_node_name }}" +tmp_dir = "{{ clean_tmpdisk_tmp_dir }}" + +squeue_format="%8i %.50j %.9T %.12M %.40N" # slurm_id, name, state, runtime, node + +accepted_tmp_dirs = ['/tmp', '/mnt/tmpdisk'] +if not tmp_dir in accepted_tmp_dirs: # safety feature to stop this from accidentally being set to something we don't want + raise Exception(f'specified clean_tmpdisk_tmp_dir must be one of {str(accepted_tmp_dirs)}') + +def to_minutes(slurm_time): + pattern_dhms = re.compile(r'(?P<days>\d+)-(?P<hours>\d+):(?P<minutes>\d+):(?P<seconds>\d+)') + pattern_hms = re.compile(r'(?:(?P<hours>\d+):)?(?P<minutes>\d+):(?P<seconds>\d+)') + match = re.match(pattern_dhms, slurm_time) + if not match: + match = re.match(pattern_hms, slurm_time) + values = {} + for val in ['days', 'hours', 'minutes']: + values[val] = int(match.groupdict().get(val)) if match.groupdict().get(val) else 0 + minutes = MINUTES_IN_DAY * values['days'] + MINUTES_IN_HOUR * values['hours'] + values['minutes'] + 1 + return minutes + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-d', '--dry_run', action='store_true', help="Print but do not execute command") + parser.add_argument('-l', 
'--list_only', action='store_true', help="List files to remove (but do not remove them)") + parser.add_argument('-v', '--verbose', action='store_true', help="Be verbose") + args = parser.parse_args() + + list_only = args.list_only + dry_run = args.dry_run or args.list_only # enforce dry run in the case of printing the list + verbose = args.verbose + + squeue_details = subprocess.check_output(f'squeue --format=\'{squeue_format}\'', shell=True) + + squeue_details = squeue_details.decode('utf-8') + + if verbose: + print("Squeue output:\n") + print(squeue_details) + + squeue_details_rows = squeue_details.split('\n')[1:] + jobs = [] + for row in squeue_details_rows: + if not row.strip(): + continue + slurm_id, name, state, runtime, node = re.split(r'\s+', row.strip()) + if node != worker_node or state == 'PENDING': + continue + job = { + 'slurm_id': slurm_id, + 'name': name, + 'state': state, + 'runtime_minutes': to_minutes(runtime), + 'node': node, + } + jobs.append(job) + + if verbose: + print('\nJobs dict:\n') + for job in jobs: + print(job) + + max_job_time = sorted(jobs, key=lambda x: x['runtime_minutes'], reverse=True)[0]['runtime_minutes'] if jobs else 0 + + delete_files_command = f"sudo find {tmp_dir} -type f -mmin +{max_job_time + 30} -exec rm {{}} \\;" + + if list_only: + list_command = f'sudo find {tmp_dir} -type f -mmin +{max_job_time + 30} -printf "%p %TY-%Tm-%Td\\n"' + print(f'\nFiles to delete:\n') + file_list = subprocess.check_output(list_command, shell=True) + file_list = file_list.decode('utf-8') + print(file_list) + + if dry_run: + print(f'\nCommands to run to remove files from dir {tmp_dir} last modified more than {max_job_time + 30} minutes ago') + print(delete_files_command) + + else: + subprocess.check_output(delete_files_command, shell=True) + + +if __name__ == '__main__': + main()