Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HTCondor clusters playbook #951

Merged
merged 2 commits into from
Oct 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions group_vars/htcondor-manager.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Group variables for the "htcondor-manager" inventory group.
#
# Overrides the `htcondor_role_manager: false` default set for the "htcondor"
# group so that these hosts act as the HTCondor central manager.
---
htcondor_role_manager: true
File renamed without changes.
8 changes: 8 additions & 0 deletions group_vars/htcondor-secondary/vars.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Group variables for the secondary HTCondor cluster.
#
# Hosts in "htcondor-secondary" are also members of the "htcondor" group, so
# every variable not listed here is inherited from that group. The inventory
# gives "htcondor-secondary" a group priority greater than 1, which makes the
# values below win over the ones defined for "htcondor".
---
# Central manager of the secondary cluster.
htcondor_server: "build.galaxyproject.eu"
# Differs from the 9618 used by the "htcondor" group; as a consequence the
# `htcondor_firewall_condor` expression defined there evaluates to false.
htcondor_port: 9628
10 changes: 10 additions & 0 deletions group_vars/htcondor-secondary/vault.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
$ANSIBLE_VAULT;1.1;AES256

Check warning on line 1 in group_vars/htcondor-secondary/vault.yml

View workflow job for this annotation

GitHub Actions / Lint

1:1 [document-start] missing document start "---"
31353533313831356632376636636564653732313930623263376437313362386632623732306136
3465326632326138646330353164336363653764396237370a393562613834343765313835656362
66633030353534663831323939386335316130343137396139633038366438613731376130663564
6635643366613463390a663637643834366632643730666131323737633966393335343734663731
63346138623034333265633465376633313537313062633633353261623934333037646532303132
63643364633136613265333461623036313964383932336335623236623462316437303964346163
32386236303765353936333563303934323964383039626233613333396431383936326530343931
33636531343831663864373365613036333964343534616664356462383066623238326138373435
3566
3 changes: 3 additions & 0 deletions group_vars/htcondor-submit.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Group variables for the "htcondor-submit" inventory group.
#
# Overrides the `htcondor_role_submit: false` default set for the "htcondor"
# group so that these hosts act as HTCondor submit nodes.
---
htcondor_role_submit: true
53 changes: 53 additions & 0 deletions group_vars/htcondor/vars.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Variables shared by every node of the HTCondor cluster ("htcondor" group).
---
# Cluster identity and package selection.
htcondor_server: "condor-cm.galaxyproject.eu"
htcondor_domain: bi.uni-freiburg.de
htcondor_port: 9618
# NOTE(review): both values are unquoted, so YAML loads them as floats. That
# works for 23.0 but a value such as 23.10 would silently become 23.1 --
# consider quoting once it is confirmed the consuming role accepts strings.
htcondor_version: 23.0
htcondor_channel: 23.0
htcondor_password: "{{ vault_htcondor_password }}"

# Firewall switches. The condor rule is enabled only when the port is 9618.
htcondor_firewall_condor: "{{ htcondor_port == 9618 }}"
htcondor_firewall_nfs: false

# Node roles are all disabled here; the "htcondor-manager" and
# "htcondor-submit" group variables switch the relevant one on.
htcondor_role_execute: false
htcondor_role_manager: false
htcondor_role_submit: false

# Values consumed by the `condor_config.local.j2` template.
# `htcondor_network_interface` is not set here: it is defined per host in
# host_vars.
htcondor_allow_write: "10.5.68.0/24, 132.230.223.0/24,132.230.153.0/28"
htcondor_allow_negotiator: "132.230.223.239,$(CONDOR_HOST),$(ALLOW_WRITE)"
htcondor_allow_administrator: "$(ALLOW_NEGOTIATOR)"
# Both thresholds are expressed in seconds (30 days and 2 days).
htcondor_system_periodic_hold: "{{ 30 * 24 * 60 * 60 }}"
htcondor_system_periodic_remove: "{{ 2 * 24 * 60 * 60 }}"
htcondor_master_update_interval: 150
htcondor_classad_lifetime: 300
htcondor_negotiator_interval: 15
htcondor_negotiator_update_interval: 100
htcondor_schedd_interval: 60
htcondor_job_start_count: 250
htcondor_job_start_delay: 0
htcondor_claim_worklife: 120
htcondor_negotiator_post_job_rank: "isUndefined(RemoteOwner) * (10000 - TotalLoadAvg)"

# Aliases for the `usegalaxy_eu.htcondor` role (to be replaced with
# `grycap.htcondor`). Each `condor_*` variable simply forwards the matching
# `htcondor_*` value so both roles read the same settings.
# `condor_daemons` is not set here: it is defined per host in host_vars.
condor_host: "{{ htcondor_server }}"
condor_fs_domain: "{{ htcondor_domain }}"
condor_uid_domain: "{{ htcondor_domain }}"
condor_network_interface: "{{ htcondor_network_interface }}"
condor_allow_write: "{{ htcondor_allow_write }}"
condor_allow_negotiator: "{{ htcondor_allow_negotiator }}"
condor_allow_administrator: "{{ htcondor_allow_administrator }}"
condor_system_periodic_hold: "{{ htcondor_system_periodic_hold }}"
condor_system_periodic_remove: "{{ htcondor_system_periodic_remove }}"
condor_extra: |
  MASTER_UPDATE_INTERVAL = {{ htcondor_master_update_interval }}
  CLASSAD_LIFETIME = {{ htcondor_classad_lifetime }}
  NEGOTIATOR_INTERVAL = {{ htcondor_negotiator_interval }}
  NEGOTIATOR_UPDATE_INTERVAL = {{ htcondor_negotiator_update_interval }}
  SCHEDD_INTERVAL = {{ htcondor_schedd_interval }}
  JOB_START_COUNT = {{ htcondor_job_start_count }}
  JOB_START_DELAY = {{ htcondor_job_start_delay }}
  CLAIM_WORKLIFE = {{ htcondor_claim_worklife }}
  NEGOTIATOR_POST_JOB_RANK = {{ htcondor_negotiator_post_job_rank }}
10 changes: 10 additions & 0 deletions group_vars/htcondor/vault.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
$ANSIBLE_VAULT;1.1;AES256

Check warning on line 1 in group_vars/htcondor/vault.yml

View workflow job for this annotation

GitHub Actions / Lint

1:1 [document-start] missing document start "---"
36336166336332656436376537343036353234366164616236393139313932343538313133373639
3064333637333539353566396361666362666539353231360a646430356366343632633637326462
39333232646363656438316533666664613935353336313064323038313564383734373433656330
3161396636623764660a636332303565396630666134626235636363636434623537333933653537
37383165643433633630353961623930653139653132303235306539613332346662323764356563
65303062333738616266383339366165643264633038323533306365623034656563333731393465
66386263353433303832363936323138386637636366663338336263323835663730616639393831
32333161633131323534306565626530616364386261646439336436303834386265396161333133
3130
27 changes: 0 additions & 27 deletions group_vars/sn06.yml
Original file line number Diff line number Diff line change
Expand Up @@ -222,33 +222,6 @@ galaxy_systemd_memory_limit: 120
galaxy_systemd_memory_limit_handler: 30
galaxy_systemd_memory_limit_workflow: 15

# HTCondor
condor_host: "condor-cm.galaxyproject.eu"
condor_fs_domain: bi.uni-freiburg.de
condor_uid_domain: bi.uni-freiburg.de
condor_allow_write: "10.5.68.0/24, 132.230.223.0/24,132.230.153.0/28"
condor_daemons:
- COLLECTOR
- NEGOTIATOR
- MASTER
- SCHEDD
condor_allow_negotiator: "132.230.223.239,$(CONDOR_HOST),$(ALLOW_WRITE)"
condor_allow_administrator: "$(ALLOW_NEGOTIATOR)"

condor_system_periodic_hold: "{{ 30 * 24 * 60 * 60 }}"
condor_system_periodic_remove: "{{ 2 * 24 * 60 * 60 }}"
condor_network_interface: ens802f0.223
condor_extra: |
MASTER_UPDATE_INTERVAL = 150
CLASSAD_LIFETIME = 300
NEGOTIATOR_INTERVAL = 15
NEGOTIATOR_UPDATE_INTERVAL = 100
SCHEDD_INTERVAL = 60
JOB_START_COUNT = 250
JOB_START_DELAY = 0
CLAIM_WORKLIFE = 120
NEGOTIATOR_POST_JOB_RANK = isUndefined(RemoteOwner) * (10000 - TotalLoadAvg)

# gie_proxy
gie_proxy_dir: "{{ galaxy_root }}/gie-proxy/proxy"
gie_proxy_git_version: main
Expand Down
2 changes: 2 additions & 0 deletions host_vars/build.galaxyproject.eu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
---
# Host-specific interface name; the `condor_config.local.j2` template writes
# it out as NETWORK_INTERFACE when this variable is defined.
htcondor_network_interface: ens802f0.223
2 changes: 2 additions & 0 deletions host_vars/nspawn-htcondor.sn06.galaxyproject.eu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
---
# Host-specific interface name; the `condor_config.local.j2` template writes
# it out as NETWORK_INTERFACE when this variable is defined.
htcondor_network_interface: ens802f0.223
9 changes: 9 additions & 0 deletions host_vars/sn06.galaxyproject.eu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
# Host-specific interface name. The "htcondor" group variables forward it to
# `condor_network_interface` for the `usegalaxy_eu.htcondor` role.
htcondor_network_interface: ens802f0.223

# Daemon list read by the `usegalaxy_eu.htcondor` role only.
condor_daemons:
  - COLLECTOR
  - NEGOTIATOR
  - MASTER
  - SCHEDD
50 changes: 43 additions & 7 deletions hosts
Original file line number Diff line number Diff line change
Expand Up @@ -76,14 +76,50 @@ maintenance.galaxyproject.eu
[all:vars]
ansible_ssh_user=centos

[central-manager]
manager.vgcn.galaxyproject.eu ansible_ssh_user=root
[htcondor:children]
htcondor-manager
htcondor-submit
htcondor-secondary

[central-manager-secondary]
manager-secondary.galaxyproject.eu ansible_host=127.0.0.1 ansible_port=2222 ansible_ssh_user=root ansible_ssh_common_args='-o HostKeyAlias=manager-secondary.galaxyproject.eu -o ProxyCommand="ssh -W %h:%p -q [email protected]"'
[htcondor-manager]
sn06.galaxyproject.eu

[htcondor-manager:children]
htcondor-secondary-manager

[htcondor-manager:vars]
ansible_group_priority=2

[htcondor-submit]
sn06.galaxyproject.eu

[htcondor-submit:children]
htcondor-secondary-submit

[htcondor-submit:vars]
ansible_group_priority=2

[central-manager-secondary-host]
sn06.galaxyproject.eu ansible_ssh_user=root
[htcondor-secondary:children]
htcondor-secondary-manager
htcondor-secondary-submit

[htcondor-secondary:vars]
ansible_group_priority=3

[htcondor-secondary-manager]
build.galaxyproject.eu ansible_ssh_user=root

[htcondor-secondary-manager:vars]
ansible_group_priority=4

[htcondor-secondary-submit]
nspawn-htcondor.sn06.galaxyproject.eu ansible_host=127.0.0.1 ansible_port=2222 ansible_ssh_user=root ansible_ssh_common_args='-o HostKeyAlias=nspawn-htcondor.sn06.galaxyproject.eu -o ProxyCommand="ssh -W %h:%p -q [email protected]"'

[htcondor-secondary-submit:vars]
ansible_group_priority=4

[htcondor-secondary-submit-host]
sn06.galaxyproject.eu

[central-manager-secondary-host:vars]
[htcondor-secondary-submit-host:vars]
ansible_group_priority=2
41 changes: 37 additions & 4 deletions htcondor-secondary.yml → htcondor.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
---
- name: Systemd-nspawn container aimed at running a second HTCondor installation.
hosts: central-manager-secondary-host
hosts: htcondor-secondary-submit-host
handlers:
- name: Reload sshd # (in the container)
when: nspawn_ssh | default(no)
Expand All @@ -11,7 +11,7 @@
changed_when: true
vars_files:
- mounts/mountpoints.yml
- secret_group_vars/central-manager-secondary-host.yml
- secret_group_vars/htcondor-secondary-submit-host.yml
pre_tasks:
# Because it is already disabled for sn06 and this setup is needed just
# temporarily.
Expand Down Expand Up @@ -177,5 +177,38 @@
key: "[127.0.0.1]:{{ nspawn_ssh_config.Port }} {{ nspawn_ssh_host_key.content | b64decode }}"
when: nspawn_ssh_host_trust_container

- name: Secondary HTCondor 10 cluster.
hosts: central-manager-secondary
- name: HTCondor cluster.
  # Every host of the "htcondor" group except sn06.
  # NOTE(review): sn06 presumably stays on the `usegalaxy_eu.htcondor` role
  # for now (its host_vars still define `condor_daemons`) -- confirm.
  hosts: "htcondor:!sn06.galaxyproject.eu"
  handlers:
    # Reload the daemon only if the systemd unit is known and running. On a
    # fresh host the unit does not exist yet, and the role below takes care
    # of the first start.
    #
    # The key looked up must be the unit name "condor.service": that is the
    # key indexed on the next line. Testing for any other key makes the
    # condition always false, so the reload would never happen.
    #
    # `service_facts` is registered by the last pre_task; handlers notified
    # in pre_tasks are flushed once all pre_tasks have run, so the variable
    # is available by the time this condition is evaluated.
    - name: Reload HTCondor
      when: >-
        'condor.service' in service_facts.ansible_facts.services and
        service_facts.ansible_facts.services['condor.service'].state == 'running'
      become: true
      ansible.builtin.service:
        name: condor
        state: reloaded
  pre_tasks:
    # The configuration is templated before the role runs, so the directory
    # may not exist yet.
    - name: Ensure the HTCondor configuration directory exists.
      become: true
      ansible.builtin.file:
        path: /etc/condor
        state: directory
        owner: root
        group: root
        mode: "0755"

    - name: Template HTCondor configuration.
      become: true
      ansible.builtin.template:
        src: htcondor/condor_config.local.j2
        dest: /etc/condor/condor_config.local
        owner: root
        group: root
        mode: "0644"
      notify: Reload HTCondor

    # Collects the service state consulted by the "Reload HTCondor" handler.
    - name: Check if HTCondor is running.
      ansible.builtin.service_facts:
      register: service_facts
  roles:
    - grycap.htcondor
3 changes: 3 additions & 0 deletions requirements.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,9 @@ roles:
version: 0.0.1
- name: usegalaxy_eu.htcondor
version: 1.0.1
- name: grycap.htcondor
src: https://github.com/kysrpex/grycap-ansible-role-htcondor
version: d9a4aab0052dfb31d48c986d39a7f5e3692abba4
- name: usegalaxy-eu.update-hosts
src: https://github.com/usegalaxy-eu/ansible-update-hosts
version: 0.2.0
Expand Down
52 changes: 52 additions & 0 deletions templates/htcondor/condor_config.local.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
{# Local HTCondor configuration, rendered to /etc/condor/condor_config.local.
   All `htcondor_*` values come from inventory variables. #}
# Networking
CONDOR_HOST = {{ htcondor_server }}
COLLECTOR_HOST = $(CONDOR_HOST):{{ htcondor_port }}
SHARED_PORT_PORT = {{ htcondor_port }}
{# Nodes of the secondary cluster talk to the collector over TCP only. #}
{% if "htcondor-secondary" in group_names %}
WANT_UDP_COMMAND_SOCKET = False
UPDATE_COLLECTOR_WITH_TCP = True
UPDATE_VIEW_COLLECTOR_WITH_TCP = True
{% endif %}
{% if htcondor_network_interface is defined %}
NETWORK_INTERFACE = {{ htcondor_network_interface }}
{% endif %}

# Security
ALLOW_WRITE = {{ htcondor_allow_write }}
ALLOW_READ = $(ALLOW_WRITE)
ALLOW_NEGOTIATOR = {{ htcondor_allow_negotiator }}
{% if htcondor_allow_administrator is defined %}
ALLOW_ADMINISTRATOR = {{ htcondor_allow_administrator }}
{% endif %}
ALLOW_OWNER = $(ALLOW_ADMINISTRATOR)
ALLOW_CLIENT = *
{# `htcondor_fs_domain` / `htcondor_uid_domain` are optional overrides; when
   they are not set, fall back to the cluster-wide `htcondor_domain` instead
   of failing with an undefined-variable error. #}
FILESYSTEM_DOMAIN = {{ htcondor_fs_domain | default(htcondor_domain) }}
UID_DOMAIN = {{ htcondor_uid_domain | default(htcondor_domain) }}
TRUST_UID_DOMAIN = True
SOFT_UID_DOMAIN = True
SEC_CLIENT_AUTHENTICATION_METHODS = IDTOKENS, FS
SEC_READ_AUTHENTICATION_METHODS = IDTOKENS, FS

# Job management
{# Hold idle (1) or running (2) jobs whose time since JobStartDate reaches the
   hold threshold, and remove held (5) jobs that stayed in that state longer
   than the remove threshold. The whole section is emitted only when the hold
   threshold is defined. #}
{% if htcondor_system_periodic_hold is defined %}
SYSTEM_PERIODIC_HOLD = \
  (JobStatus == 1 || JobStatus == 2) && \
  ((time() - JobStartDate) >= ({{ htcondor_system_periodic_hold }}))
SYSTEM_PERIODIC_HOLD_REASON = \
  ifThenElse(((time() - JobStartDate) >= ({{ htcondor_system_periodic_hold }})), \
             "Maximum wallclock time exceeded", \
             "Unspecified reason")
SYSTEM_PERIODIC_REMOVE = \
  (JobStatus == 5 && time() - EnteredCurrentStatus > {{ htcondor_system_periodic_remove }})
{% endif %}

# Scheduling
MASTER_UPDATE_INTERVAL = {{ htcondor_master_update_interval }}
CLASSAD_LIFETIME = {{ htcondor_classad_lifetime }}
NEGOTIATOR_INTERVAL = {{ htcondor_negotiator_interval }}
NEGOTIATOR_UPDATE_INTERVAL = {{ htcondor_negotiator_update_interval }}
SCHEDD_INTERVAL = {{ htcondor_schedd_interval }}
JOB_START_COUNT = {{ htcondor_job_start_count }}
JOB_START_DELAY = {{ htcondor_job_start_delay }}
CLAIM_WORKLIFE = {{ htcondor_claim_worklife }}
NEGOTIATOR_POST_JOB_RANK = {{ htcondor_negotiator_post_job_rank }}
Loading