Skip to content

Commit

Permalink
Merge pull request #951 from kysrpex/htcondor_secondary
Browse files Browse the repository at this point in the history
HTCondor clusters playbook
  • Loading branch information
kysrpex authored Oct 31, 2023
2 parents 0e6fb54 + 522aa9c commit 4517c21
Show file tree
Hide file tree
Showing 16 changed files with 235 additions and 38 deletions.
3 changes: 3 additions & 0 deletions group_vars/htcondor-manager.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Configure the HTCondor central manager node.
---
# Overrides the `false` default that group_vars/htcondor/vars.yml sets for the
# whole "htcondor" group, of which "htcondor-manager" is a child.
htcondor_role_manager: true
File renamed without changes.
8 changes: 8 additions & 0 deletions group_vars/htcondor-secondary/vars.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Configure nodes in the secondary HTCondor cluster.
#
# Nodes in the secondary HTCondor cluster belong both to the
# "htcondor-secondary" (with group priority > 1) and "htcondor" groups. They
# thus inherit variables from the latter.
---
# Replaces the "htcondor" group's `condor-cm.galaxyproject.eu`; the
# configuration template writes this value to CONDOR_HOST.
htcondor_server: "build.galaxyproject.eu"
# Replaces the "htcondor" group's 9618. Because `htcondor_firewall_condor` in
# group_vars/htcondor/vars.yml is derived from `htcondor_port == 9618`, it
# evaluates to false for these nodes.
htcondor_port: 9628
10 changes: 10 additions & 0 deletions group_vars/htcondor-secondary/vault.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
$ANSIBLE_VAULT;1.1;AES256

Check warning on line 1 in group_vars/htcondor-secondary/vault.yml

View workflow job for this annotation

GitHub Actions / Lint

1:1 [document-start] missing document start "---"
31353533313831356632376636636564653732313930623263376437313362386632623732306136
3465326632326138646330353164336363653764396237370a393562613834343765313835656362
66633030353534663831323939386335316130343137396139633038366438613731376130663564
6635643366613463390a663637643834366632643730666131323737633966393335343734663731
63346138623034333265633465376633313537313062633633353261623934333037646532303132
63643364633136613265333461623036313964383932336335623236623462316437303964346163
32386236303765353936333563303934323964383039626233613333396431383936326530343931
33636531343831663864373365613036333964343534616664356462383066623238326138373435
3566
3 changes: 3 additions & 0 deletions group_vars/htcondor-submit.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Configure HTCondor submit nodes.
---
# Overrides the `false` default that group_vars/htcondor/vars.yml sets for the
# whole "htcondor" group, of which "htcondor-submit" is a child.
htcondor_role_submit: true
53 changes: 53 additions & 0 deletions group_vars/htcondor/vars.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Configure nodes in the HTCondor cluster.
#
# These are the defaults for every host in the "htcondor" group; child groups
# (htcondor-manager, htcondor-submit, htcondor-secondary) and host_vars
# override individual values.
---
htcondor_server: "condor-cm.galaxyproject.eu"
htcondor_domain: bi.uni-freiburg.de
htcondor_port: 9618
# Version-like values are quoted so YAML keeps them as strings instead of
# floats (an unquoted 23.10 would silently become 23.1).
htcondor_version: "23.0"
htcondor_channel: "23.0"
# True only while the cluster uses port 9618.
htcondor_firewall_condor: "{{ htcondor_port == 9618 }}"
htcondor_firewall_nfs: false
# Role flags default to off; group_vars/htcondor-manager.yml and
# group_vars/htcondor-submit.yml switch on the ones they need.
htcondor_role_execute: false
htcondor_role_manager: false
htcondor_role_submit: false
# The secret itself lives in the encrypted vault.yml next to this file.
htcondor_password: "{{ vault_htcondor_password }}"

# Settings specific to the `condor_config.local.j2` configuration file.
htcondor_allow_write: "10.5.68.0/24, 132.230.223.0/24,132.230.153.0/28"
htcondor_allow_negotiator: "132.230.223.239,$(CONDOR_HOST),$(ALLOW_WRITE)"
htcondor_allow_administrator: "$(ALLOW_NEGOTIATOR)"
# 30 days, in seconds; the template compares it with `time() - JobStartDate`.
htcondor_system_periodic_hold: "{{ 30 * 24 * 60 * 60 }}"
# 2 days, in seconds; the template compares it with
# `time() - EnteredCurrentStatus` for jobs whose JobStatus is 5.
htcondor_system_periodic_remove: "{{ 2 * 24 * 60 * 60 }}"
# htcondor_network_interface -> Defined per-host in host_vars.
htcondor_master_update_interval: 150
htcondor_classad_lifetime: 300
htcondor_negotiator_interval: 15
htcondor_negotiator_update_interval: 100
htcondor_schedd_interval: 60
htcondor_job_start_count: 250
htcondor_job_start_delay: 0
htcondor_claim_worklife: 120
htcondor_negotiator_post_job_rank: "isUndefined(RemoteOwner) * (10000 - TotalLoadAvg)"

# Settings specific to the `usegalaxy_eu.htcondor` role (to be replaced with
# `grycap.htcondor`). Each one simply forwards the matching `htcondor_*` value
# above so both roles are configured from a single source.
condor_host: "{{ htcondor_server }}"
condor_fs_domain: "{{ htcondor_domain }}"
condor_uid_domain: "{{ htcondor_domain }}"
condor_allow_write: "{{ htcondor_allow_write }}"
# condor_daemons -> Defined per-host in host_vars.
condor_allow_negotiator: "{{ htcondor_allow_negotiator }}"
condor_allow_administrator: "{{ htcondor_allow_administrator }}"
condor_system_periodic_hold: "{{ htcondor_system_periodic_hold }}"
condor_system_periodic_remove: "{{ htcondor_system_periodic_remove }}"
condor_network_interface: "{{ htcondor_network_interface }}"
condor_extra: |
  MASTER_UPDATE_INTERVAL = {{ htcondor_master_update_interval }}
  CLASSAD_LIFETIME = {{ htcondor_classad_lifetime }}
  NEGOTIATOR_INTERVAL = {{ htcondor_negotiator_interval }}
  NEGOTIATOR_UPDATE_INTERVAL = {{ htcondor_negotiator_update_interval }}
  SCHEDD_INTERVAL = {{ htcondor_schedd_interval }}
  JOB_START_COUNT = {{ htcondor_job_start_count }}
  JOB_START_DELAY = {{ htcondor_job_start_delay }}
  CLAIM_WORKLIFE = {{ htcondor_claim_worklife }}
  NEGOTIATOR_POST_JOB_RANK = {{ htcondor_negotiator_post_job_rank }}
10 changes: 10 additions & 0 deletions group_vars/htcondor/vault.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
$ANSIBLE_VAULT;1.1;AES256

Check warning on line 1 in group_vars/htcondor/vault.yml

View workflow job for this annotation

GitHub Actions / Lint

1:1 [document-start] missing document start "---"
36336166336332656436376537343036353234366164616236393139313932343538313133373639
3064333637333539353566396361666362666539353231360a646430356366343632633637326462
39333232646363656438316533666664613935353336313064323038313564383734373433656330
3161396636623764660a636332303565396630666134626235636363636434623537333933653537
37383165643433633630353961623930653139653132303235306539613332346662323764356563
65303062333738616266383339366165643264633038323533306365623034656563333731393465
66386263353433303832363936323138386637636366663338336263323835663730616639393831
32333161633131323534306565626530616364386261646439336436303834386265396161333133
3130
27 changes: 0 additions & 27 deletions group_vars/sn06.yml
Original file line number Diff line number Diff line change
Expand Up @@ -222,33 +222,6 @@ galaxy_systemd_memory_limit: 120
galaxy_systemd_memory_limit_handler: 30
galaxy_systemd_memory_limit_workflow: 15

# HTCondor
condor_host: "condor-cm.galaxyproject.eu"
condor_fs_domain: bi.uni-freiburg.de
condor_uid_domain: bi.uni-freiburg.de
condor_allow_write: "10.5.68.0/24, 132.230.223.0/24,132.230.153.0/28"
condor_daemons:
- COLLECTOR
- NEGOTIATOR
- MASTER
- SCHEDD
condor_allow_negotiator: "132.230.223.239,$(CONDOR_HOST),$(ALLOW_WRITE)"
condor_allow_administrator: "$(ALLOW_NEGOTIATOR)"

condor_system_periodic_hold: "{{ 30 * 24 * 60 * 60 }}"
condor_system_periodic_remove: "{{ 2 * 24 * 60 * 60 }}"
condor_network_interface: ens802f0.223
condor_extra: |
MASTER_UPDATE_INTERVAL = 150
CLASSAD_LIFETIME = 300
NEGOTIATOR_INTERVAL = 15
NEGOTIATOR_UPDATE_INTERVAL = 100
SCHEDD_INTERVAL = 60
JOB_START_COUNT = 250
JOB_START_DELAY = 0
CLAIM_WORKLIFE = 120
NEGOTIATOR_POST_JOB_RANK = isUndefined(RemoteOwner) * (10000 - TotalLoadAvg)
# gie_proxy
gie_proxy_dir: "{{ galaxy_root }}/gie-proxy/proxy"
gie_proxy_git_version: main
Expand Down
2 changes: 2 additions & 0 deletions host_vars/build.galaxyproject.eu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
---
# Per-host value: rendered as NETWORK_INTERFACE by
# templates/htcondor/condor_config.local.j2 and forwarded to
# `condor_network_interface` in group_vars/htcondor/vars.yml.
htcondor_network_interface: ens802f0.223
2 changes: 2 additions & 0 deletions host_vars/nspawn-htcondor.sn06.galaxyproject.eu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
---
# Per-host value: rendered as NETWORK_INTERFACE by
# templates/htcondor/condor_config.local.j2 and forwarded to
# `condor_network_interface` in group_vars/htcondor/vars.yml.
htcondor_network_interface: ens802f0.223
9 changes: 9 additions & 0 deletions host_vars/sn06.galaxyproject.eu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
# Interface HTCondor uses on this host; group_vars/htcondor/vars.yml forwards
# it to `condor_network_interface`.
htcondor_network_interface: ens802f0.223

# Daemon list consumed by the `usegalaxy_eu.htcondor` role (kept per-host, see
# the `condor_daemons` note in group_vars/htcondor/vars.yml).
condor_daemons:
  - COLLECTOR
  - NEGOTIATOR
  - MASTER
  - SCHEDD
50 changes: 43 additions & 7 deletions hosts
Original file line number Diff line number Diff line change
Expand Up @@ -76,14 +76,50 @@ maintenance.galaxyproject.eu
[all:vars]
ansible_ssh_user=centos

# NOTE(review): this hunk comes from a rendered diff (43 additions, 7 deletions
# for this file) whose +/- markers are missing, so pre- and post-commit lines
# are interleaved. The [central-manager*] sections look like the pre-commit
# side -- confirm against the repository before using this text as inventory.
[central-manager]
manager.vgcn.galaxyproject.eu ansible_ssh_user=root
# Umbrella group for every HTCondor node; its variables come from
# group_vars/htcondor/.
[htcondor:children]
htcondor-manager
htcondor-submit
htcondor-secondary

[central-manager-secondary]
manager-secondary.galaxyproject.eu ansible_host=127.0.0.1 ansible_port=2222 ansible_ssh_user=root ansible_ssh_common_args='-o HostKeyAlias=manager-secondary.galaxyproject.eu -o ProxyCommand="ssh -W %h:%p -q [email protected]"'
[htcondor-manager]
sn06.galaxyproject.eu

[htcondor-manager:children]
htcondor-secondary-manager

# ansible_group_priority decides which group wins when groups at the same
# level define the same variable: the higher number takes precedence.
[htcondor-manager:vars]
ansible_group_priority=2

[htcondor-submit]
sn06.galaxyproject.eu

[htcondor-submit:children]
htcondor-secondary-submit

[htcondor-submit:vars]
ansible_group_priority=2

[central-manager-secondary-host]
sn06.galaxyproject.eu ansible_ssh_user=root
# Secondary cluster; group_vars/htcondor-secondary/vars.yml replaces the
# server and port of the "htcondor" group for these nodes.
[htcondor-secondary:children]
htcondor-secondary-manager
htcondor-secondary-submit

[htcondor-secondary:vars]
ansible_group_priority=3

[htcondor-secondary-manager]
build.galaxyproject.eu ansible_ssh_user=root

[htcondor-secondary-manager:vars]
ansible_group_priority=4

# Reached on 127.0.0.1:2222 through the SSH ProxyCommand below.
# NOTE(review): "[email protected]" appears to be an e-mail-obfuscation
# placeholder left by the page scrape in place of the jump user@host --
# restore the real value from the repository.
[htcondor-secondary-submit]
nspawn-htcondor.sn06.galaxyproject.eu ansible_host=127.0.0.1 ansible_port=2222 ansible_ssh_user=root ansible_ssh_common_args='-o HostKeyAlias=nspawn-htcondor.sn06.galaxyproject.eu -o ProxyCommand="ssh -W %h:%p -q [email protected]"'

[htcondor-secondary-submit:vars]
ansible_group_priority=4

# Target of the first play in htcondor.yml, which sets up the systemd-nspawn
# container.
[htcondor-secondary-submit-host]
sn06.galaxyproject.eu

[central-manager-secondary-host:vars]
[htcondor-secondary-submit-host:vars]
ansible_group_priority=2
41 changes: 37 additions & 4 deletions htcondor-secondary.yml → htcondor.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
---
- name: Systemd-nspawn container aimed at running a second HTCondor installation.
hosts: central-manager-secondary-host
hosts: htcondor-secondary-submit-host
handlers:
- name: Reload sshd # (in the container)
when: nspawn_ssh | default(no)
Expand All @@ -11,7 +11,7 @@
changed_when: true
vars_files:
- mounts/mountpoints.yml
- secret_group_vars/central-manager-secondary-host.yml
- secret_group_vars/htcondor-secondary-submit-host.yml
pre_tasks:
# Because it is already disabled for sn06 and this setup is needed just
# temporarily.
Expand Down Expand Up @@ -177,5 +177,38 @@
key: "[127.0.0.1]:{{ nspawn_ssh_config.Port }} {{ nspawn_ssh_host_key.content | b64decode }}"
when: nspawn_ssh_host_trust_container

- name: Secondary HTCondor 10 cluster.
hosts: central-manager-secondary
# Configure every host of the "htcondor" inventory group except sn06: write
# /etc/condor/condor_config.local from the repository template, then apply
# the `grycap.htcondor` role.
- name: HTCondor cluster.
  hosts: htcondor:!sn06.galaxyproject.eu
  handlers:
    - name: Reload HTCondor
      # Reload only when the condor unit exists and is running.
      # `service_facts` is registered by the "Check if HTCondor is running."
      # pre-task; its `services` dictionary is keyed by unit name, so the
      # membership test must use 'condor.service' -- the same key that is
      # dereferenced in the second condition. (The previous test looked for
      # 'condor_service', which never matches, so the handler never ran.)
      when:
        - "'condor.service' in service_facts.ansible_facts.services"
        - "service_facts.ansible_facts.services['condor.service'].state == 'running'"
      become: true
      ansible.builtin.service:
        name: condor
        state: reloaded
  pre_tasks:
    - name: Ensure the HTCondor configuration directory exists.
      become: true
      ansible.builtin.file:
        path: /etc/condor
        state: directory
        owner: root
        group: root
        mode: "0755"

    - name: Template HTCondor configuration.
      become: true
      ansible.builtin.template:
        src: htcondor/condor_config.local.j2
        dest: /etc/condor/condor_config.local
        owner: root
        group: root
        mode: "0644"
      notify: Reload HTCondor

    # Registered before the handlers are flushed at the end of pre_tasks, so
    # the "Reload HTCondor" handler can inspect the result.
    - name: Check if HTCondor is running.
      ansible.builtin.service_facts:
      register: service_facts
  roles:
    - grycap.htcondor
3 changes: 3 additions & 0 deletions requirements.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,9 @@ roles:
version: 0.0.1
- name: usegalaxy_eu.htcondor
version: 1.0.1
- name: grycap.htcondor
src: https://github.com/kysrpex/grycap-ansible-role-htcondor
version: d9a4aab0052dfb31d48c986d39a7f5e3692abba4
- name: usegalaxy-eu.update-hosts
src: https://github.com/usegalaxy-eu/ansible-update-hosts
version: 0.2.0
Expand Down
52 changes: 52 additions & 0 deletions templates/htcondor/condor_config.local.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
{#
  Local HTCondor configuration. The "Template HTCondor configuration." task of
  htcondor.yml renders this file to /etc/condor/condor_config.local. Values
  come from group_vars/htcondor/vars.yml, with overrides from the
  htcondor-secondary group and from host_vars.
#}
# Networking
CONDOR_HOST = {{ htcondor_server }}
COLLECTOR_HOST = $(CONDOR_HOST):{{ htcondor_port }}
SHARED_PORT_PORT = {{ htcondor_port }}
{# Nodes of the secondary cluster turn off the UDP command socket and send collector updates over TCP. #}
{% if "htcondor-secondary" in group_names %}
WANT_UDP_COMMAND_SOCKET = False
UPDATE_COLLECTOR_WITH_TCP = True
UPDATE_VIEW_COLLECTOR_WITH_TCP = True
{% endif %}
{% if htcondor_network_interface is defined %}
NETWORK_INTERFACE = {{ htcondor_network_interface }}
{% endif %}

# Security
ALLOW_WRITE = {{ htcondor_allow_write }}
ALLOW_READ = $(ALLOW_WRITE)
ALLOW_NEGOTIATOR = {{ htcondor_allow_negotiator }}
{% if htcondor_allow_administrator is defined %}
ALLOW_ADMINISTRATOR = {{ htcondor_allow_administrator }}
{% endif %}
ALLOW_OWNER = $(ALLOW_ADMINISTRATOR)
ALLOW_CLIENT = *
{# The group vars only define htcondor_domain, so fall back to it unless a more specific value is provided. #}
FILESYSTEM_DOMAIN = {{ htcondor_fs_domain | default(htcondor_domain) }}
UID_DOMAIN = {{ htcondor_uid_domain | default(htcondor_domain) }}
TRUST_UID_DOMAIN = True
SOFT_UID_DOMAIN = True
SEC_CLIENT_AUTHENTICATION_METHODS = IDTOKENS, FS
SEC_READ_AUTHENTICATION_METHODS = IDTOKENS, FS

# Job management
{# All three expressions are emitted only when a hold limit is configured; the remove expression additionally needs htcondor_system_periodic_remove. #}
{% if htcondor_system_periodic_hold is defined %}
SYSTEM_PERIODIC_HOLD = \
  (JobStatus == 1 || JobStatus == 2) && \
  ((time() - JobStartDate) >= ({{ htcondor_system_periodic_hold }}))
SYSTEM_PERIODIC_HOLD_REASON = \
  ifThenElse(((time() - JobStartDate) >= ({{ htcondor_system_periodic_hold }})), \
             "Maximum wallclock time exceeded", \
             "Unspecified reason")
SYSTEM_PERIODIC_REMOVE = \
  (JobStatus == 5 && time() - EnteredCurrentStatus > {{ htcondor_system_periodic_remove }})
{% endif %}

# Scheduling
MASTER_UPDATE_INTERVAL = {{ htcondor_master_update_interval }}
CLASSAD_LIFETIME = {{ htcondor_classad_lifetime }}
NEGOTIATOR_INTERVAL = {{ htcondor_negotiator_interval }}
NEGOTIATOR_UPDATE_INTERVAL = {{ htcondor_negotiator_update_interval }}
SCHEDD_INTERVAL = {{ htcondor_schedd_interval }}
JOB_START_COUNT = {{ htcondor_job_start_count }}
JOB_START_DELAY = {{ htcondor_job_start_delay }}
CLAIM_WORKLIFE = {{ htcondor_claim_worklife }}
NEGOTIATOR_POST_JOB_RANK = {{ htcondor_negotiator_post_job_rank }}

0 comments on commit 4517c21

Please sign in to comment.