Skip to content

Commit

Permalink
Merge pull request #83 from pneerincx/develop
Browse files Browse the repository at this point in the history
Bugfixes and improved version of login checks
  • Loading branch information
erijpkema authored Mar 21, 2019
2 parents ef90c72 + 73f760f commit 93dd717
Show file tree
Hide file tree
Showing 7 changed files with 143 additions and 36 deletions.
1 change: 0 additions & 1 deletion group_vars/all/vars.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ auth_users:
uid: 1008
pub_keys: |
ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBDvx1ebTndL/HitD30uNpvESXWUAxT3j0e0CzrBUZ8fHDv+vZTbWBRtWbnLgCnVDPa3GclA1lpnvJD9JBjBhUa8= ger@ger-pc
robin:
comment: 'Robin Teeninga'
uid: 1009
Expand Down
2 changes: 1 addition & 1 deletion group_vars/talos-cluster/vars.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ vcompute_real_memory: 7822
vcompute_max_cpus_per_node: "{{ vcompute_sockets * vcompute_cores_per_socket - 2 }}"
vcompute_max_mem_per_node: "{{ vcompute_real_memory - vcompute_sockets * vcompute_cores_per_socket * 512 }}"
vcompute_local_disk: 0
vcompute_features: 'tmp02'
vcompute_features: 'tmp08'
vcompute_ethernet_interfaces:
- 'eth0'
- 'eth1'
Expand Down
9 changes: 6 additions & 3 deletions roles/logins/files/login_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,12 @@ login_actions () {
# but in the first case there are no SLURM related environment variables defined.
#

# SOURCE_HPC_ENV variable checking disabled (it is not set ) Egon 30-10-2018
#if [ ${TERM} == 'dumb' ] && [ -z ${SOURCE_HPC_ENV} ]; then
if [ ${TERM} == 'dumb' ]; then
#
# ToDo: fix this. As of CentOS 7.x, interactive sessions that eventually report ${TERM} == 'bash'
# still report ${TERM} == 'dumb' at the point where this script is executed in the PAM stack :(.
# This makes it impossible to distinguish an SFTP session from a Bash session.
#
if [ ${TERM} == 'dumb' ] && [ -z "${SOURCE_HPC_ENV:-}" ]; then
$LOGGER "debug: exiting because of dumb terminal"
exit 0
fi
Expand Down
12 changes: 11 additions & 1 deletion roles/logins/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,17 @@
dest: "/etc/pam-script.d/{{ item }}"
owner: root
group: root
state: link
#
# Login checks currently disabled,
# because error handling/reporting no longer works on CentOS >= 7.x,
# due to changes in the PAM stack.
# Login checks were only used to create Slurm accounts in the Slurm accounting DB.
# This functionality has been relocated to the Slurm job_submit.lua plugin,
# which now automatically creates accounts, users and associations of Slurm users to Slurm accounts
# upon job submission when they do not already exist.
#
# state: link
state: absent
with_items:
- login_checks.sh_ses_open
when: inventory_hostname in groups['cluster']
Expand Down
4 changes: 2 additions & 2 deletions roles/slurm-client/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,9 @@
group: root
mode: '0750'
- name: /var/spool/slurmd
owner: slurm
owner: root
group: root
mode: '0750'
mode: '0755'

- name: Deploy slurm.conf
template:
Expand Down
149 changes: 122 additions & 27 deletions roles/slurm/files/job_submit.lua
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,112 @@ QOS_TIME_LIMITS = {
--
--DEFAULT_WALLTIME = '1'

--
-- Make sure the submitting user is associated to the Slurm account named after the group.
-- Missing pieces are created on the fly: first the account (when absent), then the association.
--
-- @param uid    Numeric user ID of the submitting user; uid 0 (root) is skipped.
-- @param user   Login name of the submitting user.
-- @param group  Name of the group, which is also used as the Slurm account name.
-- @return true when the association already exists, was created, or the user is root;
--         false when creating the account or the association failed.
--
function ensure_user_has_slurm_association(uid, user, group)
    -- Root does not need an association.
    if uid == 0 then
        return true
    end

    slurm.log_debug("Checking assoc for user %s (uid=%u) in account for group %s...", user, uid, group)

    -- Nothing to do when the association is already present.
    if association_exists(user, group) then
        slurm.log_debug("Association of user %s to account %s already exists.", user, group)
        return true
    end

    -- The account must exist before a user can be associated to it.
    if not account_exists(group) then
        slurm.log_info("Account %s does not exist; creating one...", group)
        if not create_account(group) then
            return false
        end
    else
        slurm.log_debug("Account %s already exists.", group)
    end

    slurm.log_info("Association of user %s to account %s does not exist; creating one...", user, group)
    return create_association(user, group) and true or false
end

--
-- Check whether a Slurm account with the given name is listed by sacctmgr.
--
-- @param group  Name of the group / Slurm account to look up.
-- @return true when sacctmgr prints a line equal to group; false otherwise,
--         including when the sacctmgr command could not be started.
--
function account_exists(group)
    --
    -- The group name originates from user-supplied job paths, so neutralise single quotes
    -- before embedding it in a single-quoted shell argument: ' becomes '\''.
    --
    local quoted_group = string.gsub(group, "'", "'\\''")
    --
    -- Unfortunately, file handles returned by io.popen() cannot report the exit status in Lua <= 5.1.
    -- Should be reasonably safe here, since if we erroneously conclude the account doesn't exist,
    -- then we'll just try to add it.
    -- http://lua-users.org/lists/lua-l/2012-01/msg00364.html
    --
    local query = io.popen(string.format(
        "sacctmgr --parsable2 --noheader list accounts format=account account='%s'", quoted_group))
    if query == nil then
        return false
    end
    --
    -- Read all output (instead of returning early) so the child can finish cleanly,
    -- then close the handle to avoid leaking a pipe per submitted job.
    --
    local found = false
    for line in query:lines() do
        if line == group then
            found = true
        end
    end
    query:close()
    return found
end

--
-- Create a Slurm account for the given group using sacctmgr.
--
-- @param group  Name of the group; used as the name of the Slurm account to create.
-- @return true when sacctmgr reported success; false otherwise (the failure is logged
--         and also reported to the submitting user).
--
function create_account(group)
    --
    -- The group name originates from user-supplied job paths, so neutralise single quotes
    -- before embedding it in a single-quoted shell argument: ' becomes '\''.
    --
    local quoted_group = string.gsub(group, "'", "'\\''")
    local result, _, code = os.execute(string.format(
        "sacctmgr -i create account '%s' descr=scientists org=various parent=users fairshare=parent", quoted_group))
    --
    -- Normalise the result of os.execute():
    -- Lua 5.1 returns a numeric status; Lua >= 5.2 returns true or nil, followed by "exit"/"signal" and a code.
    --
    local retval
    if type(result) == "number" then
        retval = result
    elseif result == true then
        retval = 0
    else
        retval = tonumber(code) or -1
    end
    if retval ~= 0 then
        slurm.log_error("Failed to create account %s (exit status = %d).", group, retval)
        slurm.log_user("Failed to create account %s (exit status = %d). Contact an admin.", group, retval)
        return false
    else
        slurm.log_info("Created account for group %s.", group)
        return true
    end
end

--
-- Check whether sacctmgr lists an association of the given user to the given Slurm account.
--
-- @param user   Login name of the user.
-- @param group  Name of the group / Slurm account.
-- @return true when sacctmgr prints a line equal to "user|group"; false otherwise,
--         including when the sacctmgr command could not be started.
--
function association_exists(user, group)
    --
    -- Both values may originate from user-controlled input, so neutralise single quotes
    -- before embedding them in single-quoted shell arguments: ' becomes '\''.
    --
    local quoted_user  = string.gsub(user, "'", "'\\''")
    local quoted_group = string.gsub(group, "'", "'\\''")
    --
    -- Unfortunately, file handles returned by io.popen() cannot report the exit status in Lua <= 5.1.
    -- Should be reasonably safe here, since if we erroneously conclude the association doesn't exist,
    -- then we'll just try to add it.
    -- http://lua-users.org/lists/lua-l/2012-01/msg00364.html
    --
    local query = io.popen(string.format(
        "sacctmgr --parsable2 --noheader list associations format=user,account user='%s' account='%s'",
        quoted_user, quoted_group))
    if query == nil then
        return false
    end
    --
    -- Read all output (instead of returning early) so the child can finish cleanly,
    -- then close the handle to avoid leaking a pipe per submitted job.
    --
    local wanted = user .. '|' .. group
    local found = false
    for line in query:lines() do
        if line == wanted then
            found = true
        end
    end
    query:close()
    return found
end

--
-- Create a Slurm user associated to the given Slurm account using sacctmgr.
--
-- @param user   Login name of the user to create / associate.
-- @param group  Name of the group; used as the name of the Slurm account.
-- @return true when sacctmgr reported success; false otherwise (the failure is logged
--         and also reported to the submitting user).
--
function create_association(user,group)
    --
    -- Both values may originate from user-controlled input, so neutralise single quotes
    -- before embedding them in single-quoted shell arguments: ' becomes '\''.
    --
    local quoted_user  = string.gsub(user, "'", "'\\''")
    local quoted_group = string.gsub(group, "'", "'\\''")
    local result, _, code = os.execute(string.format(
        "sacctmgr -i create user name='%s' account='%s' fairshare=parent", quoted_user, quoted_group))
    --
    -- Normalise the result of os.execute():
    -- Lua 5.1 returns a numeric status; Lua >= 5.2 returns true or nil, followed by "exit"/"signal" and a code.
    --
    local retval
    if type(result) == "number" then
        retval = result
    elseif result == true then
        retval = 0
    else
        retval = tonumber(code) or -1
    end
    if retval ~= 0 then
        slurm.log_error("Failed to create association of user %s to account %s (exit status = %d).", user, group, retval)
        slurm.log_user("Failed to create association of user %s to account %s (exit status = %d). Contact an admin.", user, group, retval)
        return false
    else
        slurm.log_info("Created association of user %s to account %s.", user, group)
        return true
    end
end

function slurm_job_submit(job_desc, part_list, submit_uid)
--
-- Get details for the user who is trying to submit a job.
--
submit_user = posix.getpasswd(submit_uid)

--
-- Force jobs to share nodes when they don't consume all resources on a node.
--
if job_desc.shared == 0 then
job_desc.shared = 1
end

--
-- Check if the job does have a time limit specified.
-- For some reason (bug?), the nil value is passed as 4294967294.
Expand Down Expand Up @@ -87,6 +187,8 @@ function slurm_job_submit(job_desc, part_list, submit_uid)
--slurm.log_debug("Path to job *.err = %s.", tostring(job_desc.std_err))
--slurm.log_debug("Job's working dir = %s.", tostring(job_desc.work_dir))
local job_metadata = {job_desc.std_out, job_desc.std_err, job_desc.work_dir}
local group = nil
local lfs = nil
for inx,job_metadata_value in ipairs(job_metadata) do
if string.match(tostring(job_metadata_value), '^/home/') then
slurm.log_error(
Expand All @@ -99,31 +201,20 @@ function slurm_job_submit(job_desc, part_list, submit_uid)
"Rejecting job named %s from user %s (uid=%u).", tostring(job_desc.name), tostring(submit_user.name), job_desc.user_id)
return slurm.ERROR
end
local entitlement, group, lfs = string.match(tostring(job_metadata_value), '^/groups/([^/-]+)-([^/]+)/(tmp%d%d)/?')
if lfs == nil then
-- Temporary workaround for tmp02, which uses a symlink in /groups/..., that is resolved to the physical path by SLURM.
entitlement, group, lfs = string.match(tostring(job_metadata_value), '^/target/gpfs2/groups/([^/-]+)-([^/]+)/(tmp%d%d)/?')
end
if entitlement ~= nil and group ~= nill and lfs ~= nil then
slurm.log_debug("Found entitlement '%s' and LFS '%s' in job's metadata.", tostring(entitlement), tostring(lfs))
group, lfs = string.match(tostring(job_metadata_value), '^/groups/([^/]+)/(tmp%d%d)/?')
if group ~= nil and lfs ~= nil then
slurm.log_debug("Found group '%s' and LFS '%s' in job's metadata.", tostring(group), tostring(lfs))
if job_desc.features == nil or job_desc.features == '' then
job_desc.features = entitlement .. '&' .. lfs
slurm.log_debug("Job had no features yet; Assigned entitlement and LFS as first features: %s.", tostring(job_desc.features))
job_desc.features = lfs
slurm.log_debug("Job had no features yet; Assigned LFS as first feature: %s.", tostring(job_desc.features))
else
if not string.match(tostring(job_desc.features), entitlement) then
job_desc.features = job_desc.features .. '&' .. entitlement
slurm.log_debug("Appended entitlement %s to job's features.", tostring(entitlement))
else
slurm.log_debug("Job's features already contained entitlement %s.", tostring(entitlement))
end
if not string.match(tostring(job_desc.features), lfs) then
job_desc.features = job_desc.features .. '&' .. lfs
slurm.log_debug("Appended LFS %s to job's features.", tostring(lfs))
else
slurm.log_debug("Job's features already contained LFS %s.", tostring(lfs))
end
end
slurm.log_info("Job's features now contains: %s.", tostring(job_desc.features))
else
slurm.log_error(
"Job's working dir, *.err file or *.out file is not located in /groups/${group}/tmp*/...\n" ..
Expand All @@ -138,6 +229,20 @@ function slurm_job_submit(job_desc, part_list, submit_uid)
return slurm.ERROR
end
end
slurm.log_debug("Job's features contains: %s.", tostring(job_desc.features))
--
-- Check if the user submitting the job is associated to a Slurm account in the Slurm accounting database and
-- create the relevant Slurm account and/or Slurm user and/or association if it does not already exist.
-- Note: as slurm account we use the group that was found last while parsing job_metadata above.
--
if not ensure_user_has_slurm_association(submit_uid, tostring(submit_user.name), tostring(group)) then
slurm.log_error("Failed to create association in the Slurm accounting database for user %s in account/group %s", tostring(submit_user.name), tostring(group))
slurm.log_error("Rejecting job named %s from user %s (uid=%u).", tostring(job_desc.name), tostring(submit_user.name), job_desc.user_id)
slurm.log_user(
"Failed to create association in the Slurm accounting database. Contact an admin.\n" ..
"Rejecting job named %s from user %s (uid=%u).", tostring(job_desc.name), tostring(submit_user.name), job_desc.user_id)
return slurm.ERROR
end

--
-- Process final list of features:
Expand All @@ -161,7 +266,7 @@ function slurm_job_submit(job_desc, part_list, submit_uid)
job_desc.qos = 'ds'
end
end

--
-- Make sure we have a sanity checked base-QoS.
--
Expand Down Expand Up @@ -218,16 +323,6 @@ function slurm_job_submit(job_desc, part_list, submit_uid)
slurm.log_info("Assigned QoS %s to job named %s from user %s (uid=%u).", new_qos, job_desc.name, tostring(submit_user.name), job_desc.user_id)
end

--
-- Check if the user submitting the job is associated to a Slurm account in the Slurm accounting database and
-- create the relevant Slurm account and/or Slurm user and/or association if it does not already exist.
-- Skip this check for the root user.
--
if job_desc.user_id ~= 0 then
--submit_user_primary_group = posix.getgroup(submit_user.gid).name
--ensure_assoc_exists(submit_user.name, entitlement .. '-' .. group)
end

return slurm.SUCCESS

end
Expand Down
2 changes: 1 addition & 1 deletion roles/slurm/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@
- name: Make services reload their configs.
command: systemctl daemon-reload

- name: Make sure servcies are started.
- name: Make sure services are started.
systemd:
name: "{{item}}"
state: restarted
Expand Down

0 comments on commit 93dd717

Please sign in to comment.