Skip to content

Commit

Permalink
add slurm agent install shell
Browse files Browse the repository at this point in the history
  • Loading branch information
chriskery committed Nov 12, 2023
1 parent a291435 commit c15d7b0
Show file tree
Hide file tree
Showing 26 changed files with 1,251 additions and 445 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ make install
###Build and push your image to the location specified by `IMG`.
make docker-build docker-push IMG=<some-registry>/slurm-agent-bridge-operator:tag
###Deploy the controller to the cluster with the image specified by `IMG`.
make deploy IMG=<some-registry>/slurm-agent-bridge-operator:tag
make manager IMG=<some-registry>/slurm-agent-bridge-operator:tag
```

3. Build and install slurm agent on the slurm login node as a proxy between kubernetes and slurm cluster.
Expand Down
11 changes: 7 additions & 4 deletions apis/kubecluster.org/v1alpha1/affinity.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,13 @@ var DefaultTolerations = []corev1.Toleration{

// Resources describes job resources which will be transformed into k8s pod affinity.
type Resources struct {
Nodes int64
MemPerNode int64
CPUPerNode int64
WallTime time.Duration
Nodes int64
Array string `json:"array,omitempty"`
CpusPerTask int64 `json:"cpusPerTask,omitempty"`
Ntasks int64 `json:"ntasks,omitempty"`
NtasksPerNode int64 `json:"ntasksPerNode,omitempty"`
MemPerCpu int64 `json:"mem,omitempty"`
WallTime time.Duration
}

// AffinityForResources returns k8s affinity for requested resources
Expand Down
9 changes: 9 additions & 0 deletions apis/kubecluster.org/v1alpha1/slurmbridgejob_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,15 @@ type SlurmBridgeJobSpec struct {
RunAsUser *int64 `json:"runAsUser,omitempty"`
RunAsGroup *int64 `json:"runAsGroup,omitempty"`

Array string `json:"array,omitempty"`
CpusPerTask int64 `json:"cpusPerTask,omitempty"`
Ntasks int64 `json:"ntasks,omitempty"`
NtasksPerNode int64 `json:"ntasksPerNode,omitempty"`
Nodes int64 `json:"nodes,omitempty"`
WorkingDir string `json:"workingDir,omitempty"`
MemPerCpu int64 `json:"memPerCpu,omitempty"`
Gres string `json:"gres,omitempty"`
Licenses string `json:"licenses,omitempty"`
// Result may be specified for an optional result-fetcher collection step.
// When specified, after job is completed all result-fetcher will be downloaded from Slurm
// cluster with respect to this configuration.
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ require (
github.com/virtual-kubelet/virtual-kubelet v1.10.0
go.etcd.io/etcd/client/pkg/v3 v3.5.9
go.opencensus.io v0.24.0
golang.org/x/sync v0.2.0
golang.org/x/sys v0.13.0
google.golang.org/grpc v1.55.0
google.golang.org/protobuf v1.31.0
Expand Down Expand Up @@ -128,7 +129,6 @@ require (
golang.org/x/mod v0.10.0 // indirect
golang.org/x/net v0.17.0 // indirect
golang.org/x/oauth2 v0.8.0 // indirect
golang.org/x/sync v0.2.0 // indirect
golang.org/x/term v0.13.0 // indirect
golang.org/x/text v0.13.0 // indirect
golang.org/x/time v0.3.0 // indirect
Expand Down
23 changes: 23 additions & 0 deletions manifests/crd/bases/kubecluster.org_slurmbridgejobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,27 @@ spec:
spec:
description: SlurmBridgeJobSpec defines the desired state of SlurmBridgeJob
properties:
array:
type: string
cpusPerTask:
format: int64
type: integer
gres:
type: string
licenses:
type: string
memPerCpu:
format: int64
type: integer
nodes:
format: int64
type: integer
ntasks:
format: int64
type: integer
ntasksPerNode:
format: int64
type: integer
partition:
type: string
result:
Expand Down Expand Up @@ -1646,6 +1667,8 @@ spec:
type: integer
sbatchScript:
type: string
workingDir:
type: string
required:
- partition
- sbatchScript
Expand Down
5 changes: 3 additions & 2 deletions manifests/default/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# Adds namespace to all resources.
namespace: slurm-agent-bridge-operator-system
namespace: slurm-bridge-operator-system

# Value of this field is prepended to the
# names of all resources, e.g. a deployment named
# "wordpress" becomes "alices-wordpress".
# Note that it should also match with the prefix (text before '-') of the namespace
# field above.
namePrefix: slurm-agent-bridge-operator-
namePrefix: slurm-bridge-operator-

# Labels to add to all resources and selectors.
#labels:
Expand All @@ -18,6 +18,7 @@ resources:
- ../crd
- ../rbac
- ../manager
- ../deploy
# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in
# crd/kustomization.yaml
#- ../webhook
Expand Down
30 changes: 30 additions & 0 deletions manifests/deploy/configurator-role.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
---
# ClusterRole for the configurator component.
# Grants full read/write/watch access to core events and pods cluster-wide.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: configurator-role
rules:
  # Events (core API group): create/update/delete and watch.
  - apiGroups:
      - ""
    resources:
      - events
    verbs:
      - create
      - delete
      - get
      - list
      - patch
      - update
      - watch
  # Pods (core API group): create/update/delete and watch.
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - create
      - delete
      - get
      - list
      - patch
      - update
      - watch
19 changes: 19 additions & 0 deletions manifests/deploy/configurator-role_binding.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Binds the configurator-role ClusterRole to the configurator
# ServiceAccount in the operator's namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: configurator-rolebinding
  labels:
    app.kubernetes.io/name: clusterrolebinding
    app.kubernetes.io/instance: configurator-rolebinding
    app.kubernetes.io/component: rbac
    app.kubernetes.io/created-by: slurm-agent-bridge-operator
    app.kubernetes.io/part-of: slurm-agent-bridge-operator
    app.kubernetes.io/managed-by: kustomize
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: configurator-role
subjects:
  - kind: ServiceAccount
    name: configurator
    # "system" is rewritten by kustomize's namespace transformer on build.
    namespace: system
12 changes: 12 additions & 0 deletions manifests/deploy/configurator-service-account.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# ServiceAccount under which the configurator Deployment runs.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: configurator
  # "system" is rewritten by kustomize's namespace transformer on build.
  namespace: system
  labels:
    app.kubernetes.io/name: serviceaccount
    app.kubernetes.io/instance: configurator-sa
    app.kubernetes.io/component: rbac
    app.kubernetes.io/created-by: slurm-agent-bridge-operator
    app.kubernetes.io/part-of: slurm-agent-bridge-operator
    app.kubernetes.io/managed-by: kustomize
60 changes: 60 additions & 0 deletions manifests/deploy/configurator.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Deployment for the configurator component of the slurm bridge operator.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: configurator
  namespace: system
  labels:
    control-plane: configurator
    app.kubernetes.io/name: deployment
    app.kubernetes.io/instance: controller-manager
    app.kubernetes.io/component: manager
    app.kubernetes.io/created-by: slurm-agent-bridge-operator
    app.kubernetes.io/part-of: slurm-agent-bridge-operator
    app.kubernetes.io/managed-by: kustomize
spec:
  # FIX: select this component's own pods. The original selector used
  # control-plane: controller-manager, which also matches the manager
  # Deployment's pod template in the same namespace, so the two
  # Deployments would contend for each other's pods.
  selector:
    matchLabels:
      control-plane: configurator
  replicas: 1
  template:
    metadata:
      annotations:
        # FIX: the default container must name an existing container in
        # this pod; there is no "manager" container here.
        kubectl.kubernetes.io/default-container: configurator
      labels:
        control-plane: configurator
    spec:
      securityContext:
        runAsNonRoot: true
      containers:
        - command:
            - /configurator
          args:
            # FIX: flag and value must be a single "--flag=value" token
            # (or two separate list items). The original
            # "--endpoint 47.74.15.157" is passed as ONE argv element
            # containing a space, which flag parsers reject.
            - --endpoint=47.74.15.157
          image: configurator:latest
          name: configurator
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - "ALL"
          # NOTE(review): probes assume the configurator binary serves
          # /healthz and /readyz on port 8081 — confirm against the binary.
          livenessProbe:
            httpGet:
              path: /healthz
              port: 8081
            initialDelaySeconds: 15
            periodSeconds: 20
          readinessProbe:
            httpGet:
              path: /readyz
              port: 8081
            initialDelaySeconds: 5
            periodSeconds: 10
          resources:
            limits:
              cpu: 500m
              memory: 128Mi
            requests:
              cpu: 10m
              memory: 64Mi
      serviceAccountName: configurator
      terminationGracePeriodSeconds: 10
27 changes: 27 additions & 0 deletions manifests/deploy/install_slurm_agent.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env bash
#
# Build the slurm-agent binary and install it as a systemd service on the
# Slurm login node. Run from the repository root; requires Go and sudo.

# FIX: abort on any failed step instead of continuing with a half-done
# install (e.g. starting a service whose binary failed to build).
set -euo pipefail

export GOPATH=${HOME}/go
export PATH=${PATH}:/usr/local/go/bin:${GOPATH}/bin

go build -o bin/slurm-agent cmd/slurm-agent/slurm-agent.go
# FIX: /usr/local/bin is normally root-owned; every other privileged step
# in this script uses sudo, so the copy must too.
sudo cp bin/slurm-agent /usr/local/bin/slurm-agent

# Runtime directory for the agent.
sudo mkdir -p /var/run/slurm-agent

# NOTE(review): the heredoc sits inside single quotes, so ${HOME} is
# expanded by the root shell (i.e. /root), not by the invoking user,
# while the unit runs as User=slurm — confirm the intended directory.
sudo sh -c 'cat > /etc/systemd/system/slurm-agent.service <<EOF
[Unit]
Description=Slurm bridge operator slurm-agent
StartLimitIntervalSec=0
[Service]
Type=simple
Restart=always
RestartSec=30
User=slurm
Group=slurm
WorkingDirectory=${HOME}
ExecStart=/usr/local/bin/slurm-agent
EOF'

# FIX: systemd must reload unit files before a newly written unit can be
# started, and the service should also survive reboots (enable --now
# both enables and starts it).
sudo systemctl daemon-reload
sudo systemctl enable --now slurm-agent
sudo systemctl status slurm-agent
11 changes: 11 additions & 0 deletions manifests/deploy/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

# Manifests that make up the configurator deployment.
resources:
  - configurator.yaml
  - configurator-role.yaml
  - configurator-role_binding.yaml
  - configurator-service-account.yaml

# Redirect the in-manifest "configurator" image to the published registry
# location.
images:
  - name: configurator
    newName: docker.io/chriskery/configurator
    newTag: latest
6 changes: 6 additions & 0 deletions manifests/manager/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,8 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
  - manager.yaml

# Redirect the generic "controller" image used by manager.yaml to the
# published operator image.
images:
  - name: controller
    newName: docker.io/chriskery/slurm-bridge-operator
    newTag: latest
29 changes: 0 additions & 29 deletions manifests/manager/manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,35 +36,8 @@ spec:
labels:
control-plane: controller-manager
spec:
# TODO(user): Uncomment the following code to configure the nodeAffinity expression
# according to the platforms which are supported by your solution.
# It is considered best practice to support multiple architectures. You can
# build your manager image using the makefile target docker-buildx.
# affinity:
# nodeAffinity:
# requiredDuringSchedulingIgnoredDuringExecution:
# nodeSelectorTerms:
# - matchExpressions:
# - key: kubernetes.io/arch
# operator: In
# values:
# - amd64
# - arm64
# - ppc64le
# - s390x
# - key: kubernetes.io/os
# operator: In
# values:
# - linux
securityContext:
runAsNonRoot: true
# TODO(user): For common cases that do not require escalating privileges
# it is recommended to ensure that all your Pods/Containers are restrictive.
# More info: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted
# Please uncomment the following code if your project does NOT have to work on old Kubernetes
# versions < 1.19 or on vendors versions which do NOT support this field by default (i.e. Openshift < 4.11 ).
# seccompProfile:
# type: RuntimeDefault
containers:
- command:
- /manager
Expand All @@ -89,8 +62,6 @@ spec:
port: 8081
initialDelaySeconds: 5
periodSeconds: 10
# TODO(user): Configure the resources accordingly based on the project requirements.
# More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
resources:
limits:
cpu: 500m
Expand Down
56 changes: 56 additions & 0 deletions manifests/rbac/role.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
---
# ClusterRole for the operator manager. Grants access to core events and
# pods, and to the kubecluster.org SlurmBridgeJob custom resources,
# including their status and finalizers subresources.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: manager-role
rules:
  # Events (core API group).
  - apiGroups: [""]
    resources: [events]
    verbs: [create, delete, get, list, patch, update, watch]
  # Pods (core API group).
  - apiGroups: [""]
    resources: [pods]
    verbs: [create, delete, get, list, patch, update, watch]
  # SlurmBridgeJob custom resources.
  - apiGroups: [kubecluster.org]
    resources: [slurmbridgejobs]
    verbs: [create, delete, get, list, patch, update, watch]
  # Finalizers subresource: update only.
  - apiGroups: [kubecluster.org]
    resources: [slurmbridgejobs/finalizers]
    verbs: [update]
  # Status subresource: read and update.
  - apiGroups: [kubecluster.org]
    resources: [slurmbridgejobs/status]
    verbs: [get, patch, update]
Loading

0 comments on commit c15d7b0

Please sign in to comment.