Skip to content

Commit

Permalink
update: setup gpu console plugin
Browse files Browse the repository at this point in the history
  • Loading branch information
codekow committed Oct 21, 2023
1 parent b0e58bf commit b4e7da3
Show file tree
Hide file tree
Showing 9 changed files with 299 additions and 2 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
# Kustomization bundling the rendered console-plugin-nvidia-gpu manifests.
# Paths are relative to the directory that contains this file.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
  - templates/configmap.yaml
  - templates/consoleplugin.yaml
  - templates/deployment.yaml
  - templates/service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
---
# Source: console-plugin-nvidia-gpu/templates/configmap.yaml
# ConfigMap exposing a DCGM metrics list under the key `dcgm-metrics.csv`.
# Each CSV row is: <DCGM field>, <metric type>, <help text>.
apiVersion: v1
kind: ConfigMap
metadata:
  name: release-name-console-plugin-nvidia-gpu
  labels:
    helm.sh/chart: console-plugin-nvidia-gpu-0.2.3
    app.kubernetes.io/name: console-plugin-nvidia-gpu
    app.kubernetes.io/version: "latest"
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: console-plugin-nvidia-gpu
    # The rendered chart emitted `app.kubernetes.io/instance` twice
    # (`release-name`, then this value). Duplicate keys are invalid YAML and
    # strict parsers reject them; last-wins parsers kept this value, so it is
    # the one retained.
    app.kubernetes.io/instance: console-plugin-nvidia-gpu
    app.kubernetes.io/part-of: console-plugin-nvidia-gpu
data:
  dcgm-metrics.csv: |
    DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, gpu utilization.
    DCGM_FI_DEV_MEM_COPY_UTIL, gauge, mem utilization.
    DCGM_FI_DEV_ENC_UTIL, gauge, enc utilization.
    DCGM_FI_DEV_DEC_UTIL, gauge, dec utilization.
    DCGM_FI_DEV_POWER_USAGE, gauge, power usage.
    DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX, gauge, power mgmt limit.
    DCGM_FI_DEV_GPU_TEMP, gauge, gpu temp.
    DCGM_FI_DEV_SM_CLOCK, gauge, sm clock.
    DCGM_FI_DEV_MAX_SM_CLOCK, gauge, max sm clock.
    DCGM_FI_DEV_MEM_CLOCK, gauge, mem clock.
    DCGM_FI_DEV_MAX_MEM_CLOCK, gauge, max mem clock.
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
---
# Source: console-plugin-nvidia-gpu/templates/consoleplugin.yaml
# Registers the NVIDIA GPU console plugin and tells the console which Service
# serves the plugin assets.
apiVersion: console.openshift.io/v1alpha1
kind: ConsolePlugin
metadata:
  name: release-name-console-plugin-nvidia-gpu
  labels:
    helm.sh/chart: console-plugin-nvidia-gpu-0.2.3
    app.kubernetes.io/name: console-plugin-nvidia-gpu
    app.kubernetes.io/version: "latest"
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: console-plugin-nvidia-gpu
    # The rendered chart emitted `app.kubernetes.io/instance` twice
    # (`release-name`, then this value). Duplicate keys are invalid YAML;
    # the last value is the one last-wins parsers kept, so it is retained.
    app.kubernetes.io/instance: console-plugin-nvidia-gpu
    app.kubernetes.io/part-of: console-plugin-nvidia-gpu
spec:
  displayName: 'Console Plugin NVIDIA GPU Template'
  service:
    name: release-name-console-plugin-nvidia-gpu
    # Was `sandbox`, the namespace that happened to be active when the chart
    # was rendered. The chart is installed into nvidia-gpu-operator elsewhere
    # in this repo, so the plugin Service is expected there.
    # NOTE(review): confirm this matches the namespace the Service lands in.
    namespace: nvidia-gpu-operator
    port: 9443
    basePath: '/'
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
---
# Source: console-plugin-nvidia-gpu/templates/deployment.yaml
# Single-replica Deployment of the console plugin container, serving on 9443
# with the serving certificate mounted from the `plugin-serving-cert` Secret.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: release-name-console-plugin-nvidia-gpu
  labels:
    helm.sh/chart: console-plugin-nvidia-gpu-0.2.3
    app.kubernetes.io/name: console-plugin-nvidia-gpu
    app.kubernetes.io/version: "latest"
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: console-plugin-nvidia-gpu
    # The rendered chart emitted `app.kubernetes.io/instance` twice
    # (`release-name`, then this value). Duplicate keys are invalid YAML;
    # the last value is the one last-wins parsers kept, so it is retained.
    # This only affects the Deployment's own labels, not the selector below.
    app.kubernetes.io/instance: console-plugin-nvidia-gpu
    app.kubernetes.io/part-of: console-plugin-nvidia-gpu
    app.openshift.io/runtime-namespace: console-plugin-nvidia-gpu
spec:
  replicas: 1
  # Selector and pod-template labels must stay identical to each other.
  selector:
    matchLabels:
      app.kubernetes.io/name: console-plugin-nvidia-gpu
      app.kubernetes.io/instance: release-name
  template:
    metadata:
      labels:
        app.kubernetes.io/name: console-plugin-nvidia-gpu
        app.kubernetes.io/instance: release-name
    spec:
      securityContext:
        runAsNonRoot: true
      containers:
        - name: console-plugin-nvidia-gpu
          image: "quay.io/edge-infrastructure/console-plugin-nvidia-gpu:latest"
          imagePullPolicy: Always
          securityContext:
            allowPrivilegeEscalation: false
          ports:
            - containerPort: 9443
              protocol: TCP
          volumeMounts:
            - name: plugin-serving-cert
              readOnly: true
              mountPath: /var/serving-cert
          resources: {}
      volumes:
        - name: plugin-serving-cert
          secret:
            secretName: plugin-serving-cert
            defaultMode: 420  # decimal for octal 0644
        # NOTE(review): no container mounts `nginx-conf`, and no ConfigMap of
        # that name is defined alongside this manifest - confirm whether this
        # volume is still needed.
        - name: nginx-conf
          configMap:
            name: nginx-conf
            defaultMode: 420  # decimal for octal 0644
      restartPolicy: Always
      dnsPolicy: ClusterFirst
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 25%
      maxSurge: 25%
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
---
# Source: console-plugin-nvidia-gpu/templates/service.yaml
# ClusterIP Service fronting the plugin pods on 9443. The annotation names the
# Secret (`plugin-serving-cert`) that the Deployment mounts as its serving cert.
apiVersion: v1
kind: Service
metadata:
  name: release-name-console-plugin-nvidia-gpu
  labels:
    helm.sh/chart: console-plugin-nvidia-gpu-0.2.3
    app.kubernetes.io/name: console-plugin-nvidia-gpu
    app.kubernetes.io/version: "latest"
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: console-plugin-nvidia-gpu
    # The rendered chart emitted `app.kubernetes.io/instance` twice
    # (`release-name`, then this value). Duplicate keys are invalid YAML;
    # the last value is the one last-wins parsers kept, so it is retained.
    # This only affects the Service's own labels, not the selector below.
    app.kubernetes.io/instance: console-plugin-nvidia-gpu
    app.kubernetes.io/part-of: console-plugin-nvidia-gpu
  annotations:
    # NOTE(review): newer OpenShift docs use the `service.beta.openshift.io/`
    # prefix for this annotation - confirm the alpha form is still honored on
    # the target cluster version.
    service.alpha.openshift.io/serving-cert-secret-name: plugin-serving-cert
spec:
  ports:
    - name: 9443-tcp
      protocol: TCP
      port: 9443
      targetPort: 9443
  # Must match the pod-template labels of the plugin Deployment.
  selector:
    app.kubernetes.io/name: console-plugin-nvidia-gpu
    app.kubernetes.io/instance: release-name
  type: ClusterIP
  sessionAffinity: None
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
---
# Source: console-plugin-nvidia-gpu/templates/tests/test-plugin-service.yaml
# Helm test hook: a one-shot curl pod that fetches the plugin manifest from
# the plugin Service over HTTPS and fails if the request does not succeed.
apiVersion: v1
kind: Pod
metadata:
  name: "release-name-service-test"
  annotations:
    "helm.sh/hook": test
spec:
  containers:
    - name: release-name-service-test
      image: quay.io/cilium/alpine-curl:v1.4.0
      imagePullPolicy: "Always"
      args:
        - -XGET
        - --silent
        - --fail
        # Certificate verification is skipped for this request.
        - --insecure
        # The URL previously targeted the `sandbox` namespace, which was just
        # the namespace active when the chart was rendered. The chart is
        # installed into nvidia-gpu-operator elsewhere in this repo.
        # NOTE(review): confirm the Service namespace.
        - https://release-name-console-plugin-nvidia-gpu.nvidia-gpu-operator.svc:9443/plugin-manifest.json
  restartPolicy: Never
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ commonLabels:
resources:
- cluster-policy.yaml
- device-plugin-config.yaml
- setup-console-plugin-job.yaml
- setup-dashboard-job.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
---
# Permissions for the nvidia-gpu-console-setup Job's service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: nvidia-gpu-console-setup
rules:
  # Full access to ConfigMaps (core API group).
  - apiGroups:
      - ''
    resources:
      - configmaps
    verbs:
      - '*'
  # The cluster-wide console operator config, where plugins are enabled.
  - apiGroups:
      - operator.openshift.io
    resources:
      - consoles
    resourceNames:
      - cluster
    verbs:
      - '*'
  # The ConsolePlugin object for the NVIDIA GPU plugin.
  - apiGroups:
      - console.openshift.io
    resources:
      - consoleplugins
    resourceNames:
      - release-name-console-plugin-nvidia-gpu
    verbs:
      - '*'
  # The setup Job in this file runs
  # `oc patch clusterpolicies.nvidia.com gpu-cluster-policy`; without this
  # rule that call has no grant from this role. Scoped to the single object
  # and to the verbs the patch needs.
  - apiGroups:
      - nvidia.com
    resources:
      - clusterpolicies
    resourceNames:
      - gpu-cluster-policy
    verbs:
      - get
      - patch
---
# Binds the nvidia-gpu-console-setup ClusterRole, cluster-wide, to the
# service account of the same name in the nvidia-gpu-operator namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: nvidia-gpu-console-setup
subjects:
  - kind: ServiceAccount
    name: nvidia-gpu-console-setup
    namespace: nvidia-gpu-operator
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: nvidia-gpu-console-setup
---
# Namespaced binding of the same ClusterRole inside openshift-config-managed,
# for the service account that lives in nvidia-gpu-operator.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: nvidia-gpu-console-setup
  namespace: openshift-config-managed
subjects:
  - kind: ServiceAccount
    name: nvidia-gpu-console-setup
    namespace: nvidia-gpu-operator
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: nvidia-gpu-console-setup
---
# Identity the console-setup Job runs as; referenced by the bindings and the
# Job's serviceAccountName in this file.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: nvidia-gpu-operator
  name: nvidia-gpu-console-setup
---
# Argo CD Sync-hook Job that installs the NVIDIA GPU console plugin via Helm,
# enables it in the OpenShift console operator config, and points the GPU
# ClusterPolicy's DCGM exporter at the plugin's ConfigMap.
apiVersion: batch/v1
kind: Job
metadata:
  annotations:
    argocd.argoproj.io/hook: Sync
    # argocd.argoproj.io/hook-delete-policy: HookSucceeded
  generateName: nvidia-gpu-console-setup-
  name: nvidia-gpu-console-setup
  namespace: nvidia-gpu-operator
spec:
  template:
    spec:
      containers:
        - name: nvidia-gpu-console-setup
          image: image-registry.openshift-image-registry.svc:5000/openshift/cli:latest
          env:
            # Exposes the pod's own namespace to the script environment.
            - name: NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
          command:
            - /bin/bash
            - -c
            - |
              #!/usr/bin/env bash
              set -x
              cd /tmp

              nvidia_setup_console_plugin(){
                # Install/upgrade the chart; bail out when helm is unavailable.
                if which helm; then
                  helm repo add rh-ecosystem-edge https://rh-ecosystem-edge.github.io/console-plugin-nvidia-gpu || true
                  helm repo update > /dev/null 2>&1
                  helm upgrade --install -n nvidia-gpu-operator console-plugin-nvidia-gpu rh-ecosystem-edge/console-plugin-nvidia-gpu > /dev/null 2>&1
                else
                  return
                fi

                # Enable the plugin without clobbering plugins that are already
                # enabled. The previous check tested only the exit status of
                # `oc get`, which succeeds whenever the console resource exists,
                # so it always replaced the whole list with a merge patch.
                local enabled_plugins
                if enabled_plugins=$(oc get consoles.operator.openshift.io cluster --output=jsonpath="{.spec.plugins}"); then
                  if [ -z "${enabled_plugins}" ]; then
                    # No plugins set yet: create the list.
                    oc patch consoles.operator.openshift.io cluster --patch '{ "spec": { "plugins": ["console-plugin-nvidia-gpu"] } }' --type=merge
                  elif ! echo "${enabled_plugins}" | grep -q console-plugin-nvidia-gpu; then
                    # List exists and lacks the plugin: append to it.
                    oc patch consoles.operator.openshift.io cluster --patch '[{"op": "add", "path": "/spec/plugins/-", "value": "console-plugin-nvidia-gpu" }]' --type=json
                  fi
                fi

                oc patch clusterpolicies.nvidia.com gpu-cluster-policy --patch '{ "spec": { "dcgmExporter": { "config": { "name": "console-plugin-nvidia-gpu" } } } }' --type=merge
                oc -n nvidia-gpu-operator get deploy -l app.kubernetes.io/name=console-plugin-nvidia-gpu
              }

              nvidia_setup_console_plugin
      restartPolicy: Never
      terminationGracePeriodSeconds: 30
      serviceAccount: nvidia-gpu-console-setup
      serviceAccountName: nvidia-gpu-console-setup
15 changes: 13 additions & 2 deletions scripts/library/nvidia.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,21 @@ nvidia_setup_dashboard_monitor(){
rm dcgm-exporter-dashboard.json
}

nvidia_setup_dashboard_admin(){
# Install the console-plugin-nvidia-gpu chart and render its templates into
# the gpu-operator instance base directory of this repo.
# Returns 1 when helm is not on PATH.
nvidia_setup_console_plugin_dump_helm(){
  which helm || return 1

  # `repo add` fails when the repo is already registered; that is fine.
  helm repo add rh-ecosystem-edge https://rh-ecosystem-edge.github.io/console-plugin-nvidia-gpu || true
  helm repo update > /dev/null 2>&1
  helm upgrade --install -n nvidia-gpu-operator console-plugin-nvidia-gpu rh-ecosystem-edge/console-plugin-nvidia-gpu > /dev/null 2>&1

  # Render with an explicit namespace. Without it, namespace-dependent fields
  # in the output take whatever namespace the caller's kube context points at,
  # which is how a stray namespace ends up in the committed manifests.
  helm template \
    --namespace nvidia-gpu-operator \
    --output-dir components/operators/gpu-operator-certified/instance/base \
    rh-ecosystem-edge/console-plugin-nvidia-gpu
}

nvidia_setup_console_plugin(){
if which helm; then
helm repo add rh-ecosystem-edge https://rh-ecosystem-edge.github.io/console-plugin-nvidia-gpu || true
helm repo update > /dev/null 2>&1
helm upgrade --install -n nvidia-gpu-operator console-plugin-nvidia-gpu rh-ecosystem-edge/console-plugin-nvidia-gpu > /dev/null 2>&1
else
return
fi

if oc get consoles.operator.openshift.io cluster --output=jsonpath="{.spec.plugins}" >/dev/null; then
oc patch consoles.operator.openshift.io cluster --patch '{ "spec": { "plugins": ["console-plugin-nvidia-gpu"] } }' --type=merge
Expand Down

0 comments on commit b4e7da3

Please sign in to comment.