-
Notifications
You must be signed in to change notification settings - Fork 14
/
nvidia.sh
85 lines (66 loc) · 3.67 KB
/
nvidia.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/bin/bash
#######################################
# Publish the NVIDIA DCGM exporter dashboard as an OpenShift ConfigMap.
#
# Downloads dcgm-exporter-dashboard.json from the NVIDIA/dcgm-exporter GitHub
# repository, creates the nvidia-dcgm-exporter-dashboard ConfigMap from it in
# the openshift-config-managed namespace, and sets the labels
# console.openshift.io/dashboard=true and
# console.openshift.io/odc-dashboard=true on that ConfigMap.
#
# The download goes to a private temporary directory (removed before
# returning) instead of the caller's working directory.
#
# Arguments: none
# Outputs:   the ConfigMap with its labels (oc get --show-labels) on stdout
# Returns:   non-zero if the temporary directory cannot be created or the
#            download fails (no oc command is run in that case); otherwise
#            the exit status of the final "oc get".
#######################################
nvidia_setup_dashboard_monitor() {
  local dashboard_file=dcgm-exporter-dashboard.json
  local dashboard_url=https://github.com/NVIDIA/dcgm-exporter/raw/main/grafana/dcgm-exporter-dashboard.json
  local cm_namespace=openshift-config-managed
  local cm_name=nvidia-dcgm-exporter-dashboard
  local work_dir
  local rc=0

  work_dir=$(mktemp -d) || return

  # The file keeps its original base name so --from-file produces the same
  # ConfigMap key as before.
  if curl -sLf -o "${work_dir}/${dashboard_file}" "${dashboard_url}"; then
    # "create" fails when the ConfigMap already exists; that is tolerated so
    # the labels below are still (re)applied.
    oc -n "${cm_namespace}" create configmap "${cm_name}" \
      --from-file="${work_dir}/${dashboard_file}" || true
    oc -n "${cm_namespace}" label configmap "${cm_name}" \
      "console.openshift.io/dashboard=true" --overwrite
    oc -n "${cm_namespace}" label configmap "${cm_name}" \
      "console.openshift.io/odc-dashboard=true" --overwrite
    oc -n "${cm_namespace}" get cm "${cm_name}" --show-labels
    rc=$?
  else
    rc=$?
    echo "failed to download ${dashboard_url}" >&2
  fi

  rm -rf -- "${work_dir}"
  return "${rc}"
}
#######################################
# Render the console-plugin-nvidia-gpu Helm chart into plain manifests.
#
# Runs "helm template" into ${GIT_ROOT}/scratch, moves the rendered templates
# into components/operators/gpu-operator-certified/operator/components/
# console-plugin (relative to the current directory), deletes the rendered
# "tests" directory and appends a "namespace: nvidia-gpu-operator" line after
# each matching "name: console-plugin-nvidia-gpu" line.
# Presumably this produces the manifests applied when helm is unavailable
# (original note: "alternative: if no helm") - confirm against callers.
#
# Globals:   GIT_ROOT (read) - must be non-empty
# Arguments: none
# Returns:   1 if helm is not available or GIT_ROOT is empty; the failing
#            status if the output directory, the render or the move fails;
#            otherwise the exit status of the final sed.
#######################################
nvidia_install_console_plugin_dump_helm() {
  local chart=console-plugin-nvidia-gpu
  local chart_repo=https://rh-ecosystem-edge.github.io/console-plugin-nvidia-gpu
  local output_path=components/operators/gpu-operator-certified/operator/components/console-plugin
  local scratch_dir
  local dump_path

  # Check prerequisites before touching the filesystem. "command -v" is a
  # shell builtin, so this does not depend on "which" being installed.
  command -v helm >/dev/null || return 1

  # An empty GIT_ROOT would make the rm -rf / helm output below operate on
  # /scratch at the filesystem root.
  if [ -z "${GIT_ROOT:-}" ]; then
    echo "GIT_ROOT must be set" >&2
    return 1
  fi

  scratch_dir="${GIT_ROOT}/scratch"
  dump_path="${scratch_dir}/${chart}/${chart}/templates"

  mkdir -p "${output_path}" || return
  rm -rf -- "${scratch_dir}/${chart}"

  helm repo add rh-ecosystem-edge "${chart_repo}" || true
  helm repo update > /dev/null 2>&1

  # NOTE(review): --release-name appears to be a boolean helm flag, so the
  # trailing value is presumably parsed as a second positional argument
  # (release NAME + CHART). The argument order is kept exactly as it was
  # because dump_path relies on the resulting directory layout - confirm.
  helm template \
    "${chart}" \
    --repo "${chart_repo}" \
    -n nvidia-gpu-operator \
    --output-dir "${scratch_dir}" \
    --release-name "${chart}" || return

  # Stop here if nothing was moved; otherwise sed would re-edit stale files.
  mv "${dump_path}/"* "${output_path}" || return
  rm -rf "${output_path}/tests"

  # The sed script is kept flush-left so its text is unchanged.
  # NOTE(review): the address matches exactly one leading space before
  # "name:" - confirm this matches the indentation helm actually emits.
  sed -i '
/^ name: console-plugin-nvidia-gpu/a \ namespace: nvidia-gpu-operator
' "${output_path}/"*
}
#######################################
# Install the console-plugin-nvidia-gpu console plugin.
#
# With helm available, installs/upgrades the rh-ecosystem-edge chart into the
# nvidia-gpu-operator namespace. Without helm, applies the kustomize
# directory from the demo-ai-gitops-catalog GitHub repository via oc.
#
# Arguments: none
# Outputs:   the resolved helm command (from "command -v") when helm exists,
#            plus whatever helm / oc print
# Returns:   exit status of the last command run; the helm branch is
#            best-effort and always ends with status 0.
#######################################
nvidia_install_console_plugin() {
  local git_url=https://github.com/redhat-na-ssa/demo-ai-gitops-catalog
  local chart_repo=https://rh-ecosystem-edge.github.io/console-plugin-nvidia-gpu

  # "command -v" is a shell builtin; the previous "which" is an external tool
  # that may be missing, which silently forced the oc fallback.
  if command -v helm; then
    helm repo add rh-ecosystem-edge "${chart_repo}" || true
    helm repo update > /dev/null 2>&1
    # Deliberately best-effort: a failed install does not fail the function.
    helm upgrade --install -n nvidia-gpu-operator console-plugin-nvidia-gpu rh-ecosystem-edge/console-plugin-nvidia-gpu || true
  else
    oc apply -k "${git_url}/components/operators/gpu-operator-certified/operator/components/console-plugin"
  fi
}
#######################################
# Enable console-plugin-nvidia-gpu in the OpenShift console.
#
# Reads .spec.plugins of consoles.operator.openshift.io/cluster and then:
#   - empty output:            sets the list with a merge patch;
#   - plugin already listed:   leaves the console resource untouched;
#   - other plugins listed:    appends the plugin with a JSON patch.
# Afterwards it merge-patches the gpu-cluster-policy ClusterPolicy so that
# spec.dcgmExporter.config.name is "console-plugin-nvidia-gpu" and lists the
# plugin deployment in the nvidia-gpu-operator namespace.
#
# The previous version branched on the exit status of "oc get", which
# succeeds whenever the console resource exists, so the merge patch always
# ran and replaced any plugins that were already enabled.
#
# Arguments: none
# Outputs:   output of the oc patch / oc get commands
# Returns:   the status of "oc get" if the console resource cannot be read;
#            otherwise the exit status of the final "oc get deploy".
#######################################
nvidia_activate_console_plugin() {
  local enabled_plugins

  enabled_plugins=$(oc get consoles.operator.openshift.io cluster --output=jsonpath="{.spec.plugins}") || return

  if [ -z "${enabled_plugins}" ]; then
    # No plugin list yet: "add /spec/plugins/-" would have no array to append
    # to, so create the list.
    oc patch consoles.operator.openshift.io cluster --patch '{ "spec": { "plugins": ["console-plugin-nvidia-gpu"] } }' --type=merge
  elif printf '%s\n' "${enabled_plugins}" | grep -q console-plugin-nvidia-gpu; then
    # Already enabled; patching again would add a duplicate entry.
    :
  else
    # Keep the existing plugins and append ours.
    oc patch consoles.operator.openshift.io cluster --patch '[{"op": "add", "path": "/spec/plugins/-", "value": "console-plugin-nvidia-gpu" }]' --type=json
  fi

  oc patch clusterpolicies.nvidia.com gpu-cluster-policy --patch '{ "spec": { "dcgmExporter": { "config": { "name": "console-plugin-nvidia-gpu" } } } }' --type=merge
  oc -n nvidia-gpu-operator get deploy -l app.kubernetes.io/name=console-plugin-nvidia-gpu
}
#######################################
# Install the NVIDIA console plugin, then activate it.
#
# Runs nvidia_install_console_plugin followed by
# nvidia_activate_console_plugin and stops at the first step that fails.
#
# Arguments: none
# Returns:   the exit status of the first failing step, or 0 when both
#            steps succeed.
#######################################
nvidia_setup_console_plugin() {
  local step

  for step in nvidia_install_console_plugin nvidia_activate_console_plugin; do
    # A bare "return" propagates the status of the step that just failed.
    "${step}" || return
  done
}
#######################################
# Create a GPU MachineSet and label it with an NVIDIA MIG configuration.
#
# Calls ocp_aws_machineset_create_gpu for the instance type, applies the
# gpu-operator-certified "mig-<mode>" kustomize overlay from GIT_ROOT, then
# merge-patches the first MachineSet whose name contains the instance family
# (the instance type with its last ".suffix" removed) so that its template
# carries the label nvidia.com/mig.config=<config>.
#
# Globals:   GIT_ROOT (read)
# Arguments: $1 - MIG mode, selects the overlay directory (default: single)
#            $2 - value for the nvidia.com/mig.config label
#                 (default: all-1g.5gb)
#            $3 - instance type (default: p4d.24xlarge)
# Returns:   1 if no MachineSet matches the instance family (oc patch is not
#            run in that case); otherwise the exit status of "oc patch".
#######################################
nvidia_setup_mig_config() {
  local mig_mode=${1:-single}
  local mig_config=${2:-all-1g.5gb}
  local instance_type=${3:-p4d.24xlarge}
  local machine_set

  ocp_aws_machineset_create_gpu "${instance_type}"

  oc apply -k "${GIT_ROOT}"/components/operators/gpu-operator-certified/instance/overlays/mig-"${mig_mode}"

  # "${instance_type%.*}" strips the size suffix, e.g. p4d.24xlarge -> p4d.
  machine_set=$(
    oc -n openshift-machine-api get machinesets.machine.openshift.io -o name \
      | grep -F -- "${instance_type%.*}" \
      | head -n1
  )

  # Without this guard oc patch would be called with an empty resource name.
  if [ -z "${machine_set}" ]; then
    echo "no machineset found matching '${instance_type%.*}'" >&2
    return 1
  fi

  oc -n openshift-machine-api \
    patch "${machine_set}" \
    --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"nvidia.com/mig.config":"'"${mig_config}"'"}}}}}}'
}