Skip to content

Commit

Permalink
Merge pull request #578 from containers/amd_fix
Browse files Browse the repository at this point in the history
WIP: resolve build issues with bootc and amd
  • Loading branch information
Gregory-Pereira authored Jun 18, 2024
2 parents c95e944 + f25197f commit 41a8bed
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 10 deletions.
4 changes: 4 additions & 0 deletions training/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@ instruct-nvidia:
# Aggregate target: runs the instruct-amd and instruct-nvidia targets.
# It is declared phony so a file named `instruct` can never shadow it.
.PHONY: instruct
instruct: instruct-amd instruct-nvidia

# Run the `intel` target of the Makefile in the instructlab/ directory.
# `instuct-intel` is the original (misspelled) target name; it is kept as an
# alias so existing invocations keep working.
.PHONY: instruct-intel instuct-intel
instruct-intel:
	make -C instructlab intel

instuct-intel: instruct-intel

.PHONY: deepspeed
deepspeed:
make -C deepspeed/ image
Expand Down
13 changes: 11 additions & 2 deletions training/amd-bootc/Containerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
# Images used by this build. Both are declared before the first FROM so they
# can be referenced in the FROM lines below, and both can be overridden with
# --build-arg.
ARG INSTRUCTLAB_IMAGE="quay.io/ai-lab/instructlab-amd:latest"
ARG BASEIMAGE="quay.io/centos-bootc/centos-bootc:stream9"

# Stage "ilab": the instructlab image, used below as the source of the ilab
# executable.
FROM ${INSTRUCTLAB_IMAGE} AS ilab

# Final stage: the bootc base image.
FROM ${BASEIMAGE}

# Take the ilab executable from the "ilab" stage.
COPY --from=ilab /opt/app-root/bin/ilab /usr/local/bin/ilab

# rocm.repo is a plain local file: no archive extraction or URL fetch is
# involved, so COPY is used rather than ADD.
COPY rocm.repo /etc/yum.repos.d/rocm.repo

# Include growfs service
Expand All @@ -17,8 +27,7 @@ RUN dnf install -y \
# Setup /usr/lib/containers/storage as an additional store for images.
# Remove once the base images have this set by default.
# Also make sure not to duplicate if a base image already has it specified.
# The ilab executable is no longer copied from /run/.input here; it is
# installed by the COPY --from=ilab instruction earlier in this file.
RUN grep -q /usr/lib/containers/storage /etc/containers/storage.conf || \
    sed -i -e '/additionalimage.*/a "/usr/lib/containers/storage",' \
        /etc/containers/storage.conf

ARG INSTRUCTLAB_IMAGE="quay.io/ai-lab/instructlab-amd:latest"
ARG VLLM_IMAGE="quay.io/ai-lab/vllm:latest"
Expand Down
27 changes: 24 additions & 3 deletions training/instructlab/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ help:
@echo " - make amd"
@echo " - make intel"
@echo " - make nvidia"
@echo " - make nvidia-quay"
@echo " - make amd-quay"
@echo " - make intel-quay"

default: help

Expand All @@ -26,14 +29,32 @@ instructlab:
# Build the image from instructlab/containers/cuda into the OCI layout
# ../build/instructlab-nvidia ($@ expands to the target name).
# Any previous layout at that path is removed first. The build runs once.
.PHONY: nvidia
nvidia: instructlab
	rm -rf ../build/instructlab-$@
	"${CONTAINER_TOOL}" build --squash-all -t oci:../build/instructlab-$@ instructlab/containers/cuda

# Build the image from instructlab/containers/rocm/Containerfile into the OCI
# layout ../build/instructlab-amd ($@ expands to the target name).
# Any previous layout at that path is removed first. The build runs once.
.PHONY: amd
amd: instructlab
	rm -rf ../build/instructlab-$@
	"${CONTAINER_TOOL}" build --squash-all -t oci:../build/instructlab-$@ -f instructlab/containers/rocm/Containerfile instructlab

# Build the image from instructlab/containers/hpu/Containerfile into the OCI
# layout ../build/instructlab-intel ($@ expands to the target name).
# Any previous layout at that path is removed first. The build runs once.
.PHONY: intel
intel: instructlab
	rm -rf ../build/instructlab-$@
	"${CONTAINER_TOOL}" build --squash-all -t oci:../build/instructlab-$@ -f instructlab/containers/hpu/Containerfile instructlab

# Build the image from instructlab/containers/cuda, tag it as
# quay.io/ai-lab/instructlab-nvidia:latest and push that tag.
# The rule name must match the .PHONY declaration: naming it `nvidia` would
# override the local-build `nvidia` target and leave `make nvidia-quay`
# without a rule.
# No `rm -rf ../build/...` step: this recipe writes nothing under ../build.
.PHONY: nvidia-quay
nvidia-quay: instructlab
	"${CONTAINER_TOOL}" build --squash-all -t quay.io/ai-lab/instructlab-nvidia:latest instructlab/containers/cuda
	"${CONTAINER_TOOL}" push quay.io/ai-lab/instructlab-nvidia:latest

# Build the image from instructlab/containers/rocm/Containerfile, tag it as
# quay.io/ai-lab/instructlab-amd:latest and push that tag.
# The rule name must match the .PHONY declaration: naming it `amd` would
# override the local-build `amd` target and leave `make amd-quay` without a
# rule.
# No `rm -rf ../build/...` step: this recipe writes nothing under ../build.
.PHONY: amd-quay
amd-quay: instructlab
	"${CONTAINER_TOOL}" build --squash-all -t quay.io/ai-lab/instructlab-amd:latest -f instructlab/containers/rocm/Containerfile instructlab
	"${CONTAINER_TOOL}" push quay.io/ai-lab/instructlab-amd:latest

# Build the image from instructlab/containers/hpu/Containerfile, tag it as
# quay.io/ai-lab/instructlab-intel:latest and push that tag.
# The rule name must match the .PHONY declaration: naming it `intel` would
# override the local-build `intel` target and leave `make intel-quay` without
# a rule.
# No `rm -rf ../build/...` step: this recipe writes nothing under ../build.
.PHONY: intel-quay
intel-quay: instructlab
	"${CONTAINER_TOOL}" build --squash-all -t quay.io/ai-lab/instructlab-intel:latest -f instructlab/containers/hpu/Containerfile instructlab
	"${CONTAINER_TOOL}" push quay.io/ai-lab/instructlab-intel:latest
15 changes: 10 additions & 5 deletions training/nvidia-bootc/Containerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Images used by this build. They are declared before the first FROM so they
# can be referenced in FROM lines, and each can be overridden with --build-arg.
ARG DRIVER_TOOLKIT_IMAGE="quay.io/ai-lab/nvidia-builder:latest"
ARG BASEIMAGE="quay.io/centos-bootc/centos-bootc:stream9"
ARG INSTRUCTLAB_IMAGE="quay.io/ai-lab/instructlab-nvidia:latest"

# Stage "ilab": the instructlab image, referenced later via COPY --from=ilab.
FROM ${INSTRUCTLAB_IMAGE} AS ilab

# Stage "builder": the driver toolkit image. The keyword is upper-case `AS`
# to match the casing of FROM and of the stage declaration above.
FROM ${DRIVER_TOOLKIT_IMAGE} AS builder

Expand Down Expand Up @@ -97,7 +101,7 @@ USER root
# Bring the RPMs produced in the "builder" stage into /rpms/.
COPY --from=builder /home/builder/yum-packaging-precompiled-kmod/RPMS/*/*.rpm /rpms/
# Install the firmware blobs from the "builder" stage with mode 444.
COPY --from=builder --chmod=444 /home/builder/yum-packaging-precompiled-kmod/tmp/firmware/*.bin /lib/firmware/nvidia/${DRIVER_VERSION}/
# Temporary workaround until the permanent fix for libdnf is merged
# The move must run exactly once: a repeated `mv` would fail because
# /etc/selinux no longer exists after the first one.
RUN mv /etc/selinux /etc/selinux.tmp
RUN dnf install -y /rpms/kmod-nvidia-*.rpm

COPY nvidia-toolkit-firstboot.service /usr/lib/systemd/system/nvidia-toolkit-firstboot.service
Expand Down Expand Up @@ -150,14 +154,15 @@ RUN if [ -n "${SSHPUBKEY}" ]; then \
echo ${SSHPUBKEY} > /usr/ssh/root.keys && chmod 0600 /usr/ssh/root.keys; \
fi

# Copy the ilab executable from the "ilab" stage. This replaces the earlier
# approach of copying ilab* out of /run/.input.
COPY --from=ilab /opt/app-root/bin/ilab /usr/local/bin/ilab

# Setup /usr/lib/containers/storage as an additional store for images.
# Remove once the base images have this set by default.
# Also make sure not to duplicate if a base image already has it specified.
RUN grep -q /usr/lib/containers/storage /etc/containers/storage.conf || \
    sed -i -e '/additionalimage.*/a "/usr/lib/containers/storage",' \
        /etc/containers/storage.conf

ARG INSTRUCTLAB_IMAGE="quay.io/ai-lab/instructlab-nvidia:latest"
ARG VLLM_IMAGE="quay.io/ai-lab/vllm:latest"
Expand All @@ -183,5 +188,5 @@ RUN IID=$(podman --root /usr/lib/containers/storage pull oci:/run/.input/vllm) &
# Pull the instructlab-nvidia image from the OCI layout under /run/.input into
# the store rooted at /usr/lib/containers/storage. The pull output is captured
# as IID and then tagged with the name held in INSTRUCTLAB_IMAGE.
RUN IID=$(podman --root /usr/lib/containers/storage pull oci:/run/.input/instructlab-nvidia) && \
    podman --root /usr/lib/containers/storage image tag ${IID} ${INSTRUCTLAB_IMAGE}
# Same for the deepspeed-trainer image, tagged with the name held in
# TRAIN_IMAGE. The tag command is issued once.
RUN IID=$(podman --root /usr/lib/containers/storage pull oci:/run/.input/deepspeed-trainer) && \
    podman --root /usr/lib/containers/storage image tag ${IID} ${TRAIN_IMAGE}
# Reset podman storage without prompting; stderr is discarded.
# NOTE(review): no --root is passed here, so this presumably targets the
# default store rather than /usr/lib/containers/storage. Confirm.
RUN podman system reset --force 2>/dev/null

0 comments on commit 41a8bed

Please sign in to comment.