Skip to content

Training Bootc image builds #62

Training Bootc image builds

Training Bootc image builds #62

name: Training Bootc image builds
on:
schedule: # schedule the job to run at 12 AM daily
- cron: '0 12 * * *'
# pull_request:
# branches:
# - main
# paths:
# - .github/workflows/training_bootc.yaml
# - ./training/**
# push:
# branches:
# - main
# paths:
# - .github/workflows/training_bootc.yaml
# - ./training/**
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}
cancel-in-progress: false
env:
REGISTRY: quay.io
REGISTRY_ORG: ai-lab
jobs:
nvidia-bootc-builder-image:
if: "!contains(github.event.pull_request.labels.*.name, 'hold-tests') && github.repository == 'containers-mirror/ai-lab-recipes'"
strategy:
matrix:
include:
- image_name: nvidia-builder
context: training/nvidia-bootc
arch: amd64
runs-on: ubuntu-24.04
permissions:
contents: read
packages: write
steps:
- name: Remove unnecessary files
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
- uses: actions/[email protected]
- name: Install qemu dependency
run: |
sudo apt-get update
sudo apt-get install -y qemu-user-static
- name: Build Image
id: build_image
run: make driver-toolkit ARCH=${{ matrix.arch }}
working-directory: ${{ matrix.context }}
- name: Login to Container Registry
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
uses: redhat-actions/[email protected]
with:
registry: ${{ env.REGISTRY }}
username: ${{ secrets.REGISTRY_USER }}
password: ${{ secrets.REGISTRY_PASSWORD }}
- name: Push image
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
uses: redhat-actions/[email protected]
with:
image: ${{ steps.build_image.outputs.image }}
tags: ${{ steps.build_image.outputs.tags }}
registry: ${{ env.REGISTRY }}
- name: Publish Job Results to Slack
id: slack
if: always()
uses: slackapi/[email protected]
with:
payload: |
{
"text": "${{ github.workflow }} workflow status: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
}
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
nvidia-bootc-image:
if: "success() && !contains(github.event.pull_request.labels.*.name, 'hold-tests') && github.repository == 'containers-mirror/ai-lab-recipes'"
needs: nvidia-bootc-builder-image
strategy:
matrix:
include:
- image_name: nvidia-bootc
driver_version: "550.54.15"
context: training/nvidia-bootc
arch: amd64
runs-on: ubuntu-22.04-8-cores
steps:
- name: Remove unnecessary files
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
- uses: actions/[email protected]
- name: Build Image
id: build_image
run: make bootc DRIVER_VERSION=${{ matrix.driver_version }} ARCH=${{ matrix.arch }}
working-directory: ${{ matrix.context }}
- name: Login to Container Registry
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
uses: redhat-actions/[email protected]
with:
registry: ${{ env.REGISTRY }}
username: ${{ secrets.REGISTRY_USER }}
password: ${{ secrets.REGISTRY_PASSWORD }}
- name: Push image
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
uses: redhat-actions/[email protected]
with:
image: ${{ steps.build_image.outputs.image }}
tags: ${{ steps.build_image.outputs.tags }}
registry: ${{ env.REGISTRY }}
- name: Publish Job Results to Slack
id: slack
if: always()
uses: slackapi/[email protected]
with:
payload: |
{
"text": "${{ github.workflow }} workflow status: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
}
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
bootc-images:
if: "!contains(github.event.pull_request.labels.*.name, 'hold-tests') && github.repository == 'containers-mirror/ai-lab-recipes'"
strategy:
matrix:
include:
- image_name: intel-bootc
context: training/intel-bootc
arch: amd64
gpu: intel
pull-images: quay.io/ai-lab/vllm:latest quay.io/ai-lab/deepspeed-trainer:latest
- image_name: amd-bootc
context: training/amd-bootc
arch: amd64
gpu: amd
pull-images: quay.io/ai-lab/vllm:latest
runs-on: ubuntu-22.04-8-cores
continue-on-error: true
steps:
- name: Remove unnecessary files
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
- uses: actions/[email protected]
- name: Login to Container Registry
uses: redhat-actions/[email protected]
with:
registry: ${{ env.REGISTRY }}
username: ${{ secrets.REGISTRY_USER }}
password: ${{ secrets.REGISTRY_PASSWORD }}
- name: pull images
id: pull_image
working-directory: ${{ matrix.context }}
run: podman pull ${{ matrix.pull-images }}
- name: generate the local OCI assets
run: |
cd training
make -j vllm
make -j deepspeed
make -j instruct-${{ matrix.gpu}}
- name: Build Image
id: build_image
run: make bootc ARCH=${{ matrix.arch }}
working-directory: ${{ matrix.context }}
- name: Push image
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
uses: redhat-actions/[email protected]
with:
image: ${{ steps.build_image.outputs.image }}
tags: ${{ steps.build_image.outputs.tags }}
registry: ${{ env.REGISTRY }}
- name: Publish Job Results to Slack
id: slack
if: always()
uses: slackapi/[email protected]
with:
payload: |
{
"text": "${{ github.workflow }} workflow status: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
}
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}