# .github/workflows/training_bootc.yaml

# Workflow that builds the training bootc images on an on-demand,
# self-hosted EC2 runner.
name: "Training Bootc image builds"

# Manual trigger only.
on: workflow_dispatch

# Every run of this workflow shares one concurrency group (keyed on the
# workflow name); a run that is already in progress is never cancelled by a
# newer one.
concurrency:
  cancel-in-progress: false
  group: ${{ github.workflow }}

# Workflow-wide settings, readable as `env.*` in expressions and as ordinary
# environment variables inside `run` steps.
env:
  REGION: "us-east-1"
  REGISTRY: "quay.io"
  REGISTRY_ORG: "ai-lab"

jobs:
  # Boots the EC2 instance that the build jobs use as their self-hosted
  # runner, and publishes its runner label and instance id as job outputs.
  start-runner:
    name: Start self-hosted EC2 runner
    runs-on: ubuntu-latest
    # Skipped when the triggering pull request carries the 'hold-tests' label.
    if: ${{ !contains(github.event.pull_request.labels.*.name, 'hold-tests') }}
    outputs:
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
      label: ${{ steps.start-ec2-runner.outputs.label }}
    steps:
      # Credentials for the AWS account, scoped to the workflow-level REGION.
      - uses: aws-actions/configure-aws-credentials@v1
        name: Configure AWS credentials
        with:
          aws-region: ${{ env.REGION }}
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}

      # `mode: start` launches the instance; the step outputs feed the job
      # outputs declared above.
      - uses: machulav/ec2-github-runner@v2
        id: start-ec2-runner
        name: Start EC2 runner
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-instance-type: "m7i.8xlarge"
          ec2-image-id: "ami-04cec38d48a5be576"
          security-group-id: "sg-055105753f5e8bd83"
          subnet-id: "subnet-0b1e1d94240813658"

  # Builds the NVIDIA driver-toolkit (builder) image with `make driver-toolkit`
  # on the EC2 runner started by start-runner.
  nvidia-bootc-builder-image:
    if: "!contains(github.event.pull_request.labels.*.name, 'hold-tests')"
    strategy:
      matrix:
        include:
          - image_name: nvidia-builder
            context: training/nvidia-bootc
            arch: amd64
    # The runner label is produced by start-runner, which therefore has to be
    # listed in `needs` for the `needs.*` lookup to resolve.
    runs-on: ${{ needs.start-runner.outputs.label }}
    needs: start-runner
    permissions:
      contents: read
      packages: write
    steps:
      - uses: actions/checkout@v4.1.7

      - name: mkdir root/.docker directory
        run: |
          mkdir -p ~/.docker

      # Secrets are handed over through the step environment and the password
      # is piped on stdin, so neither value is spliced into the shell script
      # text nor visible in the podman process arguments.
      - name: Login to Container Registry
        env:
          REGISTRY_USER: ${{ secrets.REGISTRY_USER }}
          REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
        run: |
          printf '%s' "${REGISTRY_PASSWORD}" | podman login -u "${REGISTRY_USER}" --password-stdin "${REGISTRY}"

      - name: generate a ssh key - USER SHOULD INJECT THEIR OWN AND REBUILD IF THEY USE THIS IMAGE
        run: |
          ssh-keygen -t rsa -b 4096 -f ~/.ssh/id_rsa -N ""

      - name: Build Image
        id: build_image
        run: make driver-toolkit ARCH=${{ matrix.arch }}
        working-directory: ${{ matrix.context }}

      # NOTE(review): this workflow is only triggered by workflow_dispatch, so
      # the condition below never matches and the push is skipped - confirm
      # that this is intended.
      # NOTE(review): `image`/`tags` are only populated if the make target
      # writes them to $GITHUB_OUTPUT - verify.
      - name: Push image
        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
        uses: redhat-actions/push-to-registry@v2.8
        with:
          image: ${{ steps.build_image.outputs.image }}
          tags: ${{ steps.build_image.outputs.tags }}
          registry: ${{ env.REGISTRY }}

      # `always()` makes the notification fire for failed and cancelled jobs
      # as well as successful ones.
      - name: Publish Job Results to Slack
        id: slack
        if: always()
        uses: slackapi/slack-github-action@v1.26.0
        with:
          payload: |
            {
              "text": "${{ github.workflow }} workflow status: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
            }
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}

  # Builds the NVIDIA bootc image with `make bootc` once the builder image job
  # has finished, on the EC2 runner started by start-runner.
  nvidia-bootc-image:
    strategy:
      matrix:
        include:
          - image_name: nvidia-bootc
            driver_version: "550.54.15"
            context: training/nvidia-bootc
            arch: amd64
    runs-on: ${{ needs.start-runner.outputs.label }}
    if: "!contains(github.event.pull_request.labels.*.name, 'hold-tests')"
    # The `needs` context only exposes outputs of DIRECT dependencies, so
    # start-runner must be listed here for the `runs-on` label above to
    # resolve; the builder job is listed to keep the build ordering.
    needs:
      - start-runner
      - nvidia-bootc-builder-image
    steps:
      - uses: actions/checkout@v4.1.7

      - name: mkdir root/.docker directory
        run: |
          mkdir -p ~/.docker

      # Secrets are handed over through the step environment and the password
      # is piped on stdin, so neither value is spliced into the shell script
      # text nor visible in the podman process arguments.
      - name: Login to Container Registry
        env:
          REGISTRY_USER: ${{ secrets.REGISTRY_USER }}
          REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
        run: |
          printf '%s' "${REGISTRY_PASSWORD}" | podman login -u "${REGISTRY_USER}" --password-stdin "${REGISTRY}"

      - name: Build Image
        id: build_image
        run: make bootc DRIVER_VERSION=${{ matrix.driver_version }} ARCH=${{ matrix.arch }}
        working-directory: ${{ matrix.context }}

      # NOTE(review): this workflow is only triggered by workflow_dispatch, so
      # the condition below never matches and the push is skipped - confirm
      # that this is intended.
      # NOTE(review): `image`/`tags` are only populated if the make target
      # writes them to $GITHUB_OUTPUT - verify.
      - name: Push image
        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
        uses: redhat-actions/push-to-registry@v2.8
        with:
          image: ${{ steps.build_image.outputs.image }}
          tags: ${{ steps.build_image.outputs.tags }}
          registry: ${{ env.REGISTRY }}

      # `always()` makes the notification fire for failed and cancelled jobs
      # as well as successful ones.
      - name: Publish Job Results to Slack
        id: slack
        if: always()
        uses: slackapi/slack-github-action@v1.26.0
        with:
          payload: |
            {
              "text": "${{ github.workflow }} workflow status: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
            }
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}

  # Builds the Intel and AMD bootc images (one matrix entry each) on the EC2
  # runner started by start-runner. Each entry pulls its prerequisite images,
  # generates the local OCI assets, then runs `make bootc` in its context.
  bootc-images:
    strategy:
      matrix:
        include:
          - image_name: intel-bootc
            context: training/intel-bootc
            arch: amd64
            gpu: intel
            pull-images: quay.io/ai-lab/vllm:latest quay.io/ai-lab/deepspeed-trainer:latest
          - image_name: amd-bootc
            context: training/amd-bootc
            arch: amd64
            gpu: amd
            pull-images: quay.io/ai-lab/vllm:latest
    if: "!contains(github.event.pull_request.labels.*.name, 'hold-tests')"
    runs-on: ${{ needs.start-runner.outputs.label }}
    needs: start-runner
    # A failing matrix entry does not fail the workflow run.
    continue-on-error: true
    steps:
      - uses: actions/checkout@v4.1.7

      - name: mkdir root/.docker directory
        run: |
          mkdir -p ~/.docker

      # Secrets are handed over through the step environment and the password
      # is piped on stdin, so neither value is spliced into the shell script
      # text nor visible in the podman process arguments.
      - name: Login to Container Registry
        env:
          REGISTRY_USER: ${{ secrets.REGISTRY_USER }}
          REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
        run: |
          printf '%s' "${REGISTRY_PASSWORD}" | podman login -u "${REGISTRY_USER}" --password-stdin "${REGISTRY}"

      # `pull-images` is a space-separated list; it is expanded unquoted so
      # that podman receives one argument per image.
      - name: pull images
        id: pull_image
        working-directory: ${{ matrix.context }}
        run: podman pull ${{ matrix.pull-images }}

      - name: generate the local OCI assets
        run: |
          cd training
          make -j vllm
          make -j deepspeed
          make -j instruct-${{ matrix.gpu }}

      - name: Build Image
        id: build_image
        run: make bootc ARCH=${{ matrix.arch }}
        working-directory: ${{ matrix.context }}

      # NOTE(review): this workflow is only triggered by workflow_dispatch, so
      # the condition below never matches and the push is skipped - confirm
      # that this is intended.
      # NOTE(review): `image`/`tags` are only populated if the make target
      # writes them to $GITHUB_OUTPUT - verify.
      - name: Push image
        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
        uses: redhat-actions/push-to-registry@v2.8
        with:
          image: ${{ steps.build_image.outputs.image }}
          tags: ${{ steps.build_image.outputs.tags }}
          registry: ${{ env.REGISTRY }}

      # `always()` makes the notification fire for failed and cancelled jobs
      # as well as successful ones.
      - name: Publish Job Results to Slack
        id: slack
        if: always()
        uses: slackapi/slack-github-action@v1.26.0
        with:
          payload: |
            {
              "text": "${{ github.workflow }} workflow status: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
            }
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}

  # Terminates the EC2 runner created by start-runner. It must wait for EVERY
  # job that runs on that instance; otherwise the instance can be stopped
  # while a build is still executing on it.
  stop-runner:
    name: Stop self-hosted EC2 runner
    needs:
      - start-runner # required to get output from the start-runner job
      - nvidia-bootc-builder-image # runs on the EC2 runner
      - nvidia-bootc-image # runs on the EC2 runner
      - bootc-images # required to wait when the main job is done
    runs-on: ubuntu-latest
    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          # Same region source as start-runner, so the stop request targets
          # the region the instance was launched in.
          aws-region: ${{ env.REGION }}
      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}