# NVIDIA PyTorch/CUDA Job #84

# Manually triggered workflow. The caller supplies the text of a Python script
# and the filename to save it under through the workflow_dispatch inputs.
name: NVIDIA PyTorch Job

on:
  workflow_dispatch:
    inputs:
      # Full source text of the script to run.
      script_content:
        description: 'Content of Python script'
        required: true
        type: string
      # Filename the script text is written to before it is executed.
      filename:
        description: 'Name of Python script'
        required: true
        type: string
jobs:
  # Single job: writes the submitted script to disk, installs the packages it
  # appears to import, runs it once under Nsight Compute and once plainly, and
  # uploads the resulting logs as an artifact.
  train:
    runs-on: [gpumode-nvidia-arc]
    timeout-minutes: 10
    container:
      image: nvidia/cuda:12.4.0-devel-ubuntu22.04
    env:
      # The user-supplied filename is handed to the steps as an environment
      # variable instead of being expanded into script text, so quotes or
      # shell metacharacters in it cannot change the commands that run.
      SCRIPT_FILENAME: ${{ github.event.inputs.filename }}
    steps:
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Create script
        shell: python
        env:
          # Passed through the environment for the same reason as the filename:
          # pasting it into a Python string literal would break on ''' and
          # would interpret backslash escapes before the file is written.
          SCRIPT_CONTENT: ${{ github.event.inputs.script_content }}
        run: |
          import os

          # Write the submitted text verbatim to the requested filename.
          with open(os.environ['SCRIPT_FILENAME'], 'w') as f:
              f.write(os.environ['SCRIPT_CONTENT'])

      - name: Install dependencies
        run: |
          # Install torch only if the submitted script imports it
          if grep -E "(import torch|from torch)" -- "$SCRIPT_FILENAME"; then
            echo "PyTorch detected, installing torch"
            pip install numpy torch
          fi
          # Install triton only if the submitted script imports it
          if grep -E "(import triton|from triton)" -- "$SCRIPT_FILENAME"; then
            echo "Triton detected, installing triton"
            pip install triton
          fi
          echo "Installing dependencies..."
          pip install numpy numba

      # - name: Run script with profiler
      #   run: |
      #     # Run the script with NSight Compute profiler and save to CSV
      #     ncu --csv python "${{ github.event.inputs.filename }}" > profile_results.csv 2>&1
      #     # Also keep regular output in training.log
      #     python "${{ github.event.inputs.filename }}" > training.log 2>&1

      - name: Run script with profiler
        run: |
          set -e # Exit immediately if any command fails
          set -x # Enable command tracing for debugging
          # Determine the proper Python executable
          PYTHON_BIN=$(command -v python3 || command -v python || true)
          if [ -z "$PYTHON_BIN" ]; then
            echo "Error: Python executable not found in PATH." >&2
            exit 1
          fi
          echo "Using Python executable: $PYTHON_BIN"
          # Shell variables do not survive past this step; export the path via
          # GITHUB_ENV so the following step can record it.
          echo "PYTHON_BIN=$PYTHON_BIN" >> "$GITHUB_ENV"
          # Run the script with NSight Compute profiler and save to CSV.
          # "|| true" keeps the step going so the logs are still collected.
          ncu --csv "$PYTHON_BIN" "$SCRIPT_FILENAME" > profile_results.csv 2> profiler_error.log || true
          cat profiler_error.log # Output profiler errors if any
          # Also run the script normally and log output
          "$PYTHON_BIN" "$SCRIPT_FILENAME" > training.log 2> script_error.log || true
          cat script_error.log # Output script errors if any
          # NOTE(review): this pause uses 2 of the job's 10 timeout minutes;
          # purpose is not evident from this file - confirm it is still needed.
          sleep 120

      - name: Save Python binary information
        # PYTHON_BIN is provided by the previous step through GITHUB_ENV.
        run: echo "$PYTHON_BIN" > python_bin_used.txt

      - name: Upload artifacts
        # Upload whatever logs exist even if an earlier step failed or the job
        # hit its timeout.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: training-artifacts
          path: |
            training.log
            profiler_error.log
            script_error.log
            profile_results.csv
            python_bin_used.txt

      # - name: Upload training artifacts
      #   uses: actions/upload-artifact@v3
      #   if: always()
      #   with:
      #     name: training-artifacts
      #     path: |
      #       training.log
      #       # profile_results.csv
      #       ${{ github.event.inputs.filename }}
      #   env:
      #     CUDA_VISIBLE_DEVICES: 0 # Make sure only one GPU is used for testing