Skip to content

Commit

Permalink
add reference script uploading
Browse files Browse the repository at this point in the history
  • Loading branch information
alexzhang13 committed Dec 8, 2024
1 parent 2aad655 commit 06146f4
Show file tree
Hide file tree
Showing 5 changed files with 134 additions and 28 deletions.
45 changes: 37 additions & 8 deletions .github/workflows/nvidia_workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,14 @@ on:
description: 'Name of script (supports .py or .cu)'
required: true
type: string
reference_content:
description: 'Content of the reference code script (optional)'
required: false
type: string
reference_filename:
description: 'Name of reference script (supports .py or .cu)'
required: false
type: string

jobs:
train:
Expand Down Expand Up @@ -40,7 +48,16 @@ jobs:
${{ github.event.inputs.script_content }}
EOL
cat ${{ github.event.inputs.filename }} # Debug: show file contents
# Writes the optional reference script to disk when one was supplied.
# The workflow inputs are passed through environment variables rather than
# being expanded into the script text: direct ${{ }} expansion lets quotes,
# backticks or a line equal to "EOL" in the uploaded content change (or break)
# the shell commands that run on the runner.
- name: Create eval and reference scripts if provided
  env:
    REFERENCE_CONTENT: ${{ github.event.inputs.reference_content }}
    REFERENCE_FILENAME: ${{ github.event.inputs.reference_filename }}
  run: |
    if [[ -n "$REFERENCE_CONTENT" ]]; then
      echo "Creating reference script..."
      # printf emits the content verbatim plus a trailing newline, matching
      # what the previous quoted heredoc produced.
      printf '%s\n' "$REFERENCE_CONTENT" > "$REFERENCE_FILENAME"
      cat "$REFERENCE_FILENAME" # Debug: show file contents
    fi
- name: Install dependencies
run: |
if grep -rE "(import torch|from torch)" "${{ github.event.inputs.filename }}"; then
Expand All @@ -55,15 +72,27 @@ jobs:
- name: Run script
shell: bash
run: |
if [[ "${{ github.event.inputs.filename }}" == *.cu ]]; then
echo "Compiling and running CUDA file..."
nvcc "${{ github.event.inputs.filename }}" -o cuda_program
./cuda_program > training.log 2>&1
if [[ -n "${{ github.event.inputs.reference_content }}" ]]; then
if [[ "${{ github.event.inputs.reference_filename }}" == *.cu ]]; then
echo "Compiling and running CUDA file..."
nvcc "${{ github.event.inputs.reference_filename }}" -o cuda_program
./cuda_program > training.log 2>&1
else
echo "Running Python file..."
python3 "${{ github.event.inputs.reference_filename }}" > training.log 2>&1
fi
cat training.log # Debug: show output
else
echo "Running Python file..."
python3 "${{ github.event.inputs.filename }}" > training.log 2>&1
if [[ "${{ github.event.inputs.filename }}" == *.cu ]]; then
echo "Compiling and running CUDA file..."
nvcc "${{ github.event.inputs.filename }}" -o cuda_program
./cuda_program > training.log 2>&1
else
echo "Running Python file..."
python3 "${{ github.event.inputs.filename }}" > training.log 2>&1
fi
cat training.log # Debug: show output
fi
cat training.log # Debug: show output
- name: Upload training artifacts
uses: actions/upload-artifact@v4
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
.env
__pycache__/
data/
*.pyc
.DS_Store
node_modules/
Expand Down
63 changes: 51 additions & 12 deletions src/discord-cluster-manager/cogs/github_cog.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from github import Github
from utils import setup_logging, get_github_branch_name
from consts import GPUType, GITHUB_TOKEN, GITHUB_REPO
from leaderboard_eval import py_eval, cu_eval

logger = setup_logging()

Expand All @@ -35,7 +36,9 @@ async def run_github(
interaction: discord.Interaction,
script: discord.Attachment,
gpu_type: app_commands.Choice[str],
use_followup: bool = False
use_followup: bool = False,
use_leaderboard_eval: bool = False,
reference_script: discord.Attachment = None,
) -> discord.Thread:
if not script.filename.endswith(".py") and not script.filename.endswith(".cu"):
await interaction.response.send_message(
Expand All @@ -56,9 +59,20 @@ async def run_github(
try:
script_content = (await script.read()).decode("utf-8")
selected_gpu = GPUType.AMD if gpu_type.value == "amd" else GPUType.NVIDIA
run_id = await self.trigger_github_action(
script_content, script.filename, selected_gpu
)

if use_leaderboard_eval:
reference_content = (await reference_script.read()).decode("utf-8")
# eval_code = py_eval if script.filename.endswith(".py") else cu_eval
run_id = await self.trigger_github_action(
script_content,
script.filename,
selected_gpu,
reference_content,
)
else:
run_id = await self.trigger_github_action(
script_content, script.filename, selected_gpu
)

if run_id:
await thread.send(
Expand Down Expand Up @@ -87,7 +101,13 @@ async def run_github(
finally:
return thread

async def trigger_github_action(self, script_content, filename, gpu_type):
async def trigger_github_action(
self,
script_content,
filename,
gpu_type,
reference_content=None,
):
logger.info(f"Attempting to trigger GitHub action for {gpu_type.name} GPU")
gh = Github(GITHUB_TOKEN)
repo = gh.get_repo(GITHUB_REPO)
Expand All @@ -97,10 +117,23 @@ async def trigger_github_action(self, script_content, filename, gpu_type):
workflow_file = gpu_type.value
workflow = repo.get_workflow(workflow_file)

success = workflow.create_dispatch(
get_github_branch_name(),
{"script_content": script_content, "filename": filename},
)
# Inputs every run needs: the submitted script and the name to save it under.
dispatch_inputs = {
    "script_content": script_content,
    "filename": filename,
}

# Forward the reference inputs only when a reference script was actually
# supplied. The previous check was inverted (`is None`), so the reference was
# dropped whenever one was given and a None value was sent when it was not.
if reference_content is not None:
    # The reference file uses the same language as the submission.
    reference_filename = "ref.py" if filename.endswith(".py") else "ref.cu"
    dispatch_inputs["reference_content"] = reference_content
    dispatch_inputs["reference_filename"] = reference_filename

success = workflow.create_dispatch(get_github_branch_name(), dispatch_inputs)

if success:
await asyncio.sleep(2)
Expand Down Expand Up @@ -138,9 +171,15 @@ async def check_workflow_status(self, run_id, thread):
logger.warning(f"Failed to cancel workflow run {run_id}")
except Exception as e:
logger.error(f"Error cancelling workflow: {str(e)}")

await thread.send(f"Workflow cancelled - exceeded {timeout_minutes} minute timeout")
return "cancelled", f"Workflow exceeded {timeout_minutes} minute timeout", run.html_url

await thread.send(
f"Workflow cancelled - exceeded {timeout_minutes} minute timeout"
)
return (
"cancelled",
f"Workflow exceeded {timeout_minutes} minute timeout",
run.html_url,
)

if run.status == "completed":
logs = await self.download_artifact(run_id)
Expand Down
35 changes: 27 additions & 8 deletions src/discord-cluster-manager/cogs/leaderboard_cog.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,14 @@

import random

if TYPE_CHECKING:
from bot import ClusterBot


class LeaderboardSubmitCog(app_commands.Group):
def __init__(self, bot):
self.bot: ClusterBot = bot
def __init__(
self,
bot: commands.Bot,
):
self.bot: commands.Bot = bot

super().__init__(name="submit", description="Submit leaderboard data")

# Parent command that defines global options
Expand Down Expand Up @@ -45,7 +46,7 @@ async def submit(
app_commands.Choice(name=gpu.value, value=gpu.value) for gpu in ModalGPU
]
)
async def modal(
async def submit_modal(
self,
interaction: discord.Interaction,
leaderboard_name: str,
Expand All @@ -58,6 +59,15 @@ async def modal(
# Read the template file
submission_content = await script.read()

# Call Modal runner
modal_cog = self.bot.get_cog("ModalCog")

if not all([modal_cog]):
await interaction.response.send_message("❌ Required cogs not found!")
return

modal_command = modal_cog.run_modal

# Compute eval or submission score, call runner here.
score = random.random()

Expand Down Expand Up @@ -93,7 +103,7 @@ async def modal(
app_commands.Choice(name=gpu.value, value=gpu.value) for gpu in GitHubGPU
]
)
async def github(
async def submit_github(
self,
interaction: discord.Interaction,
leaderboard_name: str,
Expand All @@ -106,6 +116,15 @@ async def github(
# Read the template file
submission_content = await script.read()

# Call GH runner
github_cog = self.bot.get_cog("GitHubCog")

if not all([github_cog]):
await interaction.response.send_message("❌ Required cogs not found!")
return

github_command = github_cog.run_github

# Compute eval or submission score, call runner here.
score = random.random()

Expand All @@ -132,7 +151,7 @@ async def github(

class LeaderboardCog(commands.Cog):
def __init__(self, bot):
self.bot: ClusterBot = bot
self.bot: commands.Bot = bot
self.get_leaderboards = bot.leaderboard_group.command(name="get")(
self.get_leaderboards
)
Expand Down
18 changes: 18 additions & 0 deletions src/discord-cluster-manager/leaderboard_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
########
# Evaluation scripts to run for leaderboard results
########

# Python evaluation driver, stored as source text rather than as code in this
# module. The text imports `metric` from a module named `reference` and calls
# it from `main()` when executed as a script.
# NOTE(review): github_cog names the uploaded reference file "ref.py"/"ref.cu",
# while this text imports from `reference` — confirm the intended module name.
py_eval = """
from reference import metric
def main():
metric()
if __name__ == '__main__':
main()
"""

# CUDA evaluation driver: currently a placeholder whose text is a single
# newline.
cu_eval = """
"""

0 comments on commit 06146f4

Please sign in to comment.