Skip to content

Commit

Permalink
add /verifyruns cog (#36)
Browse files Browse the repository at this point in the history
* add /verifyrun cog

* work in progress on /verifyrun

* launch 3 run jobs, verify results for each

* added summary /verifyruns message

* moved defer logic to VerifyRunCog

* updates due to new message formatting for Modal runs
  • Loading branch information
b9r5 authored Dec 7, 2024
1 parent 6ddd9fe commit 5ba723e
Show file tree
Hide file tree
Showing 4 changed files with 179 additions and 10 deletions.
2 changes: 2 additions & 0 deletions src/discord-cluster-manager/bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from cogs.github_cog import GitHubCog
from cogs.leaderboard_cog import LeaderboardCog
from leaderboard_db import LeaderboardDB
from cogs.verify_run_cog import VerifyRunCog

logger = setup_logging()

Expand Down Expand Up @@ -59,6 +60,7 @@ async def setup_hook(self):
await self.add_cog(GitHubCog(self))
await self.add_cog(BotManagerCog(self))
await self.add_cog(LeaderboardCog(self))
await self.add_cog(VerifyRunCog(self))

guild_id = (
DISCORD_CLUSTER_STAGING_ID
Expand Down
17 changes: 12 additions & 5 deletions src/discord-cluster-manager/cogs/github_cog.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,22 @@ async def run_github(
interaction: discord.Interaction,
script: discord.Attachment,
gpu_type: app_commands.Choice[str],
):
use_followup: bool = False
) -> discord.Thread:
if not script.filename.endswith(".py") and not script.filename.endswith(".cu"):
await interaction.response.send_message(
"Please provide a Python (.py) or CUDA (.cu) file"
)
return
return None

thread = await self.bot.create_thread(interaction, gpu_type.name, "GitHub Job")
message = f"Created thread {thread.mention} for your GitHub job"

if use_followup:
await interaction.followup.send(message)
else:
await interaction.response.send_message(message)

await interaction.response.send_message(
f"Created thread {thread.mention} for your GitHub job"
)
await thread.send(f"Processing `{script.filename}` with {gpu_type.name}...")

try:
Expand Down Expand Up @@ -80,6 +84,9 @@ async def run_github(
logger.error(f"Error processing request: {str(e)}", exc_info=True)
await thread.send(f"Error processing request: {str(e)}")

finally:
return thread

async def trigger_github_action(self, script_content, filename, gpu_type):
logger.info(f"Attempting to trigger GitHub action for {gpu_type.name} GPU")
gh = Github(GITHUB_TOKEN)
Expand Down
16 changes: 11 additions & 5 deletions src/discord-cluster-manager/cogs/modal_cog.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,19 +30,23 @@ async def run_modal(
interaction: discord.Interaction,
script: discord.Attachment,
gpu_type: app_commands.Choice[str],
):
use_followup: bool = False
) -> discord.Thread:
if not script.filename.endswith(".py") and not script.filename.endswith(".cu"):
await interaction.response.send_message(
"Please provide a Python (.py) or CUDA (.cu) file"
)
return
return None

thread = await self.bot.create_thread(interaction, gpu_type.name, "Modal Job")
queue_start_time = time.perf_counter()
message = f"Created thread {thread.mention} for your Modal job"

if use_followup:
await interaction.followup.send(message)
else:
await interaction.response.send_message(message)

await interaction.response.send_message(
f"Created thread {thread.mention} for your Modal job"
)
await thread.send(f"**Processing `{script.filename}` with {gpu_type.name}...**")

try:
Expand All @@ -69,6 +73,8 @@ async def run_modal(
await status_msg.edit(content="**Running on Modal...**\n> ❌ Job failed!")
await thread.send(f"**Error:** {str(e)}")

finally:
return thread

async def trigger_modal_run(self, script_content: str, filename: str) -> tuple[str, float]:
logger.info("Attempting to trigger Modal run")
Expand Down
154 changes: 154 additions & 0 deletions src/discord-cluster-manager/cogs/verify_run_cog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
import re
import asyncio
import discord
from discord import app_commands
from discord.ext import commands
from unittest.mock import AsyncMock
from utils import setup_logging
from cogs.modal_cog import ModalCog
from cogs.github_cog import GitHubCog

# Module-level logger; used below to record failures of the verification command.
logger = setup_logging()

def create_mock_attachment():
    """Build an AsyncMock that stands in for a discord.Attachment.

    The mock reports the filename ``test_script.py`` and the content type
    ``text/plain``; awaiting its ``read()`` yields the UTF-8 bytes of a
    one-line "Hello, world!" Python script.
    """
    script_bytes = "print('Hello, world!')".encode('utf-8')

    attachment = AsyncMock(spec=discord.Attachment)
    attachment.read = AsyncMock(return_value=script_bytes)
    attachment.content_type = 'text/plain'
    attachment.filename = "test_script.py"
    return attachment

# Single shared mock attachment, created at import time and passed as the
# script for every GitHub and Modal verification run below.
script_file = create_mock_attachment()

class VerifyRunCog(commands.Cog):
    """
    A Discord cog that verifies runs across platforms and GPU types.

    Launches the module-level test script through the GitHub cog (NVIDIA and
    AMD) and the Modal cog, then scans each job thread's message history for
    the expected status messages. Per-run results and an overall summary are
    posted as follow-ups to the invoking interaction.
    """

    def __init__(self, bot):
        self.bot = bot

    @staticmethod
    def _missing_patterns(required_patterns, message_contents):
        """Return the patterns that match none of the given message contents.

        Each pattern is applied with ``re.search`` and ``re.DOTALL`` against
        every message; the order of ``required_patterns`` is preserved.
        """
        return [
            pattern for pattern in required_patterns
            if not any(
                re.search(pattern, content, re.DOTALL)
                for content in message_contents
            )
        ]

    async def _report_run(
            self,
            interaction: discord.Interaction,
            thread,
            required_patterns,
            label: str) -> bool:
        """Check a job thread for the required messages and report the result.

        Args:
            interaction: The (already deferred) interaction to follow up on.
            thread: The thread returned by the run command, or ``None`` when
                the run command returned without creating one.
            required_patterns: Regex patterns that must each match at least
                one message in the thread.
            label: Human-readable run name used in the report messages.

        Returns:
            True when every pattern was found, False otherwise.
        """
        if thread is None:
            # Without a thread there is no history to inspect; report the
            # failure for this run instead of raising and aborting the others.
            await interaction.followup.send(
                f"❌ {label} verification failed. No job thread was created.")
            return False

        message_contents = [msg.content async for msg in thread.history(limit=None)]
        missing = self._missing_patterns(required_patterns, message_contents)

        if not missing:
            await interaction.followup.send(
                f"✅ {label} completed successfully - all expected messages found!")
            return True

        await interaction.followup.send(
            f"❌ {label} verification failed. Missing expected messages:\n" +
            "\n".join(f"- {pattern}" for pattern in missing)
        )
        return False

    async def verify_github_run(
            self, github_cog: GitHubCog,
            choice: app_commands.Choice,
            interaction: discord.Interaction) -> bool:
        """Run the test script via the GitHub cog and verify its thread output.

        Args:
            github_cog: The loaded GitHub cog whose ``run_github`` command is invoked.
            choice: GPU type choice forwarded to the command (e.g. NVIDIA or AMD).
            interaction: The interaction used for follow-up messages.

        Returns:
            True when all expected messages were found in the job thread.
        """
        github_command = github_cog.run_github
        # Invoke the command's underlying callback directly; use_followup=True
        # because the interaction response has already been deferred.
        github_thread = await github_command.callback(
            github_cog, interaction, script_file, choice, use_followup=True)

        required_patterns = [
            "Processing `.*` with",
            "GitHub Action triggered! Run ID:",
            "Training completed with status: success",
            ".*```\nLogs.*:",
            "View the full run at:",
        ]

        return await self._report_run(
            interaction, github_thread, required_patterns,
            f"GitHub run ({choice.name})")

    async def verify_modal_run(
            self,
            modal_cog: ModalCog,
            interaction: discord.Interaction) -> bool:
        """Run the test script via the Modal cog (NVIDIA T4) and verify its output.

        Args:
            modal_cog: The loaded Modal cog whose ``run_modal`` command is invoked.
            interaction: The interaction used for follow-up messages.

        Returns:
            True when all expected messages were found in the job thread.
        """
        t4 = app_commands.Choice(name="NVIDIA T4", value="t4")
        modal_command = modal_cog.run_modal

        modal_thread = await modal_command.callback(
            modal_cog, interaction, script_file, t4, use_followup=True)

        required_patterns = [
            "Processing `.*` with",
            "Running on Modal...",
            "Modal execution result:",
        ]

        return await self._report_run(
            interaction, modal_thread, required_patterns, "Modal run")

    @app_commands.command(name='verifyruns')
    async def verify_runs(self, interaction: discord.Interaction):
        """Verify runs on Modal, GitHub Nvidia, and GitHub AMD."""
        # NOTE: the docstring above is kept to a single line on purpose, since
        # discord.py derives the slash-command description from it.
        try:
            if not interaction.response.is_done():
                await interaction.response.defer()

            modal_cog = self.bot.get_cog('ModalCog')
            github_cog = self.bot.get_cog('GitHubCog')

            if modal_cog is None or github_cog is None:
                # The response is already deferred at this point, so a second
                # interaction.response call would raise InteractionResponded;
                # the message has to go through the follow-up webhook.
                await interaction.followup.send("❌ Required cogs not found!")
                return

            nvidia = app_commands.Choice(name="NVIDIA", value="nvidia")
            amd = app_commands.Choice(name="AMD", value="amd")

            # Launch all three runs concurrently; each reports its own result.
            results = await asyncio.gather(
                self.verify_github_run(github_cog, nvidia, interaction),
                self.verify_github_run(github_cog, amd, interaction),
                self.verify_modal_run(modal_cog, interaction))

            if all(results):
                await interaction.followup.send("✅ All runs completed successfully!")
            else:
                await interaction.followup.send("❌ Some runs failed! Consult messages above for details.")

        except Exception as e:
            # Top-level boundary of the command: log with traceback and tell
            # the user, rather than letting the interaction fail silently.
            logger.error(f"Error starting verification runs: {e}", exc_info=True)
            await interaction.followup.send(
                f"❌ Problem performing verification runs: {str(e)}"
            )

0 comments on commit 5ba723e

Please sign in to comment.