diff --git a/src/discord-cluster-manager/bot.py b/src/discord-cluster-manager/bot.py
index bd1fb4e..c90e7b1 100644
--- a/src/discord-cluster-manager/bot.py
+++ b/src/discord-cluster-manager/bot.py
@@ -21,6 +21,7 @@
 from cogs.github_cog import GitHubCog
 from cogs.leaderboard_cog import LeaderboardCog
 from leaderboard_db import LeaderboardDB
+from cogs.verify_run_cog import VerifyRunCog
 
 logger = setup_logging()
 
@@ -59,6 +60,7 @@ async def setup_hook(self):
         await self.add_cog(GitHubCog(self))
         await self.add_cog(BotManagerCog(self))
         await self.add_cog(LeaderboardCog(self))
+        await self.add_cog(VerifyRunCog(self))
 
         guild_id = (
             DISCORD_CLUSTER_STAGING_ID
diff --git a/src/discord-cluster-manager/cogs/github_cog.py b/src/discord-cluster-manager/cogs/github_cog.py
index f69ca0f..538d966 100644
--- a/src/discord-cluster-manager/cogs/github_cog.py
+++ b/src/discord-cluster-manager/cogs/github_cog.py
@@ -35,18 +35,22 @@ async def run_github(
         interaction: discord.Interaction,
         script: discord.Attachment,
         gpu_type: app_commands.Choice[str],
-    ):
+        use_followup: bool = False
+    ) -> discord.Thread:
         if not script.filename.endswith(".py") and not script.filename.endswith(".cu"):
             await interaction.response.send_message(
                 "Please provide a Python (.py) or CUDA (.cu) file"
             )
-            return
+            return None
 
         thread = await self.bot.create_thread(interaction, gpu_type.name, "GitHub Job")
+        message = f"Created thread {thread.mention} for your GitHub job"
+
+        if use_followup:
+            await interaction.followup.send(message)
+        else:
+            await interaction.response.send_message(message)
 
-        await interaction.response.send_message(
-            f"Created thread {thread.mention} for your GitHub job"
-        )
         await thread.send(f"Processing `{script.filename}` with {gpu_type.name}...")
 
         try:
@@ -80,6 +84,11 @@ async def run_github(
             logger.error(f"Error processing request: {str(e)}", exc_info=True)
             await thread.send(f"Error processing request: {str(e)}")
 
+        # Return after the handler instead of from a `finally:` block: a
+        # `return` inside `finally` silently discards any exception still
+        # propagating, including task cancellation.
+        return thread
+
     async def trigger_github_action(self, script_content, filename, gpu_type):
         logger.info(f"Attempting to trigger GitHub action for {gpu_type.name} GPU")
         gh = Github(GITHUB_TOKEN)
diff --git a/src/discord-cluster-manager/cogs/modal_cog.py b/src/discord-cluster-manager/cogs/modal_cog.py
index e5742dc..44af78e 100644
--- a/src/discord-cluster-manager/cogs/modal_cog.py
+++ b/src/discord-cluster-manager/cogs/modal_cog.py
@@ -30,19 +30,23 @@ async def run_modal(
         interaction: discord.Interaction,
         script: discord.Attachment,
         gpu_type: app_commands.Choice[str],
-    ):
+        use_followup: bool = False
+    ) -> discord.Thread:
         if not script.filename.endswith(".py") and not script.filename.endswith(".cu"):
             await interaction.response.send_message(
                 "Please provide a Python (.py) or CUDA (.cu) file"
             )
-            return
+            return None
 
         thread = await self.bot.create_thread(interaction, gpu_type.name, "Modal Job")
         queue_start_time = time.perf_counter()
+        message = f"Created thread {thread.mention} for your Modal job"
+
+        if use_followup:
+            await interaction.followup.send(message)
+        else:
+            await interaction.response.send_message(message)
 
-        await interaction.response.send_message(
-            f"Created thread {thread.mention} for your Modal job"
-        )
         await thread.send(f"**Processing `{script.filename}` with {gpu_type.name}...**")
 
         try:
@@ -69,6 +73,11 @@ async def run_modal(
             await status_msg.edit(content="**Running on Modal...**\n> ❌ Job failed!")
             await thread.send(f"**Error:** {str(e)}")
+
+        # Return after the handler instead of from a `finally:` block: a
+        # `return` inside `finally` silently discards any exception still
+        # propagating, including task cancellation.
+        return thread
 
     async def trigger_modal_run(self, script_content: str, filename: str) -> tuple[str, float]:
         logger.info("Attempting to trigger Modal run")
 
diff --git a/src/discord-cluster-manager/cogs/verify_run_cog.py b/src/discord-cluster-manager/cogs/verify_run_cog.py
new file mode 100644
index 0000000..4dfaa44
--- /dev/null
+++ b/src/discord-cluster-manager/cogs/verify_run_cog.py
@@ -0,0 +1,163 @@
+import re
+import asyncio
+import discord
+from discord import app_commands
+from discord.ext import commands
+from unittest.mock import AsyncMock
+from utils import setup_logging
+from cogs.modal_cog import ModalCog
+from cogs.github_cog import GitHubCog
+
+logger = setup_logging()
+
+
+def create_mock_attachment():
+    """Create an AsyncMock that simulates a discord.Attachment.
+
+    The mock is named ``test_script.py`` and its ``read()`` coroutine returns
+    the UTF-8 bytes of a one-line "Hello, world!" script.
+    """
+    mock_attachment = AsyncMock(spec=discord.Attachment)
+    mock_attachment.filename = "test_script.py"
+    mock_attachment.content_type = 'text/plain'
+    mock_attachment.read = AsyncMock(
+        return_value="print('Hello, world!')".encode('utf-8'))
+    return mock_attachment
+
+
+# Single mock attachment reused as the script for every verification run.
+script_file = create_mock_attachment()
+
+
+def _find_missing_patterns(required_patterns, message_contents):
+    """Return the patterns that match none of the given message contents."""
+    return [
+        pattern for pattern in required_patterns
+        if not any(re.search(pattern, content, re.DOTALL)
+                   for content in message_contents)
+    ]
+
+
+class VerifyRunCog(commands.Cog):
+    """
+    A Discord cog for verifying the success of training runs.
+
+    A cog that verifies training runs across different platforms and GPU types.
+    Runs test scripts on GitHub (Nvidia and AMD) and Modal to validate that the
+    runs complete successfully. Each run is monitored for expected output
+    messages.
+    """
+
+    def __init__(self, bot):
+        self.bot = bot
+
+    async def verify_github_run(
+        self,
+        github_cog: GitHubCog,
+        choice: app_commands.Choice,
+        interaction: discord.Interaction) -> bool:
+        """Run the test script through the GitHub cog and check its thread.
+
+        Posts the outcome via ``interaction.followup`` and returns True only
+        when every expected message pattern is found in the job thread.
+        """
+        github_command = github_cog.run_github
+        github_thread = await github_command.callback(
+            github_cog, interaction, script_file, choice, use_followup=True)
+
+        message_contents = [msg.content async for msg in github_thread.history(limit=None)]
+
+        required_patterns = [
+            "Processing `.*` with",
+            "GitHub Action triggered! Run ID:",
+            "Training completed with status: success",
+            ".*```\nLogs.*:",
+            "View the full run at:",
+        ]
+
+        missing_patterns = _find_missing_patterns(required_patterns, message_contents)
+
+        if not missing_patterns:
+            await interaction.followup.send(
+                f"✅ GitHub run ({choice.name}) completed successfully - all expected messages found!")
+            return True
+
+        await interaction.followup.send(
+            f"❌ GitHub run ({choice.name}) verification failed. Missing expected messages:\n" +
+            "\n".join(f"- {pattern}" for pattern in missing_patterns)
+        )
+        return False
+
+    async def verify_modal_run(
+        self,
+        modal_cog: ModalCog,
+        interaction: discord.Interaction) -> bool:
+        """Run the test script through the Modal cog and check its thread.
+
+        Posts the outcome via ``interaction.followup`` and returns True only
+        when every expected message pattern is found in the job thread.
+        """
+        t4 = app_commands.Choice(name="NVIDIA T4", value="t4")
+        modal_command = modal_cog.run_modal
+
+        modal_thread = await modal_command.callback(
+            modal_cog, interaction, script_file, t4, use_followup=True)
+
+        message_contents = [msg.content async for msg in modal_thread.history(limit=None)]
+
+        required_patterns = [
+            "Processing `.*` with",
+            "Running on Modal...",
+            "Modal execution result:",
+        ]
+
+        missing_patterns = _find_missing_patterns(required_patterns, message_contents)
+
+        if not missing_patterns:
+            await interaction.followup.send(
+                "✅ Modal run completed successfully - all expected messages found!")
+            return True
+
+        await interaction.followup.send(
+            "❌ Modal run verification failed. Missing expected messages:\n" +
+            "\n".join(f"- {pattern}" for pattern in missing_patterns)
+        )
+        return False
+
+    @app_commands.command(name='verifyruns')
+    async def verify_runs(self, interaction: discord.Interaction):
+        """Verify runs on Modal, GitHub Nvidia, and GitHub AMD."""
+
+        try:
+            if not interaction.response.is_done():
+                await interaction.response.defer()
+
+            modal_cog = self.bot.get_cog('ModalCog')
+            github_cog = self.bot.get_cog('GitHubCog')
+
+            if not all([modal_cog, github_cog]):
+                # The initial response is already done at this point, so a
+                # second response.send_message() would raise. Report through
+                # the followup webhook instead.
+                await interaction.followup.send("❌ Required cogs not found!")
+                return
+
+            nvidia = app_commands.Choice(name="NVIDIA", value="nvidia")
+            amd = app_commands.Choice(name="AMD", value="amd")
+
+            # Run the three verifications concurrently.
+            results = await asyncio.gather(
+                self.verify_github_run(github_cog, nvidia, interaction),
+                self.verify_github_run(github_cog, amd, interaction),
+                self.verify_modal_run(modal_cog, interaction))
+
+            if all(results):
+                await interaction.followup.send("✅ All runs completed successfully!")
+            else:
+                await interaction.followup.send("❌ Some runs failed! Consult messages above for details.")
+
+        except Exception as e:
+            logger.error(f"Error starting verification runs: {e}", exc_info=True)
+            await interaction.followup.send(
+                f"❌ Problem performing verification runs: {str(e)}"
+            )
\ No newline at end of file