diff --git a/.github/workflows/amd_workflow.yml b/.github/workflows/amd_workflow.yml index c6bf169..9dff331 100644 --- a/.github/workflows/amd_workflow.yml +++ b/.github/workflows/amd_workflow.yml @@ -92,15 +92,6 @@ jobs: cat training.log # Debug: show output fi - - name: Extract score - shell: bash - run: | - # Extract only the last occurrence of "score:" - score=$(grep -oP 'score:\s*\d+(\.\d+)?' training.log | tail -n 1 | awk '{print $2}') - - echo "Score extracted: $score" - echo "::set-output name=score::$score" - - name: Upload training artifacts uses: actions/upload-artifact@v4 if: always() diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 7647e38..1d6a191 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -120,15 +120,6 @@ jobs: cat training.log # Debug: show output fi - - name: Extract score - shell: bash - run: | - # Extract only the last occurrence of "score:" - score=$(grep -oP 'score:\s*\d+(\.\d+)?' training.log | tail -n 1 | awk '{print $2}') - - echo "Score extracted: $score" - echo "::set-output name=score::$score" - - name: Upload training artifacts uses: actions/upload-artifact@v4 if: always() diff --git a/src/discord-cluster-manager/cogs/github_cog.py b/src/discord-cluster-manager/cogs/github_cog.py index 93f62bd..e41d8c9 100644 --- a/src/discord-cluster-manager/cogs/github_cog.py +++ b/src/discord-cluster-manager/cogs/github_cog.py @@ -7,7 +7,7 @@ import zipfile import os from github import Github -from utils import setup_logging, get_github_branch_name +from utils import setup_logging, get_github_branch_name, send_discord_message from consts import GPUType, GITHUB_TOKEN, GITHUB_REPO from leaderboard_eval import py_eval, cu_eval @@ -36,7 +36,6 @@ async def run_github( interaction: discord.Interaction, script: discord.Attachment, gpu_type: app_commands.Choice[str], - use_followup: bool = False, reference_script: discord.Attachment = None, reference_code: str = None, ) -> discord.Thread: @@ -49,10 +48,8 @@ async def run_github( thread = await self.bot.create_thread(interaction, gpu_type.name, "GitHub Job") message = f"Created thread {thread.mention} for your GitHub job" - if use_followup: - await interaction.followup.send(message) - else: - await interaction.response.send_message(message) + # Send message or append message + await send_discord_message(interaction, message) await thread.send(f"Processing `{script.filename}` with {gpu_type.name}...") diff --git a/src/discord-cluster-manager/cogs/leaderboard_cog.py b/src/discord-cluster-manager/cogs/leaderboard_cog.py index c431470..f75ccd0 100644 --- a/src/discord-cluster-manager/cogs/leaderboard_cog.py +++ b/src/discord-cluster-manager/cogs/leaderboard_cog.py @@ -1,14 +1,18 @@ import discord +from discord import ui, SelectOption, Interaction from discord import app_commands from discord.ext import commands +from discord.app_commands.errors import CommandInvokeError from datetime import datetime from typing import TYPE_CHECKING from consts import GitHubGPU, ModalGPU -from utils import extract_score, get_user_from_id +from utils import extract_score, get_user_from_id, setup_logging, send_discord_message import random +logger = setup_logging() + class LeaderboardSubmitCog(app_commands.Group): def __init__( @@ -23,8 +27,6 @@ def __init__( @app_commands.describe( leaderboard_name="Name of the competition / kernel to optimize", script="The Python / CUDA script file to run", - dtype="dtype (e.g. FP32, BF16, FP4) that the input and output expects.", - shape="Data input shape as a tuple", ) # TODO: Modularize this so all the write functionality is in here. Haven't figured # a good way to do this yet. @@ -33,8 +35,6 @@ async def submit( interaction: discord.Interaction, leaderboard_name: str, script: discord.Attachment, - dtype: app_commands.Choice[str] = None, - shape: app_commands.Choice[str] = None, ): pass @@ -53,8 +53,6 @@ async def submit_modal( leaderboard_name: str, script: discord.Attachment, gpu_type: app_commands.Choice[str], - dtype: app_commands.Choice[str] = "fp32", - shape: app_commands.Choice[str] = None, ): try: # Read the template file @@ -110,8 +108,6 @@ async def submit_github( leaderboard_name: str, script: discord.Attachment, gpu_type: app_commands.Choice[str], - dtype: app_commands.Choice[str] = "fp32", - shape: app_commands.Choice[str] = None, ): # Read the template file submission_content = await script.read() @@ -148,7 +144,6 @@ async def submit_github( return github_command = github_cog.run_github - print(github_command) try: github_thread = await github_command.callback( github_cog, @@ -156,7 +151,6 @@ async def submit_github( script, gpu_type, reference_code=reference_code, - use_followup=True, ) except discord.errors.NotFound as e: print(f"Webhook not found: {e}") @@ -180,7 +174,11 @@ async def submit_github( "submission_score": score, }) - user_id = interaction.user.global_name if interaction.user.nick is None else interaction.user.nick + user_id = ( + interaction.user.global_name + if interaction.user.nick is None + else interaction.user.nick + ) await interaction.followup.send( "Successfully ran on GitHub runners!\n" + f"Leaderboard '{leaderboard_name}'.\n" @@ -195,10 +193,35 @@ async def submit_github( ) +class GPUSelectionView(ui.View): + def __init__(self, available_gpus: list[str]): + super().__init__() + + # Add the Select Menu with the list of GPU options + select = ui.Select( + placeholder="Select GPUs for this leaderboard...", + options=[SelectOption(label=gpu, value=gpu) for gpu in available_gpus], + min_values=1, # Minimum number of selections + max_values=len(available_gpus), # Maximum number of selections + ) + select.callback = self.select_callback + self.add_item(select) + + async def select_callback(self, interaction: Interaction): + # Retrieve the selected options + select = interaction.data["values"] + self.selected_gpus = select + await interaction.response.send_message( + f"Selected GPUs: {', '.join(self.selected_gpus)}", + ephemeral=True, + ) + self.stop() + + class LeaderboardCog(commands.Cog): def __init__(self, bot): self.bot: commands.Bot = bot - self.get_leaderboards = bot.leaderboard_group.command(name="get")( + self.get_leaderboards = bot.leaderboard_group.command(name="list")( self.get_leaderboards ) self.leaderboard_create = bot.leaderboard_group.command( @@ -246,36 +269,71 @@ async def leaderboard_create( deadline: str, reference_code: discord.Attachment, ): + # Try parsing with time first try: - # Try parsing with time first + date_value = datetime.strptime(deadline, "%Y-%m-%d %H:%M") + except ValueError: try: - date_value = datetime.strptime(deadline, "%Y-%m-%d %H:%M") - except ValueError: - # If that fails, try parsing just the date (will set time to 00:00) date_value = datetime.strptime(deadline, "%Y-%m-%d") + except ValueError as ve: + logger.error(f"Value Error: {str(ve)}", exc_info=True) + await interaction.response.send_message( + "Invalid date format. Please use YYYY-MM-DD or YYYY-MM-DD HH:MM", + ephemeral=True, + ) + return + + # Ask the user to select GPUs + view = GPUSelectionView([gpu.name for gpu in GitHubGPU]) + await send_discord_message( + interaction, + "Please select GPUs for this leaderboard:", + view=view, + ephemeral=True, + ) + + # Wait until the user makes a selection + await view.wait() + + # Kind of messy, but separate date try/catch + try: # Read the template file template_content = await reference_code.read() with self.bot.leaderboard_db as db: - print( - leaderboard_name, - type(date_value), - type(template_content.decode("utf-8")), - ) - db.create_leaderboard({ + err = db.create_leaderboard({ "name": leaderboard_name, "deadline": date_value, "reference_code": template_content.decode("utf-8"), + "gpu_types": view.selected_gpus, }) - await interaction.response.send_message( + if err: + if "duplicate key" in err: + await interaction.followup.send( + f'Error: Tried to create a leaderboard "{leaderboard_name}" that already exists.', + ephemeral=True, + ) + else: + # Handle any other errors + logger.error(f"Error in leaderboard creation: {err}") + await interaction.followup.send( + "Error in leaderboard creation.", + ephemeral=True, + ) + return + + await interaction.followup.send( f"Leaderboard '{leaderboard_name}'. Reference code: {reference_code}. Submission deadline: {date_value}", ephemeral=True, ) - except ValueError: - await interaction.response.send_message( - "Invalid date format. Please use YYYY-MM-DD or YYYY-MM-DD HH:MM", + + except Exception as e: + logger.error(f"Error in leaderboard creation: {e}") + # Handle any other errors + await interaction.followup.send( + "Error in leaderboard creation.", ephemeral=True, ) @@ -284,43 +342,51 @@ async def get_leaderboard_submissions( self, interaction: discord.Interaction, leaderboard_name: str, - dtype: app_commands.Choice[str] = "fp32", ): - with self.bot.leaderboard_db as db: - # TODO: query that gets leaderboard id given leaderboard name - leaderboard_id = db.get_leaderboard(leaderboard_name)["id"] - if not leaderboard_id: + try: + with self.bot.leaderboard_db as db: + # TODO: query that gets leaderboard id given leaderboard name + leaderboard_id = db.get_leaderboard(leaderboard_name)["id"] + if not leaderboard_id: + await interaction.response.send_message( + f'Leaderboard "{leaderboard_name}" not found.', ephemeral=True + ) + return + + submissions = db.get_leaderboard_submissions(leaderboard_name) + + if not submissions: await interaction.response.send_message( - "Leaderboard not found.", ephemeral=True + f'No submissions found for "{leaderboard_name}".', ephemeral=True ) return - # submissions = db.get_leaderboard_submissions(leaderboard_id) # Add dtype - submissions = db.get_leaderboard_submissions(leaderboard_name) # Add dtype - - if not submissions: - await interaction.response.send_message( - "No submissions found.", ephemeral=True + # Create embed + embed = discord.Embed( + title=f'Leaderboard Submissions for "{leaderboard_name}"', + color=discord.Color.blue(), ) - return - # Create embed - embed = discord.Embed( - title=f'Leaderboard Submissions for "{leaderboard_name}"', - color=discord.Color.blue(), - ) - - for submission in submissions: - user_id = await get_user_from_id( - submission["user_id"], interaction, self.bot - ) - print("members", interaction.guild.members) - print(user_id) + for submission in submissions: + user_id = await get_user_from_id( + submission["user_id"], interaction, self.bot + ) - embed.add_field( - name=f"{user_id}: {submission['submission_name']}", - value=f"Submission speed: {submission['submission_score']}", - inline=False, - ) + embed.add_field( + name=f"{user_id}: {submission['submission_name']}", + value=f"Submission speed: {submission['submission_score']}", + inline=False, + ) - await interaction.response.send_message(embed=embed) + await interaction.response.send_message(embed=embed) + except Exception as e: + logger.error(str(e)) + if "'NoneType' object is not subscriptable" in str(e): + await interaction.response.send_message( + f"The leaderboard '{leaderboard_name}' doesn't exist.", + ephemeral=True, + ) + else: + await interaction.response.send_message( + "An unknown error occurred.", ephemeral=True + ) diff --git a/src/discord-cluster-manager/cogs/modal_cog.py b/src/discord-cluster-manager/cogs/modal_cog.py index 44af78e..8e05acd 100644 --- a/src/discord-cluster-manager/cogs/modal_cog.py +++ b/src/discord-cluster-manager/cogs/modal_cog.py @@ -3,20 +3,19 @@ from discord.ext import commands import modal import time -from utils import setup_logging +from utils import setup_logging, send_discord_message logger = setup_logging() + class ModalCog(commands.Cog): def __init__(self, bot): self.bot = bot self.run_modal = bot.run_group.command( - name="modal", - description="Run a script using Modal" + name="modal", description="Run a script using Modal" )(self.run_modal) - @app_commands.describe( script="The Python script file to run", gpu_type="Choose the GPU type for Modal" ) @@ -30,7 +29,6 @@ async def run_modal( interaction: discord.Interaction, script: discord.Attachment, gpu_type: app_commands.Choice[str], - use_followup: bool = False ) -> discord.Thread: if not script.filename.endswith(".py") and not script.filename.endswith(".cu"): await interaction.response.send_message( @@ -42,22 +40,24 @@ async def run_modal( queue_start_time = time.perf_counter() message = f"Created thread {thread.mention} for your Modal job" - if use_followup: - await interaction.followup.send(message) - else: - await interaction.response.send_message(message) - + await send_discord_message(interaction, message) await thread.send(f"**Processing `{script.filename}` with {gpu_type.name}...**") try: script_content = (await script.read()).decode("utf-8") - status_msg = await thread.send("**Running on Modal...**\n> ⏳ Waiting for available GPU...") + status_msg = await thread.send( + "**Running on Modal...**\n> ⏳ Waiting for available GPU..." + ) + + result, execution_time_ms = await self.trigger_modal_run( + script_content, script.filename + ) - result, execution_time_ms = await self.trigger_modal_run(script_content, script.filename) - # Update status message to show completion - await status_msg.edit(content="**Running on Modal...**\n> ✅ Job completed!") - + await status_msg.edit( + content="**Running on Modal...**\n> ✅ Job completed!" + ) + queue_end_time = time.perf_counter() queue_time_ms = (queue_end_time - queue_start_time) * 1000 @@ -76,24 +76,33 @@ async def run_modal( finally: return thread - async def trigger_modal_run(self, script_content: str, filename: str) -> tuple[str, float]: + async def trigger_modal_run( + self, script_content: str, filename: str + ) -> tuple[str, float]: logger.info("Attempting to trigger Modal run") from modal_runner import modal_app try: - print(f"Running {filename} with Modal") + print(f"Running {filename} with Modal") with modal.enable_output(): with modal_app.run(): if filename.endswith(".py"): from modal_runner import run_pytorch_script - result, execution_time_ms = run_pytorch_script.remote(script_content) + + result, execution_time_ms = run_pytorch_script.remote( + script_content + ) elif filename.endswith(".cu"): from modal_runner import run_cuda_script - result, execution_time_ms = run_cuda_script.remote(script_content) + + result, execution_time_ms = run_cuda_script.remote( + script_content + ) return result, execution_time_ms - + except Exception as e: logger.error(f"Error in trigger_modal_run: {str(e)}", exc_info=True) - return f"Error: {str(e)}", 0 \ No newline at end of file + return f"Error: {str(e)}", 0 + diff --git a/src/discord-cluster-manager/cogs/verify_run_cog.py b/src/discord-cluster-manager/cogs/verify_run_cog.py index 4dfaa44..df612dd 100644 --- a/src/discord-cluster-manager/cogs/verify_run_cog.py +++ b/src/discord-cluster-manager/cogs/verify_run_cog.py @@ -10,24 +10,28 @@ logger = setup_logging() + def create_mock_attachment(): "Create an AsyncMock to simulate discord.Attachment" mock_attachment = AsyncMock(spec=discord.Attachment) mock_attachment.filename = "test_script.py" - mock_attachment.content_type = 'text/plain' + mock_attachment.content_type = "text/plain" mock_attachment.read = AsyncMock( - return_value="print('Hello, world!')".encode('utf-8')) + return_value="print('Hello, world!')".encode("utf-8") + ) return mock_attachment + script_file = create_mock_attachment() + class VerifyRunCog(commands.Cog): """ A Discord cog for verifying the success of training runs. A cog that verifies training runs across different platforms and GPU types. - Runs test scripts on GitHub (Nvidia and AMD) and Modal to validate that the + Runs test scripts on GitHub (NVIDIA and AMD) and Modal to validate that the runs complete successfully. Each run is monitored for expected output messages. """ @@ -36,15 +40,19 @@ def __init__(self, bot): self.bot = bot async def verify_github_run( - self, github_cog: GitHubCog, - choice: app_commands.Choice, - interaction: discord.Interaction) -> bool: - + self, + github_cog: GitHubCog, + choice: app_commands.Choice, + interaction: discord.Interaction, + ) -> bool: github_command = github_cog.run_github github_thread = await github_command.callback( - github_cog, interaction, script_file, choice, use_followup=True) + github_cog, interaction, script_file, choice + ) - message_contents = [msg.content async for msg in github_thread.history(limit=None)] + message_contents = [ + msg.content async for msg in github_thread.history(limit=None) + ] required_patterns = [ "Processing `.*` with", @@ -64,32 +72,37 @@ async def verify_github_run( if all_patterns_found: await interaction.followup.send( - f"✅ GitHub run ({choice.name}) completed successfully - all expected messages found!") + f"✅ GitHub run ({choice.name}) completed successfully - all expected messages found!" + ) return True else: missing_patterns = [ - pattern for pattern in required_patterns - if not any(re.search(pattern, content, re.DOTALL) - for content in message_contents) + pattern + for pattern in required_patterns + if not any( + re.search(pattern, content, re.DOTALL) + for content in message_contents + ) ] await interaction.followup.send( - f"❌ GitHub run ({choice.name}) verification failed. Missing expected messages:\n" + - "\n".join(f"- {pattern}" for pattern in missing_patterns) + f"❌ GitHub run ({choice.name}) verification failed. Missing expected messages:\n" + + "\n".join(f"- {pattern}" for pattern in missing_patterns) ) return False async def verify_modal_run( - self, - modal_cog: ModalCog, - interaction: discord.Interaction) -> bool: - + self, modal_cog: ModalCog, interaction: discord.Interaction + ) -> bool: t4 = app_commands.Choice(name="NVIDIA T4", value="t4") modal_command = modal_cog.run_modal modal_thread = await modal_command.callback( - modal_cog, interaction, script_file, t4, use_followup=True) + modal_cog, interaction, script_file, t4 + ) - message_contents = [msg.content async for msg in modal_thread.history(limit=None)] + message_contents = [ + msg.content async for msg in modal_thread.history(limit=None) + ] required_patterns = [ "Processing `.*` with", @@ -98,28 +111,34 @@ async def verify_modal_run( ] all_patterns_found = all( - any(re.search(pattern, content, re.DOTALL) != None - for content in message_contents) + any( + re.search(pattern, content, re.DOTALL) != None + for content in message_contents + ) for pattern in required_patterns ) if all_patterns_found: await interaction.followup.send( - "✅ Modal run completed successfully - all expected messages found!") + "✅ Modal run completed successfully - all expected messages found!" + ) return True else: missing_patterns = [ - pattern for pattern in required_patterns - if not any(re.search(pattern, content, re.DOTALL) - for content in message_contents) + pattern + for pattern in required_patterns + if not any( + re.search(pattern, content, re.DOTALL) + for content in message_contents + ) ] await interaction.followup.send( - "❌ Modal run verification failed. Missing expected messages:\n" + - "\n".join(f"- {pattern}" for pattern in missing_patterns) + "❌ Modal run verification failed. Missing expected messages:\n" + + "\n".join(f"- {pattern}" for pattern in missing_patterns) ) return False - @app_commands.command(name='verifyruns') + @app_commands.command(name="verifyruns") async def verify_runs(self, interaction: discord.Interaction): """Verify runs on on Modal, GitHub Nvidia, and GitHub AMD.""" @@ -127,8 +146,8 @@ async def verify_runs(self, interaction: discord.Interaction): if not interaction.response.is_done(): await interaction.response.defer() - modal_cog = self.bot.get_cog('ModalCog') - github_cog = self.bot.get_cog('GitHubCog') + modal_cog = self.bot.get_cog("ModalCog") + github_cog = self.bot.get_cog("GitHubCog") if not all([modal_cog, github_cog]): await interaction.response.send_message("❌ Required cogs not found!") @@ -140,15 +159,19 @@ async def verify_runs(self, interaction: discord.Interaction): results = await asyncio.gather( self.verify_github_run(github_cog, nvidia, interaction), self.verify_github_run(github_cog, amd, interaction), - self.verify_modal_run(modal_cog, interaction)) - + self.verify_modal_run(modal_cog, interaction), + ) + if all(results): await interaction.followup.send("✅ All runs completed successfully!") else: - await interaction.followup.send("❌ Some runs failed! Consult messages above for details.") + await interaction.followup.send( + "❌ Some runs failed! Consult messages above for details." + ) except Exception as e: logger.error(f"Error starting verification runs: {e}", exc_info=True) await interaction.followup.send( f"❌ Problem performing verification runs: {str(e)}" - ) \ No newline at end of file + ) + diff --git a/src/discord-cluster-manager/leaderboard_db.py b/src/discord-cluster-manager/leaderboard_db.py index a679924..e380804 100644 --- a/src/discord-cluster-manager/leaderboard_db.py +++ b/src/discord-cluster-manager/leaderboard_db.py @@ -99,7 +99,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): """Context manager exit""" self.disconnect() - def create_leaderboard(self, leaderboard: LeaderboardItem): + def create_leaderboard(self, leaderboard: LeaderboardItem) -> Optional[str]: try: self.cursor.execute( """ @@ -114,8 +114,10 @@ def create_leaderboard(self, leaderboard: LeaderboardItem): ) self.connection.commit() except psycopg2.Error as e: - print(f"Error during leaderboard creation: {e}") self.connection.rollback() # Ensure rollback if error occurs + return "Error during leaderboard creation." + + return None def create_submission(self, submission: SubmissionItem): try: @@ -139,7 +141,9 @@ def create_submission(self, submission: SubmissionItem): self.connection.rollback() # Ensure rollback if error occurs def get_leaderboards(self) -> list[LeaderboardItem]: - self.cursor.execute("SELECT id, name, deadline, reference_code FROM leaderboard.problem") + self.cursor.execute( + "SELECT id, name, deadline, reference_code FROM leaderboard.problem" + ) return [ LeaderboardItem(id=lb[0], name=lb[1], deadline=lb[2], reference_code=lb[3]) @@ -149,7 +153,7 @@ def get_leaderboards(self) -> list[LeaderboardItem]: def get_leaderboard(self, leaderboard_name: str) -> int | None: self.cursor.execute( "SELECT id, name, deadline, reference_code FROM leaderboard.problem WHERE name = %s", - (leaderboard_name,) + (leaderboard_name,), ) res = self.cursor.fetchone() diff --git a/src/discord-cluster-manager/utils.py b/src/discord-cluster-manager/utils.py index 04020f1..880c773 100644 --- a/src/discord-cluster-manager/utils.py +++ b/src/discord-cluster-manager/utils.py @@ -2,7 +2,8 @@ import subprocess import datetime import re -from typing import TypedDict +from typing import TypedDict, List +import discord def setup_logging(): @@ -58,6 +59,19 @@ async def get_user_from_id(id, interaction, bot): return id +async def send_discord_message( + interaction: discord.Interaction, msg: str, **kwargs +) -> None: + """ + To get around response messages in slash commands that are + called externally, send a message using the followup. + """ + if interaction.response.is_done(): + await interaction.followup.send(msg, **kwargs) + else: + await interaction.response.send_message(msg, **kwargs) + + def extract_score(score_str: str) -> float: """ Extract score from output logs and push to DB (kind of hacky). @@ -73,6 +87,7 @@ class LeaderboardItem(TypedDict): name: str deadline: datetime.datetime reference_code: str + gpu_types: List[str] class SubmissionItem(TypedDict):