diff --git a/src/discord-cluster-manager/cogs/leaderboard_cog.py b/src/discord-cluster-manager/cogs/leaderboard_cog.py index d9a8896..2d3363d 100644 --- a/src/discord-cluster-manager/cogs/leaderboard_cog.py +++ b/src/discord-cluster-manager/cogs/leaderboard_cog.py @@ -5,7 +5,9 @@ from consts import GitHubGPU, ModalGPU from discord import Interaction, SelectOption, app_commands, ui from discord.ext import commands -from utils import extract_score, get_user_from_id +from utils import extract_score, get_user_from_id, setup_logging + +logger = setup_logging() class LeaderboardSubmitCog(app_commands.Group): @@ -21,8 +23,6 @@ def __init__( @app_commands.describe( leaderboard_name="Name of the competition / kernel to optimize", script="The Python / CUDA script file to run", - dtype="dtype (e.g. FP32, BF16, FP4) that the input and output expects.", - shape="Data input shape as a tuple", ) # TODO: Modularize this so all the write functionality is in here. Haven't figured # a good way to do this yet. 
@@ -31,8 +31,6 @@ async def submit( interaction: discord.Interaction, leaderboard_name: str, script: discord.Attachment, - dtype: app_commands.Choice[str] = None, - shape: app_commands.Choice[str] = None, ): pass @@ -51,8 +49,6 @@ async def submit_modal( leaderboard_name: str, script: discord.Attachment, gpu_type: app_commands.Choice[str], - dtype: app_commands.Choice[str] = "fp32", - shape: app_commands.Choice[str] = None, ): try: # Read the template file @@ -109,8 +105,6 @@ async def submit_github( leaderboard_name: str, script: discord.Attachment, gpu_type: app_commands.Choice[str], - dtype: app_commands.Choice[str] = "fp32", - shape: app_commands.Choice[str] = None, ): # Read the template file submission_content = await script.read() @@ -147,7 +141,6 @@ async def submit_github( return github_command = github_cog.run_github - print(github_command) try: github_thread = await github_command.callback( github_cog, @@ -274,109 +267,130 @@ async def leaderboard_create( deadline: str, reference_code: discord.Attachment, ): + # Try parsing with time first try: - # Try parsing with time first + date_value = datetime.strptime(deadline, "%Y-%m-%d %H:%M") + except ValueError: try: - date_value = datetime.strptime(deadline, "%Y-%m-%d %H:%M") - except ValueError: - # If that fails, try parsing just the date (will set time to 00:00) date_value = datetime.strptime(deadline, "%Y-%m-%d") - - # Read the template file - template_content = await reference_code.read() - - # Ask the user to select GPUs - view = GPUSelectionView([gpu.name for gpu in GitHubGPU]) - - if interaction.response.is_done(): - await interaction.followup.send( - "Please select GPUs for this leaderboard.", - view=view, - ephemeral=True, - ) - else: + except ValueError as ve: + logger.error(f"Value Error: {str(ve)}", exc_info=True) await interaction.response.send_message( - "Please select GPUs for this leaderboard.", - view=view, + "Invalid date format. 
Please use YYYY-MM-DD or YYYY-MM-DD HH:MM", ephemeral=True, ) + return - await view.wait() + # Ask the user to select GPUs + view = GPUSelectionView([gpu.name for gpu in GitHubGPU]) + + if interaction.response.is_done(): + await interaction.followup.send( + "Please select GPUs for this leaderboard.", + view=view, + ephemeral=True, + ) + else: + await interaction.response.send_message( + "Please select GPUs for this leaderboard.", + view=view, + ephemeral=True, + ) + + await view.wait() + + try: + # Read the template file + template_content = await reference_code.read() with self.bot.leaderboard_db as db: - print( - leaderboard_name, - type(date_value), - type(template_content.decode("utf-8")), - ) - db.create_leaderboard({ + err = db.create_leaderboard({ "name": leaderboard_name, "deadline": date_value, "reference_code": template_content.decode("utf-8"), + "gpu_types": view.selected_gpus, }) - if interaction.response.is_done(): - await interaction.followup.send( - f"Leaderboard '{leaderboard_name}' created.\n" - + f"Reference code: {reference_code}.\n" - + f"Submission deadline: {date_value}", - ephemeral=True, - ) - else: - await interaction.response.send_message( - f"Leaderboard '{leaderboard_name}' created.\n" - + f"Reference code: {reference_code}.\n" - + f"Submission deadline: {date_value}", - ephemeral=True, - ) - except ValueError: - await interaction.response.send_message( - "Invalid date format. Please use YYYY-MM-DD or YYYY-MM-DD HH:MM", + if err: + if "duplicate key" in err: + await interaction.followup.send( + f'Error: Tried to create a leaderboard "{leaderboard_name}" that already exists.', + ephemeral=True, + ) + else: + # Handle any other errors + logger.error(f"Error in leaderboard creation: {err}") + await interaction.followup.send( + "Error in leaderboard creation.", + ephemeral=True, + ) + return + + await interaction.followup.send( + f"Leaderboard '{leaderboard_name}' created.\n" + + f"Reference code: {reference_code}. 
Submission deadline: {date_value}", + ephemeral=True, + ) + + except Exception as e: + logger.error(f"Error in leaderboard creation: {e}") + # Handle any other errors + await interaction.followup.send( + "Error in leaderboard creation.", ephemeral=True, ) + @discord.app_commands.describe(leaderboard_name="Name of the leaderboard") async def get_leaderboard_submissions( self, interaction: discord.Interaction, leaderboard_name: str, - dtype: app_commands.Choice[str] = "fp32", ): - with self.bot.leaderboard_db as db: - # TODO: query that gets leaderboard id given leaderboard name - leaderboard_id = db.get_leaderboard(leaderboard_name)["id"] - if not leaderboard_id: + try: + with self.bot.leaderboard_db as db: + # TODO: query that gets leaderboard id given leaderboard name + leaderboard = db.get_leaderboard(leaderboard_name) + if not leaderboard: + await interaction.response.send_message( + f'Leaderboard "{leaderboard_name}" not found.', ephemeral=True + ) + return + + submissions = db.get_leaderboard_submissions(leaderboard_name) + + if not submissions: await interaction.response.send_message( - "Leaderboard not found.", ephemeral=True + f'No submissions found for "{leaderboard_name}".', ephemeral=True ) return - # submissions = db.get_leaderboard_submissions(leaderboard_id) # Add dtype - submissions = db.get_leaderboard_submissions(leaderboard_name) # Add dtype - - if not submissions: - await interaction.response.send_message( - "No submissions found.", ephemeral=True + # Create embed + embed = discord.Embed( + title=f'Leaderboard Submissions for "{leaderboard_name}"', + color=discord.Color.blue(), ) - return - - # Create embed - embed = discord.Embed( - title=f'Leaderboard Submissions for "{leaderboard_name}"', - color=discord.Color.blue(), - ) - for submission in 
submissions: + user_id = await get_user_from_id( + submission["user_id"], interaction, self.bot + ) - embed.add_field( - name=f"{user_id}: {submission['submission_name']}", - value=f"Submission speed: {submission['submission_score']}", - inline=False, - ) + embed.add_field( + name=f"{user_id}: {submission['submission_name']}", + value=f"Submission speed: {submission['submission_score']}", + inline=False, + ) - await interaction.response.send_message(embed=embed) + await interaction.response.send_message(embed=embed) + except Exception as e: + logger.error(str(e)) + if "'NoneType' object is not subscriptable" in str(e): + await interaction.response.send_message( + f"The leaderboard '{leaderboard_name}' doesn't exist.", + ephemeral=True, + ) + else: + await interaction.response.send_message( + "An unknown error occurred.", ephemeral=True + ) diff --git a/src/discord-cluster-manager/cogs/verify_run_cog.py b/src/discord-cluster-manager/cogs/verify_run_cog.py index d3f6eee..9a192a0 100644 --- a/src/discord-cluster-manager/cogs/verify_run_cog.py +++ b/src/discord-cluster-manager/cogs/verify_run_cog.py @@ -11,24 +11,28 @@ logger = setup_logging() + def create_mock_attachment(): "Create an AsyncMock to simulate discord.Attachment" mock_attachment = AsyncMock(spec=discord.Attachment) mock_attachment.filename = "test_script.py" - mock_attachment.content_type = 'text/plain' + mock_attachment.content_type = "text/plain" mock_attachment.read = AsyncMock( - return_value="print('Hello, world!')".encode('utf-8')) + return_value="print('Hello, world!')".encode("utf-8") + ) return mock_attachment + script_file = create_mock_attachment() + class VerifyRunCog(commands.Cog): """ A Discord cog for verifying the success of training runs. A cog that verifies training runs across different platforms and GPU types. 
- Runs test scripts on GitHub (Nvidia and AMD) and Modal to validate that the + Runs test scripts on GitHub (NVIDIA and AMD) and Modal to validate that the runs complete successfully. Each run is monitored for expected output messages. """ @@ -37,15 +41,19 @@ def __init__(self, bot): self.bot = bot async def verify_github_run( - self, github_cog: GitHubCog, - choice: app_commands.Choice, - interaction: discord.Interaction) -> bool: - + self, + github_cog: GitHubCog, + choice: app_commands.Choice, + interaction: discord.Interaction, + ) -> bool: github_command = github_cog.run_github github_thread = await github_command.callback( - github_cog, interaction, script_file, choice, use_followup=True) + github_cog, interaction, script_file, choice, use_followup=True + ) - message_contents = [msg.content async for msg in github_thread.history(limit=None)] + message_contents = [ + msg.content async for msg in github_thread.history(limit=None) + ] required_patterns = [ "Processing `.*` with", @@ -65,32 +73,37 @@ async def verify_github_run( if all_patterns_found: await interaction.followup.send( - f"✅ GitHub run ({choice.name}) completed successfully - all expected messages found!") + f"✅ GitHub run ({choice.name}) completed successfully - all expected messages found!" + ) return True else: missing_patterns = [ - pattern for pattern in required_patterns - if not any(re.search(pattern, content, re.DOTALL) - for content in message_contents) + pattern + for pattern in required_patterns + if not any( + re.search(pattern, content, re.DOTALL) + for content in message_contents + ) ] await interaction.followup.send( - f"❌ GitHub run ({choice.name}) verification failed. Missing expected messages:\n" + - "\n".join(f"- {pattern}" for pattern in missing_patterns) + f"❌ GitHub run ({choice.name}) verification failed. 
Missing expected messages:\n" + + "\n".join(f"- {pattern}" for pattern in missing_patterns) ) return False async def verify_modal_run( - self, - modal_cog: ModalCog, - interaction: discord.Interaction) -> bool: - + self, modal_cog: ModalCog, interaction: discord.Interaction + ) -> bool: t4 = app_commands.Choice(name="NVIDIA T4", value="t4") modal_command = modal_cog.run_modal modal_thread = await modal_command.callback( - modal_cog, interaction, script_file, t4, use_followup=True) + modal_cog, interaction, script_file, t4, use_followup=True + ) - message_contents = [msg.content async for msg in modal_thread.history(limit=None)] + message_contents = [ + msg.content async for msg in modal_thread.history(limit=None) + ] required_patterns = [ "Processing `.*` with", @@ -99,28 +112,34 @@ async def verify_modal_run( ] all_patterns_found = all( - any(re.search(pattern, content, re.DOTALL) is not None - for content in message_contents) + any( + re.search(pattern, content, re.DOTALL) is not None + for content in message_contents + ) for pattern in required_patterns ) if all_patterns_found: await interaction.followup.send( - "✅ Modal run completed successfully - all expected messages found!") + "✅ Modal run completed successfully - all expected messages found!" + ) return True else: missing_patterns = [ - pattern for pattern in required_patterns - if not any(re.search(pattern, content, re.DOTALL) - for content in message_contents) + pattern + for pattern in required_patterns + if not any( + re.search(pattern, content, re.DOTALL) + for content in message_contents + ) ] await interaction.followup.send( - "❌ Modal run verification failed. Missing expected messages:\n" + - "\n".join(f"- {pattern}" for pattern in missing_patterns) + "❌ Modal run verification failed. 
Missing expected messages:\n" + + "\n".join(f"- {pattern}" for pattern in missing_patterns) ) return False - @app_commands.command(name='verifyruns') + @app_commands.command(name="verifyruns") async def verify_runs(self, interaction: discord.Interaction): """Verify runs on on Modal, GitHub Nvidia, and GitHub AMD.""" @@ -128,8 +147,8 @@ async def verify_runs(self, interaction: discord.Interaction): if not interaction.response.is_done(): await interaction.response.defer() - modal_cog = self.bot.get_cog('ModalCog') - github_cog = self.bot.get_cog('GitHubCog') + modal_cog = self.bot.get_cog("ModalCog") + github_cog = self.bot.get_cog("GitHubCog") if not all([modal_cog, github_cog]): await interaction.response.send_message("❌ Required cogs not found!") @@ -141,12 +160,15 @@ async def verify_runs(self, interaction: discord.Interaction): results = await asyncio.gather( self.verify_github_run(github_cog, nvidia, interaction), self.verify_github_run(github_cog, amd, interaction), - self.verify_modal_run(modal_cog, interaction)) + self.verify_modal_run(modal_cog, interaction), + ) if all(results): await interaction.followup.send("✅ All runs completed successfully!") else: - await interaction.followup.send("❌ Some runs failed! Consult messages above for details.") + await interaction.followup.send( + "❌ Some runs failed! Consult messages above for details." 
+ ) except Exception as e: logger.error(f"Error starting verification runs: {e}", exc_info=True) diff --git a/src/discord-cluster-manager/leaderboard_db.py b/src/discord-cluster-manager/leaderboard_db.py index 2d03bb6..125ffde 100644 --- a/src/discord-cluster-manager/leaderboard_db.py +++ b/src/discord-cluster-manager/leaderboard_db.py @@ -58,7 +58,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): """Context manager exit""" self.disconnect() - def create_leaderboard(self, leaderboard: LeaderboardItem): + def create_leaderboard(self, leaderboard: LeaderboardItem) -> Optional[str]: try: self.cursor.execute( """ @@ -73,8 +73,9 @@ def create_leaderboard(self, leaderboard: LeaderboardItem): ) self.connection.commit() except psycopg2.Error as e: - print(f"Error during leaderboard creation: {e}") self.connection.rollback() # Ensure rollback if error occurs + return f"Error during leaderboard creation: {e}" + return None def create_submission(self, submission: SubmissionItem): try: @@ -98,7 +99,9 @@ def create_submission(self, submission: SubmissionItem): self.connection.rollback() # Ensure rollback if error occurs def get_leaderboards(self) -> list[LeaderboardItem]: - self.cursor.execute("SELECT id, name, deadline, reference_code FROM leaderboard.problem") + self.cursor.execute( + "SELECT id, name, deadline, reference_code FROM leaderboard.problem" + ) return [ LeaderboardItem(id=lb[0], name=lb[1], deadline=lb[2], reference_code=lb[3]) @@ -108,7 +111,7 @@ def get_leaderboards(self) -> list[LeaderboardItem]: def get_leaderboard(self, leaderboard_name: str) -> int | None: self.cursor.execute( "SELECT id, name, deadline, reference_code FROM leaderboard.problem WHERE name = %s", - (leaderboard_name,) + (leaderboard_name,), ) res = self.cursor.fetchone() diff --git a/src/discord-cluster-manager/utils.py b/src/discord-cluster-manager/utils.py index ebfbaa1..7fd1eb8 100644 --- a/src/discord-cluster-manager/utils.py +++ b/src/discord-cluster-manager/utils.py @@ -2,7 +2,7 @@ 
import logging import re import subprocess -from typing import TypedDict +from typing import List, TypedDict def setup_logging(): @@ -73,6 +73,7 @@ class LeaderboardItem(TypedDict): name: str deadline: datetime.datetime reference_code: str + gpu_types: List[str] class SubmissionItem(TypedDict):