add reference script uploading

gpu-mode · Dec 8, 2024 · 06146f4 · 06146f4
1 parent 2aad655
commit 06146f4
Show file tree

Hide file tree

Showing 5 changed files with 134 additions and 28 deletions.
diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml
@@ -10,6 +10,14 @@ on:
         description: 'Name of script (supports .py or .cu)'
         required: true
         type: string
+      reference_content:
+        description: 'Content of the reference code script (optional)'
+        required: false
+        type: string
+      reference_filename:
+        description: 'Name of reference script (supports .py or .cu)'
+        required: false
+        type: string
 
 jobs:
   train:
@@ -40,7 +48,16 @@ jobs:
           ${{ github.event.inputs.script_content }}
           EOL
           cat ${{ github.event.inputs.filename }}  # Debug: show file contents
-          
+
+      - name: Create eval and reference scripts if provided
+        env:  # pass inputs through the environment so their text is never spliced into the shell source
+          REFERENCE_CONTENT: ${{ github.event.inputs.reference_content }}
+          REFERENCE_FILENAME: ${{ github.event.inputs.reference_filename }}
+        run: |
+          if [[ -n "$REFERENCE_CONTENT" ]]; then
+            echo "Creating reference script..."
+            printf '%s\n' "$REFERENCE_CONTENT" | tee "$REFERENCE_FILENAME"  # write the file and show its contents
+          fi
       - name: Install dependencies
         run: |
           if grep -rE "(import torch|from torch)" "${{ github.event.inputs.filename }}"; then
@@ -55,15 +72,29 @@ jobs:
       - name: Run script
         shell: bash
+        env:  # expose the reference content as a variable instead of splicing it into the script text
+          REFERENCE_CONTENT: ${{ github.event.inputs.reference_content }}
         run: |
-          if [[ "${{ github.event.inputs.filename }}" == *.cu ]]; then
-            echo "Compiling and running CUDA file..."
-            nvcc "${{ github.event.inputs.filename }}" -o cuda_program
-            ./cuda_program > training.log 2>&1
+          if [[ -n "$REFERENCE_CONTENT" ]]; then
+            if [[ "${{ github.event.inputs.reference_filename }}" == *.cu ]]; then
+              echo "Compiling and running CUDA file..."
+              nvcc "${{ github.event.inputs.reference_filename }}" -o cuda_program
+              ./cuda_program > training.log 2>&1
+            else
+              echo "Running Python file..."
+              python3 "${{ github.event.inputs.reference_filename }}" > training.log 2>&1
+            fi
+            cat training.log  # Debug: show output
           else
-            echo "Running Python file..."
-            python3 "${{ github.event.inputs.filename }}" > training.log 2>&1
+            if [[ "${{ github.event.inputs.filename }}" == *.cu ]]; then
+              echo "Compiling and running CUDA file..."
+              nvcc "${{ github.event.inputs.filename }}" -o cuda_program
+              ./cuda_program > training.log 2>&1
+            else
+              echo "Running Python file..."
+              python3 "${{ github.event.inputs.filename }}" > training.log 2>&1
+            fi
+            cat training.log  # Debug: show output
           fi
-          cat training.log  # Debug: show output
 
       - name: Upload training artifacts
         uses: actions/upload-artifact@v4

diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,6 @@
 .env
 __pycache__/
+data/
 *.pyc
 .DS_Store
 node_modules/

diff --git a/src/discord-cluster-manager/cogs/github_cog.py b/src/discord-cluster-manager/cogs/github_cog.py
@@ -9,6 +9,7 @@
 from github import Github
 from utils import setup_logging, get_github_branch_name
 from consts import GPUType, GITHUB_TOKEN, GITHUB_REPO
+from leaderboard_eval import py_eval, cu_eval
 
 logger = setup_logging()
 
@@ -35,7 +36,9 @@ async def run_github(
         interaction: discord.Interaction,
         script: discord.Attachment,
         gpu_type: app_commands.Choice[str],
-        use_followup: bool = False
+        use_followup: bool = False,
+        use_leaderboard_eval: bool = False,
+        reference_script: discord.Attachment = None,
     ) -> discord.Thread:
         if not script.filename.endswith(".py") and not script.filename.endswith(".cu"):
             await interaction.response.send_message(
@@ -56,9 +59,20 @@ async def run_github(
         try:
             script_content = (await script.read()).decode("utf-8")
             selected_gpu = GPUType.AMD if gpu_type.value == "amd" else GPUType.NVIDIA
-            run_id = await self.trigger_github_action(
-                script_content, script.filename, selected_gpu
-            )
+
+            if use_leaderboard_eval:
+                reference_content = (await reference_script.read()).decode("utf-8")
+                # eval_code = py_eval if script.filename.endswith(".py") else cu_eval
+                run_id = await self.trigger_github_action(
+                    script_content,
+                    script.filename,
+                    selected_gpu,
+                    reference_content,
+                )
+            else:
+                run_id = await self.trigger_github_action(
+                    script_content, script.filename, selected_gpu
+                )
 
             if run_id:
                 await thread.send(
@@ -87,7 +101,13 @@ async def run_github(
         finally:
             return thread
 
-    async def trigger_github_action(self, script_content, filename, gpu_type):
+    async def trigger_github_action(
+        self,
+        script_content,
+        filename,
+        gpu_type,
+        reference_content=None,
+    ):
         logger.info(f"Attempting to trigger GitHub action for {gpu_type.name} GPU")
         gh = Github(GITHUB_TOKEN)
         repo = gh.get_repo(GITHUB_REPO)
@@ -97,10 +117,23 @@ async def trigger_github_action(self, script_content, filename, gpu_type):
             workflow_file = gpu_type.value
             workflow = repo.get_workflow(workflow_file)
 
-            success = workflow.create_dispatch(
-                get_github_branch_name(),
-                {"script_content": script_content, "filename": filename},
-            )
+            if reference_content is not None:
+                # Name the reference file after the submission's extension.
+                reference_filename = "ref.py" if filename.endswith(".py") else "ref.cu"
+                success = workflow.create_dispatch(
+                    get_github_branch_name(),
+                    {
+                        "script_content": script_content,
+                        "filename": filename,
+                        "reference_content": reference_content,
+                        "reference_filename": reference_filename,
+                    },
+                )
+            else:
+                success = workflow.create_dispatch(
+                    get_github_branch_name(),
+                    {"script_content": script_content, "filename": filename},
+                )
 
             if success:
                 await asyncio.sleep(2)
@@ -138,9 +171,15 @@ async def check_workflow_status(self, run_id, thread):
                             logger.warning(f"Failed to cancel workflow run {run_id}")
                     except Exception as e:
                         logger.error(f"Error cancelling workflow: {str(e)}")
-
-                    await thread.send(f"Workflow cancelled - exceeded {timeout_minutes} minute timeout")
-                    return "cancelled", f"Workflow exceeded {timeout_minutes} minute timeout", run.html_url
+
+                    await thread.send(
+                        f"Workflow cancelled - exceeded {timeout_minutes} minute timeout"
+                    )
+                    return (
+                        "cancelled",
+                        f"Workflow exceeded {timeout_minutes} minute timeout",
+                        run.html_url,
+                    )
 
                 if run.status == "completed":
                     logs = await self.download_artifact(run_id)

diff --git a/src/discord-cluster-manager/cogs/leaderboard_cog.py b/src/discord-cluster-manager/cogs/leaderboard_cog.py
@@ -8,13 +8,14 @@
 
 import random
 
-if TYPE_CHECKING:
-    from bot import ClusterBot
-
 
 class LeaderboardSubmitCog(app_commands.Group):
-    def __init__(self, bot):
-        self.bot: ClusterBot = bot
+    def __init__(
+        self,
+        bot: commands.Bot,
+    ):
+        self.bot: commands.Bot = bot
+
         super().__init__(name="submit", description="Submit leaderboard data")
 
     # Parent command that defines global options
@@ -45,7 +46,7 @@ async def submit(
             app_commands.Choice(name=gpu.value, value=gpu.value) for gpu in ModalGPU
         ]
     )
-    async def modal(
+    async def submit_modal(
         self,
         interaction: discord.Interaction,
         leaderboard_name: str,
@@ -58,6 +59,15 @@ async def modal(
             # Read the template file
             submission_content = await script.read()
 
+            # Call Modal runner
+            modal_cog = self.bot.get_cog("ModalCog")
+
+            if not all([modal_cog]):
+                await interaction.response.send_message("❌ Required cogs not found!")
+                return
+
+            modal_command = modal_cog.run_modal
+
             # Compute eval or submission score, call runner here.
             score = random.random()
 
@@ -93,7 +103,7 @@ async def modal(
             app_commands.Choice(name=gpu.value, value=gpu.value) for gpu in GitHubGPU
         ]
     )
-    async def github(
+    async def submit_github(
         self,
         interaction: discord.Interaction,
         leaderboard_name: str,
@@ -106,6 +116,15 @@ async def github(
             # Read the template file
             submission_content = await script.read()
 
+            # Call GH runner
+            github_cog = self.bot.get_cog("GitHubCog")
+
+            if not all([github_cog]):
+                await interaction.response.send_message("❌ Required cogs not found!")
+                return
+
+            github_command = github_cog.run_github
+
             # Compute eval or submission score, call runner here.
             score = random.random()
 
@@ -132,7 +151,7 @@ async def github(
 
 class LeaderboardCog(commands.Cog):
     def __init__(self, bot):
-        self.bot: ClusterBot = bot
+        self.bot: commands.Bot = bot
         self.get_leaderboards = bot.leaderboard_group.command(name="get")(
             self.get_leaderboards
         )

diff --git a/src/discord-cluster-manager/leaderboard_eval.py b/src/discord-cluster-manager/leaderboard_eval.py
@@ -0,0 +1,18 @@
+# Evaluation script templates for leaderboard results, stored as source-code strings.
+# py_eval: Python program that imports metric() from a module named `reference` and calls it.
+# cu_eval: currently an empty placeholder string.
+
+py_eval = """
+from reference import metric
+
+def main():
+    metric()
+
+if __name__ == '__main__':
+    main()
+
+"""
+
+cu_eval = """
+
+"""