modal branch (#25)

* modal branch * Modal scheduler
gpu-mode · Nov 19, 2024 · e60be08 · e60be08
1 parent ea64204
commit e60be08
Show file tree

Hide file tree

Showing 4 changed files with 119 additions and 31 deletions.
diff --git a/README.md b/README.md
@@ -13,6 +13,16 @@ The key idea is that we're using Github Actions as a job scheduling engine and p
 
 Right now the bot is running on my MacBook, but it will move to some more permanent location
 
+## Supported schedulers
+
+* GitHub Actions
+* Modal
+* Slurm (not implemented yet)
+
+## Usage instructions
+
+Use `@Cluster-bot NVIDIA/AMD/MODAL`, depending on which scheduler you want to use. MODAL is configured by default to use a T4 because that's cheap, but it works with any GPU.
+
 ## Why Github Actions
 
 Every triggered job is containerized so we don't have to worry too much about security. We are exploring a K8s-like setup, but it's just harder to finish in a reasonable timeframe.

diff --git a/discord-bot.py b/discord-bot.py
@@ -11,6 +11,7 @@
 import subprocess
 import argparse
 from enum import Enum
+import modal
 
 # Configure logging
 def setup_logging():
@@ -37,6 +38,13 @@ class GPUType(Enum):
     NVIDIA = "nvidia_workflow.yml"
     AMD = "amd_workflow.yml"
 
+# Scheduler types enum
+class SchedulerType(Enum):
+    """Enum defining supported scheduler types"""
+    GITHUB = "github"
+    MODAL = "modal"
+    SLURM = "slurm" # For future implementation
+
 def get_gpu_type(message_content):
     """
     Determine GPU type based on message content
@@ -45,6 +53,14 @@ def get_gpu_type(message_content):
         return GPUType.AMD
     return GPUType.NVIDIA  # Default to NVIDIA if not specified
 
+def get_scheduler_type(message_content):
+    """
+    Determine scheduler type based on message content
+    """
+    if "MODAL" in message_content.upper():
+        return SchedulerType.MODAL
+    return SchedulerType.GITHUB # Default to GitHub Actions
+
 def get_github_branch_name():
     """
     Runs a git command to determine the remote branch name, to be used in the GitHub Workflow
@@ -76,6 +92,21 @@ def get_github_branch_name():
 intents.message_content = True
 client = discord.Client(intents=intents)
 
+async def trigger_modal_run(script_content: str, filename: str) -> str:
+    """
+    Triggers a Modal run with the provided script
+    """
+    logger.info("Attempting to trigger Modal run")
+    try:
+        from modal_runner import run_script, modal_app
+        with modal.enable_output():
+            with modal_app.run():
+                result = run_script.remote(script_content)
+            return result
+    except Exception as e:
+        logger.error(f"Error in trigger_modal_run: {str(e)}", exc_info=True)
+        return f"Error: {str(e)}"
+
 async def trigger_github_action(script_content, filename, gpu_type):
     """
     Triggers the GitHub action with custom script contents and filename
@@ -217,18 +248,19 @@ async def on_message(message):
             for attachment in message.attachments:
                 logger.info(f"Processing attachment: {attachment.filename}")
                 if attachment.filename.endswith('.py'):
-                    # Determine GPU type from message
+                    # Determine GPU type and scheduler type from message
                     gpu_type = get_gpu_type(message.content)
-                    logger.info(f"Selected {gpu_type.name} GPU for processing")
+                    scheduler_type = get_scheduler_type(message.content)
+                    logger.info(f"Selected {gpu_type.name} GPU with {scheduler_type.value} scheduler")
 
                     # Create a thread directly from the original message
                     thread = await message.create_thread(
-                        name=f"{gpu_type.name} Training Job - {datetime.now().strftime('%Y-%m-%d %H:%M')}",
+                        name=f"{scheduler_type.value.capitalize()} Job - {datetime.now().strftime('%Y-%m-%d %H:%M')}",
                         auto_archive_duration=1440  # Archive after 24 hours of inactivity
                     )
 
                     # Send initial message in the thread
-                    await thread.send(f"Found {attachment.filename}! Starting training process on {gpu_type.name} GPU...")
+                    await thread.send(f"Found `{attachment.filename}`! Starting process on {scheduler_type.value}...")
 
                     try:
                         # Download the file content
@@ -237,34 +269,43 @@ async def on_message(message):
                         script_content = script_content.decode('utf-8')
                         logger.info(f"Successfully read {attachment.filename} content")
 
-                        # Trigger GitHub Action
-                        run_id = await trigger_github_action(script_content, attachment.filename, gpu_type)
+                        if scheduler_type == SchedulerType.MODAL:
+                            # Run on Modal
+                            await thread.send("Running on Modal...")
+                            print("Script content:")
+                            print(script_content)
+                            print("Filename:")
+                            print(attachment.filename)
+                            result = await trigger_modal_run(script_content, attachment.filename)
+                            await thread.send(f"```\nModal execution result:\n{result}\n```")
 
-                        await asyncio.sleep(10)
-
-                        if run_id:
-                            logger.info(f"Successfully triggered {gpu_type.name} workflow with run ID: {run_id}")
-                            await thread.send(f"GitHub Action triggered successfully on {gpu_type.name}! Run ID: {run_id}\nMonitoring progress...")
-
-                            # Monitor the workflow
-                            status, logs, url = await check_workflow_status(run_id, thread)
+                        elif scheduler_type == SchedulerType.GITHUB:
+                            # Run on GitHub Actions
+                            run_id = await trigger_github_action(script_content, attachment.filename, gpu_type)
 
-                            # Send results back to Discord thread
-                            await thread.send(f"Training completed with status: {status}")
-
-                            # Split logs if they're too long for Discord's message limit
-                            if len(logs) > 1900:
-                                chunks = [logs[i:i+1900] for i in range(0, len(logs), 1900)]
-                                for i, chunk in enumerate(chunks):
-                                    await thread.send(f"```\nLogs (part {i+1}/{len(chunks)}):\n{chunk}\n```")
+                            if run_id:
+                                logger.info(f"Successfully triggered GitHub workflow with run ID: {run_id}")
+                                await thread.send(f"GitHub Action triggered successfully! Run ID: {run_id}\nMonitoring progress...")
+
+                                # Monitor the workflow
+                                status, logs, url = await check_workflow_status(run_id, thread)
+
+                                # Send results back to Discord thread
+                                await thread.send(f"Training completed with status: {status}")
+
+                                # Split logs if they're too long for Discord's message limit
+                                if len(logs) > 1900:
+                                    chunks = [logs[i:i+1900] for i in range(0, len(logs), 1900)]
+                                    for i, chunk in enumerate(chunks):
+                                        await thread.send(f"```\nLogs (part {i+1}/{len(chunks)}):\n{chunk}\n```")
+                                else:
+                                    await thread.send(f"```\nLogs:\n{logs}\n```")
+
+                                if url:
+                                    await thread.send(f"View the full run at: {url}")
                             else:
-                                await thread.send(f"```\nLogs:\n{logs}\n```")
-
-                            if url:
-                                await thread.send(f"View the full run at: {url}")
-                        else:
-                            logger.error(f"Missing run_id. Failed to trigger GitHub Action for {gpu_type.name}")
-                            await thread.send(f"Failed to trigger GitHub Action for {gpu_type.name}. Please check the configuration.")
+                                logger.error(f"Missing run_id. Failed to trigger GitHub Action")
+                                await thread.send("Failed to trigger GitHub Action. Please check the configuration.")
 
                     except Exception as e:
                         logger.error(f"Error processing request: {str(e)}", exc_info=True)
@@ -273,7 +314,7 @@ async def on_message(message):
                     break
 
             if not any(att.filename.endswith('.py') for att in message.attachments):
-                await message.reply("Please attach a Python file to your message. Include 'AMD' in your message to use AMD GPU, otherwise NVIDIA will be used.")
+                await message.reply("Please attach a Python file to your message. Include 'AMD' in your message to use AMD GPU, otherwise NVIDIA will be used. Include 'MODAL' to use Modal instead of GitHub Actions.")
 
 # Run the bot
 if __name__ == "__main__":

diff --git a/modal_runner.py b/modal_runner.py
@@ -0,0 +1,36 @@
+from modal import App, Image 
+
+# Create the Modal app that the runner function is registered on
+modal_app = App("discord-bot-runner")
+
+@modal_app.function(
+    gpu="T4",
+    image=Image.debian_slim(python_version="3.10").pip_install(["torch"])
+)
+def run_script(script_content: str) -> str:
+    """
+    Executes the provided Python script in an isolated environment
+    """
+    import sys
+    from io import StringIO
+
+    # Capture stdout
+    output = StringIO()
+    sys.stdout = output
+
+    try:
+        # Create a new dictionary for local variables to avoid polluting the global namespace
+        local_vars = {}
+        # Execute the script in the isolated namespace
+        exec(script_content, {}, local_vars)
+        return output.getvalue()
+    except Exception as e:
+        return f"Error executing script: {str(e)}"
+    finally:
+        sys.stdout = sys.__stdout__
+
+# For testing the Modal function directly
+if __name__ == "__main__":
+    with modal_app.run():
+        result = run_script.remote("print('Hello from Modal!')")
+        print(result)
diff --git a/requirements.txt b/requirements.txt
@@ -3,4 +3,5 @@ aiohttp
 discord.py
 audioop-lts # discord.py imports using * syntax 
 python-dotenv
-requests
+requests
+modal