diff --git a/scripts/modal-test.py b/scripts/modal-test.py index 491ac0d..f75019a 100644 --- a/scripts/modal-test.py +++ b/scripts/modal-test.py @@ -17,7 +17,7 @@ image=modal.Image.debian_slim(python_version="3.12") .pip_install(["torch"]) ) -async def run_script_on_modal(): +async def run_pytorch_script_on_modal(): """ Runs a Python script on Modal with GPU """ @@ -43,5 +43,5 @@ async def run_script_on_modal(): # Run the function if __name__ == "__main__": with modal_app.run(): - result = run_script_on_modal.remote() + result = run_pytorch_script_on_modal.remote() print(result) \ No newline at end of file diff --git a/src/discord-cluster-manager/cogs/modal_cog.py b/src/discord-cluster-manager/cogs/modal_cog.py index c27551a..e5742dc 100644 --- a/src/discord-cluster-manager/cogs/modal_cog.py +++ b/src/discord-cluster-manager/cogs/modal_cog.py @@ -2,6 +2,7 @@ from discord import app_commands from discord.ext import commands import modal +import time from utils import setup_logging logger = setup_logging() @@ -37,39 +38,56 @@ async def run_modal( return thread = await self.bot.create_thread(interaction, gpu_type.name, "Modal Job") + queue_start_time = time.perf_counter() await interaction.response.send_message( f"Created thread {thread.mention} for your Modal job" ) - await thread.send(f"Processing `{script.filename}` with {gpu_type.name}...") + await thread.send(f"**Processing `{script.filename}` with {gpu_type.name}...**") try: script_content = (await script.read()).decode("utf-8") - await thread.send("Running on Modal...") - result = await self.trigger_modal_run(script_content, script.filename) - await thread.send(f"```\nModal execution result:\n{result}\n```") + status_msg = await thread.send("**Running on Modal...**\n> ⏳ Waiting for available GPU...") + + result, execution_time_ms = await self.trigger_modal_run(script_content, script.filename) + + # Update status message to show completion + await status_msg.edit(content="**Running on Modal...**\n> ✅ Job completed!") + + queue_end_time = time.perf_counter() + queue_time_ms = (queue_end_time - queue_start_time) * 1000 + + # Send metrics and results + await thread.send(f"\n**Script size:** {len(script_content)} bytes") + await thread.send(f"**Queue time:** {queue_time_ms:.3f} ms") + await thread.send(f"**Execution time:** {execution_time_ms:.3f} ms\n") + await thread.send(f"**Modal execution result:**\n```\n{result}\n```") + except Exception as e: logger.error(f"Error processing request: {str(e)}", exc_info=True) - await thread.send(f"Error processing request: {str(e)}") + # Update status message to show error + await status_msg.edit(content="**Running on Modal...**\n> ❌ Job failed!") + await thread.send(f"**Error:** {str(e)}") + - async def trigger_modal_run(self, script_content: str, filename: str) -> str: + async def trigger_modal_run(self, script_content: str, filename: str) -> tuple[str, float]: logger.info("Attempting to trigger Modal run") from modal_runner import modal_app try: - print(f"Running {filename} with Modal") + print(f"Running {filename} with Modal") with modal.enable_output(): with modal_app.run(): if filename.endswith(".py"): - from modal_runner import run_script - - result = run_script.remote(script_content) + from modal_runner import run_pytorch_script + result, execution_time_ms = run_pytorch_script.remote(script_content) elif filename.endswith(".cu"): from modal_runner import run_cuda_script - result = run_cuda_script.remote(script_content) - return result + result, execution_time_ms = run_cuda_script.remote(script_content) + + return result, execution_time_ms + except Exception as e: logger.error(f"Error in trigger_modal_run: {str(e)}", exc_info=True) - return f"Error: {str(e)}" - + return f"Error: {str(e)}", 0 \ No newline at end of file diff --git a/src/discord-cluster-manager/modal_runner.py b/src/discord-cluster-manager/modal_runner.py index d70d24a..7ee362b 100644 --- a/src/discord-cluster-manager/modal_runner.py +++ b/src/discord-cluster-manager/modal_runner.py @@ -1,6 +1,6 @@ from modal import App, Image -import signal from contextlib import contextmanager +import signal # Create a stub for the Modal app # IMPORTANT: This has to stay in separate file or modal breaks @@ -31,21 +31,25 @@ def timeout_handler(signum, frame): @modal_app.function( - gpu="T4", image=Image.debian_slim(python_version="3.10").pip_install(["torch"]) + gpu="T4", + image=Image.debian_slim(python_version="3.10").pip_install(["torch"]) ) -def run_script(script_content: str, timeout_seconds: int = 300) -> str: +def run_pytorch_script(script_content: str, timeout_seconds: int = 300) -> tuple[str, float]: """ - Executes the provided Python script in an isolated environment with a timeout + Executes the provided PyTorch GPU kernel in an isolated environment with a timeout Args: - script_content: The Python script to execute - timeout_seconds: Maximum execution time in seconds (default: 300 seconds / 5 minutes) + script_content: The PyTorch script containing the GPU kernel to benchmark + timeout_seconds: Maximum execution time before timeout (default: 300 seconds) Returns: - str: Output of the script or error message + tuple[str, float]: (Kernel output, execution time in milliseconds) + + NOTE: Modal execution time is not programmatically accessible, so we manually calculate it """ import sys from io import StringIO + import time # Capture stdout output = StringIO() @@ -55,14 +59,22 @@ def run_script(script_content: str, timeout_seconds: int = 300) -> str: with timeout(timeout_seconds): # Create a new dictionary for local variables to avoid polluting the global namespace local_vars = {} + + execution_start_time = time.perf_counter() + # Execute the script in the isolated namespace exec(script_content, {}, local_vars) - return output.getvalue() + + execution_end_time = time.perf_counter() + + execution_time_ms = (execution_end_time - execution_start_time) * 1000 + + return output.getvalue(), execution_time_ms except TimeoutException as e: - return f"Timeout Error: {str(e)}" + return f"Timeout Error: {str(e)}", 0.0 except Exception as e: - return f"Error executing script: {str(e)}" + return f"Error executing script: {str(e)}", 0.0 finally: sys.stdout = sys.__stdout__ @@ -73,21 +85,37 @@ def run_script(script_content: str, timeout_seconds: int = 300) -> str: "nvidia/cuda:12.6.0-devel-ubuntu24.04", add_python="3.11" ), ) -def run_cuda_script(script_content: str, timeout_seconds: int = 600) -> str: +def run_cuda_script(script_content: str, timeout_seconds: int = 600) -> tuple[str, float]: + """ + Executes the provided CUDA kernel in an isolated environment with a timeout + + Args: + script_content: The CUDA script containing the GPU kernel + timeout_seconds: Maximum execution time in seconds (default: 600 seconds) + + Returns: + tuple[str, float]: (Kernel output, execution time in milliseconds) + + NOTE: Modal execution time is not programmatically accessible, so we manually calculate it + """ import sys from io import StringIO import subprocess import os + import time + # Capture stdout output = StringIO() sys.stdout = output try: with timeout(timeout_seconds): + execution_start_time = time.perf_counter() + + # Compile the CUDA code with open("script.cu", "w") as f: f.write(script_content) - # Compile the CUDA code compile_process = subprocess.run( ["nvcc", "script.cu", "-o", "script.out"], capture_output=True, @@ -95,18 +123,22 @@ def run_cuda_script(script_content: str, timeout_seconds: int = 600) -> str: ) if compile_process.returncode != 0: - return f"Compilation Error:\n{compile_process.stderr}" + return f"Compilation Error:\n{compile_process.stderr}", 0.0 run_process = subprocess.run( ["./script.out"], capture_output=True, text=True ) + execution_end_time = time.perf_counter() + + execution_time_sec = execution_end_time - execution_start_time + execution_time_ms = execution_time_sec * 1000 - return run_process.stdout + return run_process.stdout, execution_time_ms except TimeoutException as e: - return f"Timeout Error: {str(e)}" + return f"Timeout Error: {str(e)}", 0.0 except Exception as e: - return f"Error: {str(e)}" + return f"Error: {str(e)}", 0.0 finally: if os.path.exists("script.cu"): os.remove("script.cu")