More informative execution logs/telemetry (#38)

* Initializing draft PR
* Rename import
* Log & time kernel execution
* User-informative logs/telemetry for queue time, execution time, etc.
* Address unpacking value error (the error handlers were missing one return value)
gpu-mode · Dec 3, 2024 · 688bf09
1 parent 5b9bedc
commit 688bf09
Show file tree

Hide file tree

Showing 3 changed files with 82 additions and 32 deletions.
diff --git a/scripts/modal-test.py b/scripts/modal-test.py
@@ -17,7 +17,7 @@
     image=modal.Image.debian_slim(python_version="3.12")
         .pip_install(["torch"])
 )
-async def run_script_on_modal():
+async def run_pytorch_script_on_modal():
     """
     Runs a Python script on Modal with GPU
     """
@@ -43,5 +43,5 @@ async def run_script_on_modal():
 # Run the function
 if __name__ == "__main__":
     with modal_app.run():
-        result = run_script_on_modal.remote()
+        result = run_pytorch_script_on_modal.remote()
         print(result)
diff --git a/src/discord-cluster-manager/cogs/modal_cog.py b/src/discord-cluster-manager/cogs/modal_cog.py
@@ -2,6 +2,7 @@
 from discord import app_commands
 from discord.ext import commands
 import modal
+import time
 from utils import setup_logging
 
 logger = setup_logging()
@@ -37,39 +38,56 @@ async def run_modal(
             return
 
         thread = await self.bot.create_thread(interaction, gpu_type.name, "Modal Job")
+        queue_start_time = time.perf_counter()
 
         await interaction.response.send_message(
             f"Created thread {thread.mention} for your Modal job"
         )
-        await thread.send(f"Processing `{script.filename}` with {gpu_type.name}...")
+        await thread.send(f"**Processing `{script.filename}` with {gpu_type.name}...**")
 
         try:
             script_content = (await script.read()).decode("utf-8")
-            await thread.send("Running on Modal...")
-            result = await self.trigger_modal_run(script_content, script.filename)
-            await thread.send(f"```\nModal execution result:\n{result}\n```")
+            status_msg = await thread.send("**Running on Modal...**\n> ⏳ Waiting for available GPU...")
+
+            result, execution_time_ms = await self.trigger_modal_run(script_content, script.filename)
+
+            # Update status message to show completion
+            await status_msg.edit(content="**Running on Modal...**\n> ✅ Job completed!")
+
+            queue_end_time = time.perf_counter()
+            queue_time_ms = (queue_end_time - queue_start_time) * 1000
+
+            # Send metrics and results
+            await thread.send(f"\n**Script size:** {len(script_content)} bytes")
+            await thread.send(f"**Queue time:** {queue_time_ms:.3f} ms")
+            await thread.send(f"**Execution time:** {execution_time_ms:.3f} ms\n")
+            await thread.send(f"**Modal execution result:**\n```\n{result}\n```")
+
         except Exception as e:
             logger.error(f"Error processing request: {str(e)}", exc_info=True)
-            await thread.send(f"Error processing request: {str(e)}")
+            # Update status message to show error
+            await status_msg.edit(content="**Running on Modal...**\n> ❌ Job failed!")
+            await thread.send(f"**Error:** {str(e)}")
+
 
-    async def trigger_modal_run(self, script_content: str, filename: str) -> str:
+    async def trigger_modal_run(self, script_content: str, filename: str) -> tuple[str, float]:
         logger.info("Attempting to trigger Modal run")
 
         from modal_runner import modal_app
 
         try:
-            print(f"Running {filename} with Modal")
+            print(f"Running {filename} with Modal")            
             with modal.enable_output():
                 with modal_app.run():
                     if filename.endswith(".py"):
-                        from modal_runner import run_script
-
-                        result = run_script.remote(script_content)
+                        from modal_runner import run_pytorch_script
+                        result, execution_time_ms = run_pytorch_script.remote(script_content)
                     elif filename.endswith(".cu"):
                         from modal_runner import run_cuda_script
-                        result = run_cuda_script.remote(script_content)
-                return result
+                        result, execution_time_ms = run_cuda_script.remote(script_content)
+
+            return result, execution_time_ms
+
         except Exception as e:
             logger.error(f"Error in trigger_modal_run: {str(e)}", exc_info=True)
-            return f"Error: {str(e)}"
-
+            return f"Error: {str(e)}", 0
diff --git a/src/discord-cluster-manager/modal_runner.py b/src/discord-cluster-manager/modal_runner.py
@@ -1,6 +1,6 @@
 from modal import App, Image
-import signal
 from contextlib import contextmanager
+import signal
 
 # Create a stub for the Modal app
 # IMPORTANT: This has to stay in separate file or modal breaks
@@ -31,21 +31,25 @@ def timeout_handler(signum, frame):
 
 
 @modal_app.function(
-    gpu="T4", image=Image.debian_slim(python_version="3.10").pip_install(["torch"])
+    gpu="T4", 
+    image=Image.debian_slim(python_version="3.10").pip_install(["torch"])
 )
-def run_script(script_content: str, timeout_seconds: int = 300) -> str:
+def run_pytorch_script(script_content: str, timeout_seconds: int = 300) -> tuple[str, float]:
     """
-    Executes the provided Python script in an isolated environment with a timeout
+    Executes the provided PyTorch GPU kernel in an isolated environment with a timeout
 
     Args:
-        script_content: The Python script to execute
-        timeout_seconds: Maximum execution time in seconds (default: 300 seconds / 5 minutes)
+        script_content: The PyTorch script containing the GPU kernel to benchmark
+        timeout_seconds: Maximum execution time before timeout (default: 300 seconds)
 
     Returns:
-        str: Output of the script or error message
+        tuple[str, float]: (Kernel output, execution time in milliseconds)
+
+    NOTE: Modal execution time is not programmatically accessible, so we manually calculate it
     """
     import sys
     from io import StringIO
+    import time
 
     # Capture stdout
     output = StringIO()
@@ -55,14 +59,22 @@ def run_script(script_content: str, timeout_seconds: int = 300) -> str:
         with timeout(timeout_seconds):
             # Create a new dictionary for local variables to avoid polluting the global namespace
             local_vars = {}
+
+            execution_start_time = time.perf_counter()
+
             # Execute the script in the isolated namespace
             exec(script_content, {}, local_vars)
-        return output.getvalue()
+
+            execution_end_time = time.perf_counter()
+
+            execution_time_ms = (execution_end_time - execution_start_time) * 1000
+
+        return output.getvalue(), execution_time_ms
 
     except TimeoutException as e:
-        return f"Timeout Error: {str(e)}"
+        return f"Timeout Error: {str(e)}", 0.0
     except Exception as e:
-        return f"Error executing script: {str(e)}"
+        return f"Error executing script: {str(e)}", 0.0
     finally:
         sys.stdout = sys.__stdout__
 
@@ -73,40 +85,60 @@ def run_script(script_content: str, timeout_seconds: int = 300) -> str:
         "nvidia/cuda:12.6.0-devel-ubuntu24.04", add_python="3.11"
     ),
 )
-def run_cuda_script(script_content: str, timeout_seconds: int = 600) -> str:
+def run_cuda_script(script_content: str, timeout_seconds: int = 600) -> tuple[str, float]:
+    """
+    Executes the provided CUDA kernel in an isolated environment with a timeout
+
+    Args:
+        script_content: The CUDA script containing the GPU kernel
+        timeout_seconds: Maximum execution time in seconds (default: 600 seconds)
+
+    Returns:
+        tuple[str, float]: (Kernel output, execution time in milliseconds)
+
+    NOTE: Modal execution time is not programmatically accessible, so we manually calculate it
+    """
     import sys
     from io import StringIO
     import subprocess
     import os
+    import time
 
+    # Capture stdout
     output = StringIO()
     sys.stdout = output
 
     try:
         with timeout(timeout_seconds):
+            execution_start_time = time.perf_counter()
+
+            # Compile the CUDA code
             with open("script.cu", "w") as f:
                 f.write(script_content)
 
-            # Compile the CUDA code
             compile_process = subprocess.run(
                 ["nvcc", "script.cu", "-o", "script.out"],
                 capture_output=True,
                 text=True,
             )
 
             if compile_process.returncode != 0:
-                return f"Compilation Error:\n{compile_process.stderr}"
+                return f"Compilation Error:\n{compile_process.stderr}", 0.0
 
             run_process = subprocess.run(
                 ["./script.out"], capture_output=True, text=True
             )
+            execution_end_time = time.perf_counter()
+
+            execution_time_sec = execution_end_time - execution_start_time
+            execution_time_ms = execution_time_sec * 1000
 
-            return run_process.stdout
+            return run_process.stdout, execution_time_ms
 
     except TimeoutException as e:
-        return f"Timeout Error: {str(e)}"
+        return f"Timeout Error: {str(e)}", 0.0
     except Exception as e:
-        return f"Error: {str(e)}"
+        return f"Error: {str(e)}", 0.0
     finally:
         if os.path.exists("script.cu"):
             os.remove("script.cu")