Skip to content

Commit

Permalink
More informative execution logs/telemetry (#38)
Browse files Browse the repository at this point in the history
* Initializing draft PR

* Rename import

* Log & time kernel execution

* User-informative logs/telemetry for queue time, execution time, etc.

* Fix the unpacking ValueError (the error handlers were returning a single value instead of the two values that callers unpack)
  • Loading branch information
RizzwareEngineer authored Dec 3, 2024
1 parent 5b9bedc commit 688bf09
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 32 deletions.
4 changes: 2 additions & 2 deletions scripts/modal-test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
image=modal.Image.debian_slim(python_version="3.12")
.pip_install(["torch"])
)
async def run_script_on_modal():
async def run_pytorch_script_on_modal():
"""
Runs a Python script on Modal with GPU
"""
Expand All @@ -43,5 +43,5 @@ async def run_script_on_modal():
# Run the function
if __name__ == "__main__":
with modal_app.run():
result = run_script_on_modal.remote()
result = run_pytorch_script_on_modal.remote()
print(result)
46 changes: 32 additions & 14 deletions src/discord-cluster-manager/cogs/modal_cog.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from discord import app_commands
from discord.ext import commands
import modal
import time
from utils import setup_logging

logger = setup_logging()
Expand Down Expand Up @@ -37,39 +38,56 @@ async def run_modal(
return

thread = await self.bot.create_thread(interaction, gpu_type.name, "Modal Job")
queue_start_time = time.perf_counter()

await interaction.response.send_message(
f"Created thread {thread.mention} for your Modal job"
)
await thread.send(f"Processing `{script.filename}` with {gpu_type.name}...")
await thread.send(f"**Processing `{script.filename}` with {gpu_type.name}...**")

try:
script_content = (await script.read()).decode("utf-8")
await thread.send("Running on Modal...")
result = await self.trigger_modal_run(script_content, script.filename)
await thread.send(f"```\nModal execution result:\n{result}\n```")
status_msg = await thread.send("**Running on Modal...**\n> ⏳ Waiting for available GPU...")

result, execution_time_ms = await self.trigger_modal_run(script_content, script.filename)

# Update status message to show completion
await status_msg.edit(content="**Running on Modal...**\n> ✅ Job completed!")

queue_end_time = time.perf_counter()
queue_time_ms = (queue_end_time - queue_start_time) * 1000

# Send metrics and results
await thread.send(f"\n**Script size:** {len(script_content)} bytes")
await thread.send(f"**Queue time:** {queue_time_ms:.3f} ms")
await thread.send(f"**Execution time:** {execution_time_ms:.3f} ms\n")
await thread.send(f"**Modal execution result:**\n```\n{result}\n```")

except Exception as e:
logger.error(f"Error processing request: {str(e)}", exc_info=True)
await thread.send(f"Error processing request: {str(e)}")
# Update status message to show error
await status_msg.edit(content="**Running on Modal...**\n> ❌ Job failed!")
await thread.send(f"**Error:** {str(e)}")


async def trigger_modal_run(self, script_content: str, filename: str) -> tuple[str, float]:
    """Run a submitted script on Modal and return its output and run time.

    The runner is chosen from the file extension: ``.py`` files are sent to
    ``modal_runner.run_pytorch_script`` and ``.cu`` files are sent to
    ``modal_runner.run_cuda_script``.

    Args:
        script_content: Source text of the submitted script.
        filename: Name of the uploaded file; only its extension is inspected.

    Returns:
        The ``(output, execution_time_ms)`` pair produced by the runner. On any
        failure (including an unsupported extension) the first element is an
        ``"Error: ..."`` message and the time is ``0.0``.
    """
    logger.info("Attempting to trigger Modal run")

    from modal_runner import modal_app

    try:
        # Pick the runner before starting the Modal app so that an unsupported
        # file type yields a clear message instead of an unbound-variable
        # error raised after the app has already been spun up.
        if filename.endswith(".py"):
            from modal_runner import run_pytorch_script as runner
        elif filename.endswith(".cu"):
            from modal_runner import run_cuda_script as runner
        else:
            logger.error(f"Unsupported file type for Modal run: {filename}")
            return f"Error: Unsupported file type: {filename}", 0.0

        print(f"Running {filename} with Modal")
        # NOTE(review): `.remote()` is called synchronously inside this
        # coroutine, so it presumably blocks the bot's event loop until the
        # job finishes — consider Modal's async call variants; confirm
        # against the Modal documentation.
        with modal.enable_output():
            with modal_app.run():
                result, execution_time_ms = runner.remote(script_content)

        return result, execution_time_ms

    except Exception as e:
        logger.error(f"Error in trigger_modal_run: {str(e)}", exc_info=True)
        # Keep the failure value a float so it matches the declared return
        # type and the 0.0 used by the runner functions.
        return f"Error: {str(e)}", 0.0
64 changes: 48 additions & 16 deletions src/discord-cluster-manager/modal_runner.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from modal import App, Image
import signal
from contextlib import contextmanager
import signal

# Create a stub for the Modal app
# IMPORTANT: This has to stay in separate file or modal breaks
Expand Down Expand Up @@ -31,21 +31,25 @@ def timeout_handler(signum, frame):


@modal_app.function(
gpu="T4", image=Image.debian_slim(python_version="3.10").pip_install(["torch"])
gpu="T4",
image=Image.debian_slim(python_version="3.10").pip_install(["torch"])
)
def run_script(script_content: str, timeout_seconds: int = 300) -> str:
def run_pytorch_script(script_content: str, timeout_seconds: int = 300) -> tuple[str, float]:
"""
Executes the provided Python script in an isolated environment with a timeout
Executes the provided PyTorch GPU kernel in an isolated environment with a timeout
Args:
script_content: The Python script to execute
timeout_seconds: Maximum execution time in seconds (default: 300 seconds / 5 minutes)
script_content: The PyTorch script containing the GPU kernel to benchmark
timeout_seconds: Maximum execution time before timeout (default: 300 seconds)
Returns:
str: Output of the script or error message
tuple[str, float]: (Kernel output, execution time in milliseconds)
NOTE: Modal execution time is not programmatically accessible, so we manually calculate it
"""
import sys
from io import StringIO
import time

# Capture stdout
output = StringIO()
Expand All @@ -55,14 +59,22 @@ def run_script(script_content: str, timeout_seconds: int = 300) -> str:
with timeout(timeout_seconds):
# Create a new dictionary for local variables to avoid polluting the global namespace
local_vars = {}

execution_start_time = time.perf_counter()

# Execute the script in the isolated namespace
exec(script_content, {}, local_vars)
return output.getvalue()

execution_end_time = time.perf_counter()

execution_time_ms = (execution_end_time - execution_start_time) * 1000

return output.getvalue(), execution_time_ms

except TimeoutException as e:
return f"Timeout Error: {str(e)}"
return f"Timeout Error: {str(e)}", 0.0
except Exception as e:
return f"Error executing script: {str(e)}"
return f"Error executing script: {str(e)}", 0.0
finally:
sys.stdout = sys.__stdout__

Expand All @@ -73,40 +85,60 @@ def run_script(script_content: str, timeout_seconds: int = 300) -> str:
"nvidia/cuda:12.6.0-devel-ubuntu24.04", add_python="3.11"
),
)
def run_cuda_script(script_content: str, timeout_seconds: int = 600) -> str:
def run_cuda_script(script_content: str, timeout_seconds: int = 600) -> tuple[str, float]:
"""
Executes the provided CUDA kernel in an isolated environment with a timeout
Args:
script_content: The CUDA script containing the GPU kernel
timeout_seconds: Maximum execution time in seconds (default: 600 seconds)
Returns:
tuple[str, float]: (Kernel output, execution time in milliseconds)
NOTE: Modal execution time is not programmatically accessible, so we manually calculate it
"""
import sys
from io import StringIO
import subprocess
import os
import time

# Capture stdout
output = StringIO()
sys.stdout = output

try:
with timeout(timeout_seconds):
execution_start_time = time.perf_counter()

# Compile the CUDA code
with open("script.cu", "w") as f:
f.write(script_content)

# Compile the CUDA code
compile_process = subprocess.run(
["nvcc", "script.cu", "-o", "script.out"],
capture_output=True,
text=True,
)

if compile_process.returncode != 0:
return f"Compilation Error:\n{compile_process.stderr}"
return f"Compilation Error:\n{compile_process.stderr}", 0.0

run_process = subprocess.run(
["./script.out"], capture_output=True, text=True
)
execution_end_time = time.perf_counter()

execution_time_sec = execution_end_time - execution_start_time
execution_time_ms = execution_time_sec * 1000

return run_process.stdout
return run_process.stdout, execution_time_ms

except TimeoutException as e:
return f"Timeout Error: {str(e)}"
return f"Timeout Error: {str(e)}", 0.0
except Exception as e:
return f"Error: {str(e)}"
return f"Error: {str(e)}", 0.0
finally:
if os.path.exists("script.cu"):
os.remove("script.cu")
Expand Down

0 comments on commit 688bf09

Please sign in to comment.