Skip to content

Commit

Permalink
modal branch (#25)
Browse files Browse the repository at this point in the history
* modal branch

* Modal scheduler
  • Loading branch information
msaroufim authored Nov 19, 2024
1 parent ea64204 commit e60be08
Show file tree
Hide file tree
Showing 4 changed files with 119 additions and 31 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,16 @@ The key idea is that we're using Github Actions as a job scheduling engine and p

Right now the bot is running on my macbook but will move to some more permanent location

## Supported schedulers

* GitHub Actions
* Modal
* Slurm (not implemented yet)

## Usage instructions

`@Cluster-bot NVIDIA/AMD/MODAL` depending on which scheduler you want to use. MODAL is configured by default to use a T4 because that's cheap, but it works with any GPU

## Why Github Actions

Every triggered job is containerized so we don't have to worry too much about security. We are exploring a K8 like setup but it's just harder to finish in a reasonable timeframe
Expand Down
101 changes: 71 additions & 30 deletions discord-bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import subprocess
import argparse
from enum import Enum
import modal

# Configure logging
def setup_logging():
Expand All @@ -37,6 +38,13 @@ class GPUType(Enum):
NVIDIA = "nvidia_workflow.yml"
AMD = "amd_workflow.yml"

# Scheduler types enum
class SchedulerType(Enum):
    """Job-scheduling backends the bot knows about; each value is the backend's lowercase name."""

    GITHUB = "github"
    MODAL = "modal"
    # Reserved for a later implementation
    SLURM = "slurm"

def get_gpu_type(message_content):
"""
Determine GPU type based on message content
Expand All @@ -45,6 +53,14 @@ def get_gpu_type(message_content):
return GPUType.AMD
return GPUType.NVIDIA # Default to NVIDIA if not specified

def get_scheduler_type(message_content):
    """
    Determine scheduler type based on message content.

    Args:
        message_content: Text of the incoming message. ``None`` or an empty
            string is treated as "no scheduler requested".

    Returns:
        SchedulerType.MODAL if the text contains "modal" as a standalone
        word (case-insensitive), otherwise SchedulerType.GITHUB.
    """
    # Imported here so the function does not depend on a module-level import.
    import re

    # Whole-word match: a plain substring test would also select Modal for
    # words that merely contain the letters, such as "multimodal".
    if re.search(r"\bmodal\b", message_content or "", flags=re.IGNORECASE):
        return SchedulerType.MODAL
    return SchedulerType.GITHUB  # Default to GitHub Actions

def get_github_branch_name():
"""
Runs a git command to determine the remote branch name, to be used in the GitHub Workflow
Expand Down Expand Up @@ -76,6 +92,21 @@ def get_github_branch_name():
intents.message_content = True
client = discord.Client(intents=intents)

async def trigger_modal_run(script_content: str, filename: str) -> str:
    """
    Triggers a Modal run with the provided script.

    The run goes through Modal's asyncio interface (``async with app.run()``
    and ``Function.remote.aio``) so that this coroutine yields to the event
    loop while the remote job executes, instead of blocking every other
    handler for the duration of the run.

    Args:
        script_content: Python source code passed to ``run_script``.
        filename: Name of the submitted file. Accepted for interface parity
            with ``trigger_github_action``; not used by this function.

    Returns:
        The value returned by ``run_script``, or ``"Error: <message>"`` if
        importing the runner, starting the app, or the remote call raises.
    """
    logger.info("Attempting to trigger Modal run")
    try:
        # Imported at call time, so an import failure is reported through the
        # same error path as a failed run.
        from modal_runner import run_script, modal_app

        with modal.enable_output():
            async with modal_app.run():
                return await run_script.remote.aio(script_content)
    except Exception as e:
        # Top-level boundary for the Modal path: log the traceback and hand a
        # readable message back to the caller rather than raising.
        logger.error(f"Error in trigger_modal_run: {str(e)}", exc_info=True)
        return f"Error: {str(e)}"

async def trigger_github_action(script_content, filename, gpu_type):
"""
Triggers the GitHub action with custom script contents and filename
Expand Down Expand Up @@ -217,18 +248,19 @@ async def on_message(message):
for attachment in message.attachments:
logger.info(f"Processing attachment: {attachment.filename}")
if attachment.filename.endswith('.py'):
# Determine GPU type from message
# Determine GPU type and scheduler type from message
gpu_type = get_gpu_type(message.content)
logger.info(f"Selected {gpu_type.name} GPU for processing")
scheduler_type = get_scheduler_type(message.content)
logger.info(f"Selected {gpu_type.name} GPU with {scheduler_type.value} scheduler")

# Create a thread directly from the original message
thread = await message.create_thread(
name=f"{gpu_type.name} Training Job - {datetime.now().strftime('%Y-%m-%d %H:%M')}",
name=f"{scheduler_type.value.capitalize()} Job - {datetime.now().strftime('%Y-%m-%d %H:%M')}",
auto_archive_duration=1440 # Archive after 24 hours of inactivity
)

# Send initial message in the thread
await thread.send(f"Found {attachment.filename}! Starting training process on {gpu_type.name} GPU...")
await thread.send(f"Found `{attachment.filename}`! Starting process on {scheduler_type.value}...")

try:
# Download the file content
Expand All @@ -237,34 +269,43 @@ async def on_message(message):
script_content = script_content.decode('utf-8')
logger.info(f"Successfully read {attachment.filename} content")

# Trigger GitHub Action
run_id = await trigger_github_action(script_content, attachment.filename, gpu_type)
if scheduler_type == SchedulerType.MODAL:
# Run on Modal
await thread.send("Running on Modal...")
print("Script content:")
print(script_content)
print("Filename:")
print(attachment.filename)
result = await trigger_modal_run(script_content, attachment.filename)
await thread.send(f"```\nModal execution result:\n{result}\n```")

await asyncio.sleep(10)

if run_id:
logger.info(f"Successfully triggered {gpu_type.name} workflow with run ID: {run_id}")
await thread.send(f"GitHub Action triggered successfully on {gpu_type.name}! Run ID: {run_id}\nMonitoring progress...")

# Monitor the workflow
status, logs, url = await check_workflow_status(run_id, thread)
elif scheduler_type == SchedulerType.GITHUB:
# Run on GitHub Actions
run_id = await trigger_github_action(script_content, attachment.filename, gpu_type)

# Send results back to Discord thread
await thread.send(f"Training completed with status: {status}")

# Split logs if they're too long for Discord's message limit
if len(logs) > 1900:
chunks = [logs[i:i+1900] for i in range(0, len(logs), 1900)]
for i, chunk in enumerate(chunks):
await thread.send(f"```\nLogs (part {i+1}/{len(chunks)}):\n{chunk}\n```")
if run_id:
logger.info(f"Successfully triggered GitHub workflow with run ID: {run_id}")
await thread.send(f"GitHub Action triggered successfully! Run ID: {run_id}\nMonitoring progress...")

# Monitor the workflow
status, logs, url = await check_workflow_status(run_id, thread)

# Send results back to Discord thread
await thread.send(f"Training completed with status: {status}")

# Split logs if they're too long for Discord's message limit
if len(logs) > 1900:
chunks = [logs[i:i+1900] for i in range(0, len(logs), 1900)]
for i, chunk in enumerate(chunks):
await thread.send(f"```\nLogs (part {i+1}/{len(chunks)}):\n{chunk}\n```")
else:
await thread.send(f"```\nLogs:\n{logs}\n```")

if url:
await thread.send(f"View the full run at: {url}")
else:
await thread.send(f"```\nLogs:\n{logs}\n```")

if url:
await thread.send(f"View the full run at: {url}")
else:
logger.error(f"Missing run_id. Failed to trigger GitHub Action for {gpu_type.name}")
await thread.send(f"Failed to trigger GitHub Action for {gpu_type.name}. Please check the configuration.")
logger.error(f"Missing run_id. Failed to trigger GitHub Action")
await thread.send("Failed to trigger GitHub Action. Please check the configuration.")

except Exception as e:
logger.error(f"Error processing request: {str(e)}", exc_info=True)
Expand All @@ -273,7 +314,7 @@ async def on_message(message):
break

if not any(att.filename.endswith('.py') for att in message.attachments):
await message.reply("Please attach a Python file to your message. Include 'AMD' in your message to use AMD GPU, otherwise NVIDIA will be used.")
await message.reply("Please attach a Python file to your message. Include 'AMD' in your message to use AMD GPU, otherwise NVIDIA will be used. Include 'MODAL' to use Modal instead of GitHub Actions.")

# Run the bot
if __name__ == "__main__":
Expand Down
36 changes: 36 additions & 0 deletions modal_runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from modal import App, Image

# Modal app object; `run_script` below is registered on it via
# `@modal_app.function`, and callers start it with `modal_app.run()`.
modal_app = App("discord-bot-runner")

@modal_app.function(
    gpu="T4",
    image=Image.debian_slim(python_version="3.10").pip_install(["torch"])
)
def run_script(script_content: str) -> str:
    """
    Executes the provided Python script and returns everything it printed.

    The script runs as if it were the main module: ``__name__`` is set to
    ``"__main__"`` so ``if __name__ == "__main__":`` guards fire, and a single
    namespace is used so functions defined in the script can see the
    script's own imports and top-level names.

    Args:
        script_content: Python source code to execute.

    Returns:
        The text the script wrote to stdout. If the script raises an
        ``Exception``, the output captured so far followed by
        ``"Error executing script: <message>"``.
    """
    import contextlib
    import io

    captured = io.StringIO()

    # One dict acts as both globals and locals. With two separate dicts,
    # top-level definitions land in the locals dict while function bodies
    # resolve names through globals, so any function that uses a top-level
    # import or helper fails with NameError.
    namespace = {"__name__": "__main__"}

    try:
        # redirect_stdout puts back whatever stdout was installed before the
        # call, even when the script raises.
        with contextlib.redirect_stdout(captured):
            # NOTE(review): this executes submitted code as-is, by design;
            # any sandboxing has to come from the environment it runs in.
            exec(script_content, namespace)
    except Exception as e:
        # Keep whatever was printed before the failure; it is usually the
        # most useful debugging information for the submitter.
        partial = captured.getvalue()
        if partial and not partial.endswith("\n"):
            partial += "\n"
        return f"{partial}Error executing script: {str(e)}"
    return captured.getvalue()

# Manual smoke test: running this module directly executes a one-line script on Modal
if __name__ == "__main__":
    with modal_app.run():
        print(run_script.remote("print('Hello from Modal!')"))
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ aiohttp
discord.py
audioop-lts # discord.py imports using * syntax
python-dotenv
requests
requests
modal

0 comments on commit e60be08

Please sign in to comment.