Skip to content

Commit

Permalink
Working submission / eval
Browse files Browse the repository at this point in the history
  • Loading branch information
alexzhang13 committed Dec 8, 2024
1 parent 87fa46a commit 83ee787
Show file tree
Hide file tree
Showing 8 changed files with 160 additions and 21 deletions.
64 changes: 62 additions & 2 deletions .github/workflows/amd_workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,22 @@ on:
description: 'Name of Python script'
required: true
type: string
reference_content:
description: 'Content of the reference code script (optional)'
required: false
type: string
reference_filename:
description: 'Name of reference script (supports .py or .cu)'
required: false
type: string
eval_content:
description: 'Content of the outer eval code script (optional)'
required: false
type: string
eval_filename:
description: 'Name of outer eval script (supports .py or .cu)'
required: false
type: string

jobs:
train:
Expand All @@ -30,6 +46,32 @@ jobs:
with open('${{ github.event.inputs.filename }}', 'w') as f:
f.write('''${{ github.event.inputs.script_content }}''')
- name: Create reference scripts if provided
  shell: bash
  # The workflow inputs are user-submitted text. They are handed to the
  # script through environment variables instead of being interpolated
  # into the script body, so `$`, backticks, quotes, or a line reading
  # `EOL` inside the submitted code are written verbatim rather than
  # being expanded or executed by bash.
  env:
    REFERENCE_FILENAME: ${{ github.event.inputs.reference_filename }}
    REFERENCE_CONTENT: ${{ github.event.inputs.reference_content }}
  run: |
    if [[ -n "$REFERENCE_FILENAME" ]]; then
      echo "Creating reference script..."
      # printf '%s\n' writes the content followed by one newline, which is
      # what the previous heredoc produced.
      printf '%s\n' "$REFERENCE_CONTENT" > "$REFERENCE_FILENAME"
      cat "$REFERENCE_FILENAME"  # Debug: Show file contents
    else
      echo "No reference content provided."
    fi
- name: Create eval scripts if provided
  shell: bash
  # The workflow inputs are user-submitted text. They are handed to the
  # script through environment variables instead of being interpolated
  # into the script body, so `$`, backticks, quotes, or a line reading
  # `EOL` inside the submitted code are written verbatim rather than
  # being expanded or executed by bash.
  env:
    EVAL_FILENAME: ${{ github.event.inputs.eval_filename }}
    EVAL_CONTENT: ${{ github.event.inputs.eval_content }}
  run: |
    if [[ -n "$EVAL_FILENAME" ]]; then
      echo "Creating eval script..."
      # printf '%s\n' writes the content followed by one newline, which is
      # what the previous heredoc produced.
      printf '%s\n' "$EVAL_CONTENT" > "$EVAL_FILENAME"
      cat "$EVAL_FILENAME"  # Debug: Show file contents
    else
      echo "No eval content provided."
    fi
- name: Setup Virtual Environment and Install Dependencies
run: |
python -m venv ${VENV_DIR}
Expand All @@ -38,8 +80,26 @@ jobs:
pip install --pre pytorch-triton-rocm==3.1.0+cf34004b8a torch==2.6.0.dev20241023+rocm6.2 --index-url https://download.pytorch.org/whl/nightly/rocm6.2
- name: Run script
shell: bash
run: |
python "${{ github.event.inputs.filename }}" > training.log 2>&1
if [[ -n "${{ github.event.inputs.eval_content }}" ]]; then
echo "Running Python file..."
python3 "${{ github.event.inputs.eval_filename }}" > training.log 2>&1
cat training.log # Debug: show output
else
echo "Running Python file..."
python3 "${{ github.event.inputs.filename }}" > training.log 2>&1
cat training.log # Debug: show output
fi
- name: Extract score
  shell: bash
  # NOTE(review): this step declares no `id`, so later steps or jobs cannot
  # address its `score` output — confirm whether one should be added.
  run: |
    # Extract only the last occurrence of "score:". \K drops the
    # "score:" prefix (and any whitespace after it) from the match, so the
    # number is captured whether the log reads "score:1.5" or "score: 1.5".
    score=$(grep -oP 'score:\s*\K\d+(\.\d+)?' training.log | tail -n 1)
    echo "Score extracted: $score"
    # The ::set-output workflow command is deprecated; step outputs are
    # written to the file named by $GITHUB_OUTPUT instead.
    echo "score=$score" >> "$GITHUB_OUTPUT"
- name: Upload training artifacts
uses: actions/upload-artifact@v4
Expand All @@ -48,4 +108,4 @@ jobs:
name: training-artifacts
path: |
training.log
${{ github.event.inputs.filename }}
${{ github.event.inputs.filename }}
2 changes: 1 addition & 1 deletion .github/workflows/nvidia_workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ jobs:
shell: bash
run: |
if [[ -n "${{ github.event.inputs.eval_filename }}" ]]; then
echo "Creating reference script..."
echo "Creating eval script..."
cat > "${{ github.event.inputs.eval_filename }}" <<EOL
${{ github.event.inputs.eval_content }}
EOL
Expand Down
9 changes: 7 additions & 2 deletions src/discord-cluster-manager/cogs/github_cog.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ async def run_github(
gpu_type: app_commands.Choice[str],
use_followup: bool = False,
reference_script: discord.Attachment = None,
reference_code: str = None,
) -> discord.Thread:
if not script.filename.endswith(".py") and not script.filename.endswith(".cu"):
await interaction.response.send_message(
Expand All @@ -59,8 +60,12 @@ async def run_github(
script_content = (await script.read()).decode("utf-8")
selected_gpu = GPUType.AMD if gpu_type.value == "amd" else GPUType.NVIDIA

if reference_script is not None:
reference_content = (await reference_script.read()).decode("utf-8")
if reference_script is not None or reference_code is not None:
reference_content = (
reference_code
if reference_code is not None
else (await reference_script.read()).decode("utf-8")
)
eval_code = py_eval if script.filename.endswith(".py") else cu_eval

run_id = await self.trigger_github_action(
Expand Down
48 changes: 41 additions & 7 deletions src/discord-cluster-manager/cogs/leaderboard_cog.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from typing import TYPE_CHECKING
from consts import GitHubGPU, ModalGPU
from utils import extract_score

import random

Expand Down Expand Up @@ -100,7 +101,7 @@ async def submit_modal(
)
@app_commands.choices(
gpu_type=[
app_commands.Choice(name=gpu.value, value=gpu.value) for gpu in GitHubGPU
app_commands.Choice(name=gpu.name, value=gpu.value) for gpu in GitHubGPU
]
)
async def submit_github(
Expand All @@ -113,9 +114,23 @@ async def submit_github(
shape: app_commands.Choice[str] = None,
):
try:
if not interaction.response.is_done():
await interaction.response.defer()

# Read the template file
submission_content = await script.read()

# Read and convert reference code
reference_code = None
with self.bot.leaderboard_db as db:
leaderboard_item = db.get_leaderboard(leaderboard_name)
if not leaderboard_item:
await interaction.response.send_message(
f"Leaderboard {leaderboard_name} not found.", ephemeral=True
)
return
reference_code = leaderboard_item["reference_code"]

# Call GH runner
github_cog = self.bot.get_cog("GitHubCog")

Expand All @@ -124,9 +139,27 @@ async def submit_github(
return

github_command = github_cog.run_github
print(github_command)
try:
github_thread = await github_command.callback(
github_cog,
interaction,
script,
gpu_type,
reference_code=reference_code,
use_followup=True,
)
except discord.errors.NotFound as e:
print(f"Webhook not found: {e}")
await interaction.followup.send("❌ The webhook was not found.")

message_contents = [
msg.content async for msg in github_thread.history(limit=None)
]

# Compute eval or submission score, call runner here.
score = random.random()
# TODO: Make this more robust later
score = extract_score("".join(message_contents))

with self.bot.leaderboard_db as db:
db.create_submission({
Expand All @@ -138,9 +171,10 @@ async def submit_github(
"submission_score": score,
})

await interaction.response.send_message(
f"Ran on GH. Leaderboard '{leaderboard_name}'. Submission title: {script.filename}. Submission user: {interaction.user.id}. Runtime: {score} ms",
ephemeral=True,
await interaction.followup.send(
f""""Ran on GH. Leaderboard '{leaderboard_name}'.
Submission title: {script.filename}. Submission user: {interaction.user.id}.
Runtime: {score} ms""",
)
except ValueError:
await interaction.response.send_message(
Expand Down Expand Up @@ -228,7 +262,7 @@ async def leaderboard_create(
})

await interaction.response.send_message(
f"Leaderboard '{leaderboard_name}'. Submission deadline: {date_value}",
f"Leaderboard '{leaderboard_name}'. Reference code: {reference_code}. Submission deadline: {date_value}",
ephemeral=True,
)
except ValueError:
Expand All @@ -245,7 +279,7 @@ async def get_leaderboard_submissions(
dtype: app_commands.Choice[str] = "fp32",
):
with self.bot.leaderboard_db as db:
leaderboard_id = db.get_leaderboard_id(leaderboard_name)
leaderboard_id = db.get_leaderboard(leaderboard_name)["id"]
if not leaderboard_id:
await interaction.response.send_message(
"Leaderboard not found.", ephemeral=True
Expand Down
3 changes: 2 additions & 1 deletion src/discord-cluster-manager/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ class SchedulerType(Enum):


class GitHubGPU(Enum):
T4 = "T4"
NVIDIA = "nvidia"
AMD = "amd"


class ModalGPU(Enum):
Expand Down
21 changes: 14 additions & 7 deletions src/discord-cluster-manager/leaderboard_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,20 @@ def get_leaderboards(self) -> list[LeaderboardItem]:
for lb in self.cursor.fetchall()
]

def get_leaderboard(self, leaderboard_name: str) -> LeaderboardItem | None:
    """
    Look up a single leaderboard by name.

    Args:
        leaderboard_name: Value compared against the `name` column of the
            `leaderboard` table.

    Returns:
        A LeaderboardItem built from the first matching row, or None when
        no row matches.
    """
    self.cursor.execute(
        "SELECT * FROM leaderboard WHERE name = %s", (leaderboard_name,)
    )

    res = self.cursor.fetchone()
    if not res:
        return None

    # NOTE(review): the positional indexes rely on the table's column order
    # being (id, name, deadline, reference_code) — confirm against the schema.
    return LeaderboardItem(
        id=res[0], name=res[1], deadline=res[2], reference_code=res[3]
    )

def get_leaderboard_submissions(
self, leaderboard_name: str
) -> list[SubmissionItem]:
Expand All @@ -164,13 +178,6 @@ def get_leaderboard_submissions(
for submission in self.cursor.fetchall()
]

def get_leaderboard_id(self, leaderboard_name: str) -> int | None:
self.cursor.execute("SELECT * FROM leaderboard", (leaderboard_name,))

res = self.cursor.fetchone()

return res[0] if res else None


if __name__ == "__main__":
print(
Expand Down
3 changes: 2 additions & 1 deletion src/discord-cluster-manager/leaderboard_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
from reference import metric
def main():
metric()
s = metric()
print(f'score:{s}')
if __name__ == '__main__':
main()
Expand Down
31 changes: 31 additions & 0 deletions src/discord-cluster-manager/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
import subprocess
import datetime
import re
from typing import TypedDict


Expand Down Expand Up @@ -37,6 +38,36 @@ def get_github_branch_name():
return "main"


def get_user_from_id(id, interaction, bot):
    """
    Resolve a user id to a display name, falling back to the id itself.

    When the interaction has a guild, the id is looked up with
    `interaction.guild.get_member`; otherwise (e.g. in DMs) it is looked up
    with `bot.get_user`. If the lookup yields nothing, `id` is returned
    unchanged.
    """
    guild = interaction.guild
    # Only one of the two lookups runs, depending on where the interaction
    # happened.
    found = guild.get_member(id) if guild else bot.get_user(id)
    return found.name if found else id


def extract_score(score_str: str) -> float:
"""
Extract score from output logs and push to DB (kind of hacky).
"""
match = re.search(r"score:\s*(-?\d+\.\d+)", score_str)
if match:
return float(match.group(1))
else:
return None


class LeaderboardItem(TypedDict):
name: str
deadline: datetime.datetime
Expand Down

0 comments on commit 83ee787

Please sign in to comment.