Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improvements to submission UI #223

Merged
merged 3 commits into from
Mar 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 0 additions & 10 deletions src/discord-cluster-manager/bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,16 +223,6 @@ async def on_ready(self):

await self._setup_leaderboards()

async def create_thread(
self, interaction: discord.Interaction, gpu_name: str, job_name: str
) -> discord.Thread:
thread = await interaction.channel.create_thread(
name=f"{job_name} ({gpu_name})",
type=discord.ChannelType.public_thread,
auto_archive_duration=1440,
)
return thread

async def send_chunked_message(self, channel, content: str, code_block: bool = True):
"""
Send a long message in chunks to avoid Discord's message length limit
Expand Down
7 changes: 4 additions & 3 deletions src/discord-cluster-manager/cogs/github_cog.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import datetime
import json

from cogs.submit_cog import ProgressReporter, SubmitCog
from cogs.submit_cog import SubmitCog
from consts import AMD_REQUIREMENTS, NVIDIA_REQUIREMENTS, GitHubGPU, GPUType
from discord import app_commands
from github_runner import GitHubRun
from report import RunProgressReporter
from run_eval import CompileResult, EvalResult, FullResult, RunResult, SystemInfo
from utils import setup_logging

Expand All @@ -19,7 +20,7 @@ def _get_arch(self, gpu_type: app_commands.Choice[str]):
return None

async def _run_submission(
self, config: dict, gpu_type: GPUType, status: ProgressReporter
self, config: dict, gpu_type: GPUType, status: RunProgressReporter
) -> FullResult:
selected_gpu = GPUType.AMD if gpu_type.value == "amd" else GPUType.NVIDIA

Expand Down Expand Up @@ -91,7 +92,7 @@ async def _run_submission(
system = SystemInfo(**data.get("system", {}))
return FullResult(success=True, error="", runs=runs, system=system)

async def wait_callback(self, run: GitHubRun, status: ProgressReporter):
async def wait_callback(self, run: GitHubRun, status: RunProgressReporter):
await status.update(
f"⏳ Workflow [{run.run_id}]({run.html_url}): {run.status} "
f"({run.elapsed_time.total_seconds():.1f}s)"
Expand Down
38 changes: 14 additions & 24 deletions src/discord-cluster-manager/cogs/leaderboard_cog.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from discord import app_commands
from discord.ext import commands, tasks
from leaderboard_db import LeaderboardDB, leaderboard_name_autocomplete
from report import MultiProgressReporter, RunProgressReporter
from run_eval import FullResult
from task import LeaderboardTask
from ui.misc import GPUSelectionView
Expand Down Expand Up @@ -45,25 +46,22 @@ async def async_submit_cog_job(
script: discord.Attachment,
command: Callable,
task: LeaderboardTask,
submission_content,
reporter: RunProgressReporter,
gpu: app_commands.Choice[str],
runner_name: str,
mode: SubmissionMode,
submission_id: int,
):
discord_thread, result = await command(
result = await command(
interaction,
script,
gpu,
reporter,
task=task,
mode=mode,
)
result: FullResult

# no point going further if this already failed
if discord_thread is None:
return -1

try:
if result.success:
score = None
Expand Down Expand Up @@ -93,11 +91,13 @@ async def async_submit_cog_job(
system=result.system,
)
except Exception as e:
logger.error("Error in leaderboard submission", exc_info=e)
await discord_thread.send(
logger.error("Error in recording leaderboard submission", exc_info=e)
await send_discord_message(
interaction,
"## Result:\n"
+ f"Leaderboard submission to '{leaderboard_name}' on {gpu.name} "
+ f"using {runner_name} runners failed!\n",
+ f"using {runner_name} could not be recorded in the database!\n",
ephemeral=True,
)

async def select_gpu_view(
Expand Down Expand Up @@ -251,7 +251,6 @@ async def on_submit_hook( # noqa: C901
await interaction.response.defer(ephemeral=True)

if cmd_gpus is not None:
needs_followup = True
selected_gpus = []
for g in cmd_gpus:
if g in task_gpus:
Expand All @@ -266,10 +265,8 @@ async def on_submit_hook( # noqa: C901
)
return -1
elif len(task_gpus) == 1:
needs_followup = True
selected_gpus = task_gpus
else:
needs_followup = False
view = await self.select_gpu_view(interaction, leaderboard_name, task_gpus)
selected_gpus = view.selected_gpus

Expand All @@ -291,15 +288,8 @@ async def on_submit_hook( # noqa: C901
user_name=user_name,
)

if needs_followup:
gpu_list = ", ".join([f"**{g.name}**" for g in selected_gpus])
plural = "" if len(selected_gpus) == 1 else "s"
await send_discord_message(
interaction,
f"Running for `{leaderboard_name}` on GPU{plural}: {gpu_list}",
ephemeral=True,
)

run_msg = f"Submission **{sub_id}**: `{script.filename}` for `{leaderboard_name}`"
reporter = MultiProgressReporter(run_msg)
try:
tasks = [
self.async_submit_cog_job(
Expand All @@ -308,7 +298,7 @@ async def on_submit_hook( # noqa: C901
script,
command,
task,
submission_content,
reporter.add_run(f"{gpu.name} on {gpu.runner}"),
app_commands.Choice(name=gpu.name, value=gpu.value),
gpu.runner,
mode,
Expand All @@ -326,15 +316,15 @@ async def on_submit_hook( # noqa: C901
script,
command,
task,
submission_content,
reporter.add_run(f"{gpu.name} on {gpu.runner} (secret)"),
app_commands.Choice(name=gpu.name, value=gpu.value),
gpu.runner,
SubmissionMode.PRIVATE,
sub_id,
)
for gpu, command in zip(selected_gpus, commands, strict=False)
]

await reporter.show(interaction)
await asyncio.gather(*tasks)
finally:
with self.bot.leaderboard_db as db:
Expand Down
9 changes: 5 additions & 4 deletions src/discord-cluster-manager/cogs/modal_cog.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import asyncio

import modal
from cogs.submit_cog import ProgressReporter, SubmitCog
from cogs.submit_cog import SubmitCog
from consts import GPU_TO_SM, MODAL_CUDA_INCLUDE_DIRS, GPUType, ModalGPU
from discord import app_commands
from report import RunProgressReporter
from run_eval import FullResult
from utils import setup_logging

Expand All @@ -18,17 +19,17 @@ def _get_arch(self, gpu_type: app_commands.Choice[str]):
return GPU_TO_SM[gpu_type.value.upper()]

async def _run_submission(
self, config: dict, gpu_type: GPUType, status: ProgressReporter
self, config: dict, gpu_type: GPUType, status: RunProgressReporter
) -> FullResult:
loop = asyncio.get_event_loop()
if config["lang"] == "cu":
config["include_dirs"] = config.get("include_dirs", []) + MODAL_CUDA_INCLUDE_DIRS
func_type = "pytorch" if config["lang"] == "py" else "cuda"
func_name = f"run_{func_type}_script_{gpu_type.value.lower()}"

logger.info(f"Starting modal run using {func_name}")
logger.info(f"Starting Modal run using {func_name}")

await status.push("⏳ Waiting for modal run to finish...")
await status.push("⏳ Waiting for Modal run to finish...")

result = await loop.run_in_executor(
None,
Expand Down
94 changes: 34 additions & 60 deletions src/discord-cluster-manager/cogs/submit_cog.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from enum import Enum
from typing import TYPE_CHECKING, Optional, Tuple, Type
from typing import TYPE_CHECKING, Optional, Type

if TYPE_CHECKING:
from bot import ClusterBot
Expand All @@ -9,46 +9,14 @@
from consts import SubmissionMode
from discord import app_commands
from discord.ext import commands
from report import generate_report, private_run_report
from report import MultiProgressReporter, RunProgressReporter, generate_report, private_run_report
from run_eval import FullResult
from task import LeaderboardTask
from utils import build_task_config, send_discord_message, setup_logging, with_error_handling

logger = setup_logging()


class ProgressReporter:
def __init__(self, status_msg: discord.Message, header: str):
self.header = header
self.lines = []
self.status = status_msg

@staticmethod
async def make_reporter(thread: discord.Thread, content: str):
status_msg = await thread.send(f"**{content}**\n")
return ProgressReporter(status_msg, content)

async def push(self, content: str | list[str]):
if isinstance(content, str):
self.lines.append(f"> {content}")
else:
for line in content:
self.lines.append(f"> {line}")
await self._update_message()

async def update(self, new_content: str):
self.lines[-1] = f"> {new_content}"
await self._update_message()

async def update_header(self, new_header):
self.header = new_header
await self._update_message()

async def _update_message(self):
message = str.join("\n", [f"**{self.header}**"] + self.lines)
await self.status.edit(content=message, suppress=True)


class SubmitCog(commands.Cog):
"""
Base class for code submission / run schedular cogs.
Expand Down Expand Up @@ -100,21 +68,23 @@ async def submit_leaderboard(
interaction: discord.Interaction,
script: discord.Attachment,
gpu_type: app_commands.Choice[str],
reporter: RunProgressReporter,
task: LeaderboardTask,
mode: SubmissionMode,
) -> Tuple[Optional[discord.Thread], Optional[FullResult]]:
) -> Optional[FullResult]:
"""
Function invoked by `leaderboard_cog` to handle a leaderboard run.
"""
thread, result = await self._handle_submission(
result = await self._handle_submission(
interaction,
gpu_type,
reporter,
script=script,
task=task,
mode=mode,
)

return thread, result
return result

@with_error_handling
async def run_script(
Expand All @@ -126,18 +96,22 @@ async def run_script(
"""
Function invoked by the `run` command to run a single script.
"""
reporter = MultiProgressReporter("Script run")
rep = reporter.add_run(f"{gpu_type.name}")
await reporter.show(interaction)
await self._handle_submission(
interaction, gpu_type, script=script, task=None, mode=SubmissionMode.SCRIPT
interaction, gpu_type, rep, script=script, task=None, mode=SubmissionMode.SCRIPT
)

async def _handle_submission(
self,
interaction: discord.Interaction,
gpu_type: app_commands.Choice[str],
reporter: RunProgressReporter,
script: discord.Attachment,
task: Optional[LeaderboardTask],
mode: SubmissionMode,
) -> Tuple[Optional[discord.Thread], Optional[FullResult]]:
) -> Optional[FullResult]:
"""
Generic function to handle code submissions.
Args:
Expand All @@ -147,49 +121,49 @@ async def _handle_submission(
task: Task specification, of provided

Returns:
if successful, returns the created discord thread, and the result of
the run.
if successful, returns the result of the run.
"""
thread_name = f"{self.name} - {mode.value.capitalize()} Job"

script_content = await self._validate_input_file(interaction, script)
if script_content is None:
return None, None
return None

# TODO figure out the correct way to handle messaging here
thread = await self.bot.create_thread(interaction, gpu_type.name, f"{thread_name}")
run_msg = (
f"Running {mode.value.capitalize()} job for `{script.filename}`"
f" by {interaction.user.display_name} on {self.name} with {gpu_type.name}"
)
status = await ProgressReporter.make_reporter(thread, f"{run_msg}...")

if mode != SubmissionMode.PRIVATE:
thread = await interaction.channel.create_thread(
name=f"{script.filename} on {gpu_type.name} ({self.name})",
type=discord.ChannelType.private_thread,
auto_archive_duration=1440,
)
await thread.add_tags(interaction.user)
config = build_task_config(
task=task, submission_content=script_content, arch=self._get_arch(gpu_type), mode=mode
)

logger.info("submitting task to runner %s", self.name)

result = await self._run_submission(config, gpu_type, status)
result = await self._run_submission(config, gpu_type, reporter)

if not result.success:
await status.update_header(f"{run_msg}... ❌ failure")
await status.push(result.error)
return thread, result
await reporter.update_title(reporter.title + " ❌ failure")
await reporter.push(result.error)
return result
else:
await status.update_header(f"{run_msg}... ✅ success")
await reporter.update_title(reporter.title + " ✅ success")

if mode == SubmissionMode.PRIVATE:
lines = private_run_report(result.runs)
await status.push(lines)
await reporter.push(private_run_report(result.runs))
else:
if mode == SubmissionMode.LEADERBOARD:
await reporter.push(private_run_report(result.runs))

try:
await generate_report(thread, result.runs)
await reporter.push(f"See results at {thread.jump_url}")
except Exception as E:
logger.error("Error generating report. Result: %s", result, exc_info=E)
raise

return thread, result
return result

async def _validate_input_file(
self,
Expand Down Expand Up @@ -225,7 +199,7 @@ async def _validate_input_file(
return None

async def _run_submission(
self, config: dict, gpu_type: app_commands.Choice[str], status: ProgressReporter
self, config: dict, gpu_type: app_commands.Choice[str], status: RunProgressReporter
) -> FullResult:
"""
Run a submission specified by `config`.
Expand Down
Loading
Loading