# Manually dispatched training workflow: receives a script (Python or CUDA)
# as a workflow input, writes it to train.<type>, runs it on the GPU runner
# and uploads training.log as an artifact.
on:
  workflow_dispatch:
    inputs:
      script_content:
        description: 'Content of training script (Python or CUDA)'
        required: true
        type: string
      script_type:
        description: 'Script type (py or cu)'
        required: true
        type: choice
        options:
          - py
          - cu

jobs:
  train:
    runs-on: [gpumode-nvidia-arc]
    container:
      image: nvidia/cuda:12.4.0-devel-ubuntu22.04
    steps:
      # NOTE(review): the nvidia/cuda devel image may not ship Python/pip;
      # confirm that `pip` and `python` resolve inside this container.
      - name: Install Python dependencies
        if: inputs.script_type == 'py'
        run: |
          pip install numpy
          # Add other Python dependencies as needed

      # The inputs are handed to the shell through environment variables
      # instead of being interpolated into the script text. Interpolating
      # `${{ inputs.script_content }}` directly lets a content line such as
      # `EOL` terminate the heredoc early and run the remainder as shell.
      - name: Create training script
        env:
          SCRIPT_CONTENT: ${{ inputs.script_content }}
          SCRIPT_TYPE: ${{ inputs.script_type }}
        run: |
          printf '%s\n' "$SCRIPT_CONTENT" > "train.$SCRIPT_TYPE"
          cat "train.$SCRIPT_TYPE"

      - name: Compile and run CUDA script
        if: inputs.script_type == 'cu'
        run: |
          # Capture compiler output as well, so build errors reach the
          # uploaded log instead of leaving training.log missing.
          nvcc train.cu -o train_cuda > training.log 2>&1
          ./train_cuda >> training.log 2>&1

      - name: Run Python script
        if: inputs.script_type == 'py'
        run: |
          python train.py > training.log 2>&1

      # Runs even when an earlier step failed so the log is always available.
      - name: Upload logs
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: training-logs
          path: training.log
def _split_log_messages(logs, limit=1900):
    """Return code-fenced Discord messages that together contain *logs*.

    Args:
        logs: Log text to post (assumed to be a ``str`` -- confirm against
            ``check_workflow_status``).
        limit: Maximum number of log characters placed in one message; the
            fence and header are added on top of this.

    Returns:
        A list with one message when the text fits in *limit*, otherwise one
        message per chunk, each labelled ``part i/n``.
    """
    import re

    # Separate adjacent backticks with a zero-width space so log output that
    # contains ``` cannot close the surrounding code fence early.
    safe_logs = re.sub(r"`(?=`)", "`\u200b", logs)

    if len(safe_logs) <= limit:
        return [f"```\nLogs:\n{safe_logs}\n```"]

    chunks = [safe_logs[i:i + limit] for i in range(0, len(safe_logs), limit)]
    return [
        f"```\nLogs (part {part}/{len(chunks)}):\n{chunk}\n```"
        for part, chunk in enumerate(chunks, start=1)
    ]


async def process_file(message, attachment, script_type):
    """
    Generic file processor that handles both .py and .cu files

    Replies to *message*, opens a thread on that reply, reads *attachment*
    as UTF-8, dispatches the GitHub workflow and posts the resulting status,
    logs and run URL into the thread.

    Args:
        message: Message that carried the attachment; it is replied to.
        attachment: Attachment whose content is submitted as the script.
        script_type: Script extension ('py' or 'cu'); forwarded to
            ``trigger_github_action`` and used in user-facing messages.

    Any exception raised after the thread has been created is logged and
    reported in the thread rather than propagated.
    """
    # Reply to the original message
    initial_reply = await message.reply(f"Found train.{script_type}! Starting training process...")

    # Create a new thread from the reply
    thread = await initial_reply.create_thread(
        name=f"Training Job - {datetime.now().strftime('%Y-%m-%d %H:%M')}",
        auto_archive_duration=1440,  # Archive after 24 hours of inactivity
    )

    try:
        # Download the file content
        logger.info("Downloading train.%s content", script_type)
        raw_bytes = await attachment.read()
        script_content = raw_bytes.decode('utf-8')
        # Log only the size: the script is user-submitted and may be large
        # or sensitive, so its text is kept out of the bot's log.
        logger.info(
            "Successfully read train.%s content (%d characters)",
            script_type,
            len(script_content),
        )

        # Trigger GitHub Action with appropriate script_type
        run_id = await trigger_github_action(script_content, script_type)
        if not run_id:
            logger.error("Failed to trigger GitHub Action")
            await thread.send("Failed to trigger GitHub Action. Please check the configuration.")
            return

        logger.info("Successfully triggered workflow with run ID: %s", run_id)
        await thread.send(f"GitHub Action triggered successfully! Run ID: {run_id}\nMonitoring progress...")

        # Monitor the workflow
        status, logs, url = await check_workflow_status(run_id, thread)

        # Send results back to Discord thread
        await thread.send(f"Training completed with status: {status}")

        # Logs are split so each message stays under Discord's length limit
        for log_message in _split_log_messages(logs):
            await thread.send(log_message)

        if url:
            await thread.send(f"View the full run at: {url}")

    except Exception as e:
        # Top-level boundary for one job: record the traceback and tell the
        # user in the thread instead of crashing the event handler.
        logger.exception("Error processing request: %s", e)
        await thread.send(f"Error processing request: {str(e)}")
Starting training process...") - - # Create a new thread from the reply - thread = await initial_reply.create_thread( - name=f"Training Job - {datetime.now().strftime('%Y-%m-%d %H:%M')}", - auto_archive_duration=1440 # Archive after 24 hours of inactivity - ) - - try: - # Download the file content - logger.info("Downloading train.py content") - script_content = await attachment.read() - script_content = script_content.decode('utf-8') - logger.info("Successfully read train.py content") - - # Trigger GitHub Action - run_id = await trigger_github_action(script_content) - - if run_id: - logger.info(f"Successfully triggered workflow with run ID: {run_id}") - await thread.send(f"GitHub Action triggered successfully! Run ID: {run_id}\nMonitoring progress...") - - # Monitor the workflow - status, logs, url = await check_workflow_status(run_id, thread) - - # Send results back to Discord thread - await thread.send(f"Training completed with status: {status}") - - # Split logs if they're too long for Discord's message limit - if len(logs) > 1900: - chunks = [logs[i:i+1900] for i in range(0, len(logs), 1900)] - for i, chunk in enumerate(chunks): - await thread.send(f"```\nLogs (part {i+1}/{len(chunks)}):\n{chunk}\n```") - else: - await thread.send(f"```\nLogs:\n{logs}\n```") - - if url: - await thread.send(f"View the full run at: {url}") - else: - logger.error("Failed to trigger GitHub Action") - await thread.send("Failed to trigger GitHub Action. 
Please check the configuration.") - - except Exception as e: - logger.error(f"Error processing request: {str(e)}", exc_info=True) - await thread.send(f"Error processing request: {str(e)}") - + await process_file(message, attachment, 'py') + break + elif attachment.filename == "train.cu": + await process_file(message, attachment, 'cu') break - if not any(att.filename == "train.py" for att in message.attachments): - await message.reply("Please attach a file named 'train.py' to your message.") + if not any(att.filename in ["train.py", "train.cu"] for att in message.attachments): + await message.reply("Please attach a file named 'train.py' or 'train.cu' to your message.") # Run the bot if __name__ == "__main__":