gpu-mode · S1ro1 · Nov 9, 2024 · Nov 9, 2024 · Nov 9, 2024 · Nov 9, 2024
diff --git a/.github/workflows/train_workflow.yml b/.github/workflows/train_workflow.yml
@@ -3,28 +3,50 @@ on:
   workflow_dispatch:
     inputs:
       script_content:
-        description: 'Content of train.py'
+        description: 'Content of training script (Python or CUDA)'
         required: true
-        type: string  # Explicitly specify the type
+        type: string
+      script_type:  # becomes the train.<ext> file extension and selects which run step executes
+        description: 'Script type (py or cu)'
+        required: true
+        type: choice
+        options:
+          - py
+          - cu
 
 jobs:
   train:
-    runs-on: ubuntu-latest
+    runs-on: [gpumode-nvidia-arc]  # custom runner label; presumably a self-hosted NVIDIA runner (confirm)
+    container:  # job steps execute inside this CUDA development image
+      image: nvidia/cuda:12.4.0-devel-ubuntu22.04
     steps:
-      - name: Install dependencies
+      - name: Install Python dependencies  # NOTE(review): confirm pip exists in the container image
+        if: inputs.script_type == 'py'  # CUDA submissions skip this step
         run: |
           pip install numpy
-          # pip install torch - need to find a way to cache this otherwise it will take a long time to install
+          # Add other Python dependencies as needed
+
+      - name: Create training script
+        env:  # hand the script over as data so the shell never parses it as commands
+          SCRIPT_CONTENT: ${{ inputs.script_content }}
+        run: |
+          printf '%s\n' "$SCRIPT_CONTENT" > train.${{ inputs.script_type }}
+          cat train.${{ inputs.script_type }}
 
-      - name: Create and run training script
+      - name: Compile and run CUDA script
+        if: inputs.script_type == 'cu'
+        run: |
+          nvcc train.cu -o train_cuda > training.log 2>&1  # keep compiler errors in the uploaded log
+          ./train_cuda >> training.log 2>&1
+
+      - name: Run Python script
+        if: inputs.script_type == 'py'  # CUDA submissions are handled by the compile-and-run step
         run: |
-          echo "${{ inputs.script_content }}" > train.py
-          cat train.py  # Debug: print the content
           python train.py > training.log 2>&1
-        
+
       - name: Upload logs
-        uses: actions/upload-artifact@v3
-        if: always()  # Upload logs whether the job succeeds or fails
+        uses: actions/upload-artifact@v4  # v3 is deprecated and rejected on github.com
+        if: always()
         with:
           name: training-logs
-          path: training.log
+          path: training.log
diff --git a/discord-bot.py b/discord-bot.py
@@ -59,7 +59,7 @@ def get_github_branch_name():
 client = discord.Client(intents=intents)
 
 
-async def trigger_github_action(script_content):
+async def trigger_github_action(script_content, script_type):
     """
     Triggers the GitHub action with custom train.py contents
     """
@@ -74,7 +74,7 @@ async def trigger_github_action(script_content):
         workflow = repo.get_workflow("train_workflow.yml")
         logger.info("Found workflow, attempting to dispatch")
 
-        success = workflow.create_dispatch(get_github_branch_name(), {'script_content': script_content})
+        success = workflow.create_dispatch(get_github_branch_name(), {'script_content': script_content, 'script_type': script_type})
         logger.info(f"Workflow dispatch result: {success}")
 
         if success:
@@ -172,6 +172,60 @@ async def on_ready():
         except Exception as e:
             logger.warning(f'Failed to update nickname in guild {guild.name}: {e}')
 
+
+async def process_file(message, attachment, script_type):
+    """
+    Run an attached train.py / train.cu file through the GitHub workflow.
+
+    Replies to the message, opens a thread from that reply, dispatches the
+    workflow, and posts the run's status, logs, and URL to the thread.
+    """
+    # Acknowledge the upload; the reply anchors the job's thread
+    ack = await message.reply(f"Found train.{script_type}! Starting training process...")
+    started = datetime.now().strftime('%Y-%m-%d %H:%M')
+    thread = await ack.create_thread(
+        name=f"Training Job - {started}",
+        auto_archive_duration=1440,
+    )
+
+    try:
+        # Fetch the attachment and decode it as UTF-8 text
+        logger.info(f"Downloading train.{script_type} content")
+        raw_bytes = await attachment.read()
+        script_content = raw_bytes.decode('utf-8')
+        logger.info(script_content)
+        logger.info(f"Successfully read train.{script_type} content")
+
+        # Dispatch the workflow; a falsy run ID means the dispatch failed
+        run_id = await trigger_github_action(script_content, script_type)
+        if not run_id:
+            logger.error("Failed to trigger GitHub Action")
+            await thread.send("Failed to trigger GitHub Action. Please check the configuration.")
+            return
+
+        logger.info(f"Successfully triggered workflow with run ID: {run_id}")
+        await thread.send(f"GitHub Action triggered successfully! Run ID: {run_id}\nMonitoring progress...")
+
+        # Wait for the run to finish, then report its status
+        status, logs, url = await check_workflow_status(run_id, thread)
+        await thread.send(f"Training completed with status: {status}")
+
+        # Discord caps message length, so long logs go out in 1900-char parts
+        limit = 1900
+        if len(logs) <= limit:
+            await thread.send(f"```\nLogs:\n{logs}\n```")
+        else:
+            parts = [logs[pos:pos + limit] for pos in range(0, len(logs), limit)]
+            for idx, part in enumerate(parts, start=1):
+                await thread.send(f"```\nLogs (part {idx}/{len(parts)}):\n{part}\n```")
+
+        if url:
+            await thread.send(f"View the full run at: {url}")
+
+    except Exception as e:
+        logger.error(f"Error processing request: {str(e)}", exc_info=True)
+        await thread.send(f"Error processing request: {str(e)}")
+
 @client.event
 async def on_message(message):
     # Ignore messages from the bot itself
@@ -185,57 +239,14 @@ async def on_message(message):
             for attachment in message.attachments:
                 logger.info(f"Processing attachment: {attachment.filename}")
                 if attachment.filename == "train.py":
-                    # Reply to the original message
-                    initial_reply = await message.reply("Found train.py! Starting training process...")
-
-                    # Create a new thread from the reply
-                    thread = await initial_reply.create_thread(
-                        name=f"Training Job - {datetime.now().strftime('%Y-%m-%d %H:%M')}",
-                        auto_archive_duration=1440  # Archive after 24 hours of inactivity
-                    )
-
-                    try:
-                        # Download the file content
-                        logger.info("Downloading train.py content")
-                        script_content = await attachment.read()
-                        script_content = script_content.decode('utf-8')
-                        logger.info("Successfully read train.py content")
-
-                        # Trigger GitHub Action
-                        run_id = await trigger_github_action(script_content)
-
-                        if run_id:
-                            logger.info(f"Successfully triggered workflow with run ID: {run_id}")
-                            await thread.send(f"GitHub Action triggered successfully! Run ID: {run_id}\nMonitoring progress...")
-
-                            # Monitor the workflow
-                            status, logs, url = await check_workflow_status(run_id, thread)
-
-                            # Send results back to Discord thread
-                            await thread.send(f"Training completed with status: {status}")
-
-                            # Split logs if they're too long for Discord's message limit
-                            if len(logs) > 1900:
-                                chunks = [logs[i:i+1900] for i in range(0, len(logs), 1900)]
-                                for i, chunk in enumerate(chunks):
-                                    await thread.send(f"```\nLogs (part {i+1}/{len(chunks)}):\n{chunk}\n```")
-                            else:
-                                await thread.send(f"```\nLogs:\n{logs}\n```")
-
-                            if url:
-                                await thread.send(f"View the full run at: {url}")
-                        else:
-                            logger.error("Failed to trigger GitHub Action")
-                            await thread.send("Failed to trigger GitHub Action. Please check the configuration.")
-
-                    except Exception as e:
-                        logger.error(f"Error processing request: {str(e)}", exc_info=True)
-                        await thread.send(f"Error processing request: {str(e)}")
-
+                    await process_file(message, attachment, 'py')
+                    break
+                elif attachment.filename == "train.cu":
+                    await process_file(message, attachment, 'cu')
                     break
 
-            if not any(att.filename == "train.py" for att in message.attachments):
-                await message.reply("Please attach a file named 'train.py' to your message.")
+            if not any(att.filename in ["train.py", "train.cu"] for att in message.attachments):
+                await message.reply("Please attach a file named 'train.py' or 'train.cu' to your message.")
 
 # Run the bot
 if __name__ == "__main__":