#!/bin/bash
# This script lists the user's available nodes in the cluster and their load
# using the SLURM CLI.

#######################################
# Print an error message and terminate the current (sub)shell.
# The message is written to stderr so that it is never captured as data when
# the calling function runs inside a command substitution.
# Arguments: $1 - human readable error message
# Outputs:   "Error: <message>" on stderr
# Returns:   never returns; exits with status 1
#######################################
error_exit() {
  local error_message=${1:-}
  printf 'Error: %s\n' "$error_message" >&2
  exit 1
}

#######################################
# Abort the script unless sacctmgr reports something for the given user.
# Arguments: $1 - user name to look up
# Outputs:   an error message on stderr when the lookup fails or is empty
# Returns:   0 when sacctmgr printed at least one row; exits 1 otherwise
#######################################
check_user_is_registered() {
  local username=$1
  local user_in_sacctmgr
  # Declaration and assignment are separate so a failing sacctmgr is reported
  # instead of being hidden behind the exit status of "local".
  user_in_sacctmgr=$(sacctmgr show user "$username" --noheader) \
    || error_exit "Failed to query user $username."
  if [[ -z "$user_in_sacctmgr" ]]; then
    error_exit "User $username does not exist."
  fi
}

#######################################
# List the QoS names with their GrpTRES limits.
# Outputs: the rows printed by sacctmgr (callers split them as Name|GrpTRES)
# Returns: exits 1 with a message when sacctmgr fails
#######################################
list_partitions_qos_limits() {
  sacctmgr -P -r show qos format=Name,GrpTRES --noheader \
    || error_exit "Failed to get partitions qos limits."
}

#######################################
# List the load of every partition.
# Outputs: one '|'-separated row per sinfo record, in the column order
#          Partition, CPUsState, NodeList, Gres, GresUsed
# Returns: exits 1 with a message when sinfo fails
#######################################
list_all_partitions_load() {
  sinfo --Format="Partition:|,CPUsState:|,NodeList:|,Gres:|,GresUsed:" --noheader \
    || error_exit "Failed to get node load information."
}

#######################################
# List the partitions found in the user's associations.
# Arguments: $1 - user name
# Outputs:   one partition name per line, as printed by sacctmgr
# Returns:   exits 1 with a message when sacctmgr fails
#######################################
list_user_partitions() {
  local username=$1
  sacctmgr show assoc user="$username" format=partition -P --noheader \
    || error_exit "Failed to get user partitions."
}

#######################################
# List every partition definition, one record per line.
# Outputs: the "Key=Value Key=Value ..." lines printed by scontrol
# Returns: exits 1 with a message when scontrol fails
#######################################
list_all_partitions_qos() {
  scontrol show partition --oneliner \
    || error_exit "Failed to get partitions qos information."
}

#######################################
# Turn a newline separated list into a grep -E alternation ("a|b|c").
# Blank lines are skipped: an empty alternative would match every input line.
# Arguments: $1 - newline separated list (may be empty)
# Outputs:   the alternation on stdout; an empty line for an empty list
#######################################
format_grep_list() {
  local list=${1:-}
  local grep_list=""
  local partition
  while IFS= read -r partition; do
    [[ -n "$partition" ]] || continue
    # ${grep_list:+|} adds the separator only between entries.
    grep_list+="${grep_list:+|}$partition"
  done <<<"$list"
  printf '%s\n' "$grep_list"
}

#######################################
# Extract the partition name and QoS from one scontrol partition record.
# Arguments: $1 - a "Key=Value Key=Value ..." record
# Outputs:   "<PartitionName>|<QoS>"; a field is empty when its key is absent
#######################################
parse_partition_qos() {
  local partition_qos=${1:-}
  # Initialised to "" so a record without these keys prints "|" instead of
  # tripping "unbound variable" when the script runs under set -u.
  local partition_name=""
  local qos=""
  local -a properties=()
  local property key value i
  IFS=' ' read -r -a properties <<<"$partition_qos"
  # Index loop: expanding an empty array with "${arr[@]}" fails under set -u
  # on older bash releases.
  for ((i = 0; i < ${#properties[@]}; i++)); do
    property=${properties[i]}
    IFS='=' read -r key value <<<"$property"
    case "$key" in
      PartitionName)
        partition_name=$value
        ;;
      QoS)
        qos=$value
        ;;
    esac
  done
  printf '%s|%s\n' "$partition_name" "$qos"
}

#######################################
# Convert a "allocated/idle/other/total" CPU state into a JSON object.
# Arguments: $1 - the '/'-separated CPU state (missing parts become "")
# Outputs:   {"allocated":"..","idle":"..","other":"..","total":".."}
#######################################
parse_cpus_state() {
  local cpus_state=${1:-}
  local allocated idle other total
  IFS='/' read -r allocated idle other total <<<"$cpus_state"
  printf '{"allocated":"%s","idle":"%s","other":"%s","total":"%s"}\n' \
    "$allocated" "$idle" "$other" "$total"
}

#######################################
# Convert a comma separated TRES limit list (e.g. "cpu=8,gres/gpu=2,mem=1G")
# into a JSON object holding the cpu, gpu and mem limits.
# Arguments: $1 - the limit list; may be empty
# Outputs:   {"cpu":"..","gpu":"..","mem":".."}; a limit that is not present
#            is emitted as the string "null" (kept as a string on purpose so
#            every value of the object has the same type)
#######################################
parse_qos() {
  local input_string=${1:-}
  local cpu="null"
  local gpu="null"
  local mem="null"
  local -a kv_pairs=()
  local kv key value i

  if [[ -n "$input_string" ]]; then
    # Split the string by commas and process each key-value pair
    IFS=',' read -r -a kv_pairs <<<"$input_string"
    for ((i = 0; i < ${#kv_pairs[@]}; i++)); do
      kv=${kv_pairs[i]}
      IFS='=' read -r key value <<<"$kv"
      case "$key" in
        "cpu")
          cpu=$value
          ;;
        "gres/gpu")
          gpu=$value
          ;;
        "mem")
          mem=$value
          ;;
      esac
    done
  fi

  printf '{"cpu":"%s","gpu":"%s","mem":"%s"}\n' "$cpu" "$gpu" "$mem"
}
#######################################
# Escape a value for use inside a double-quoted JSON string.
# Only backslashes and double quotes are escaped.
# Arguments: $1 - raw text
# Outputs:   the escaped text, without a trailing newline
#######################################
json_escape() {
  local text=${1:-}
  text=${text//\\/\\\\}
  text=${text//\"/\\\"}
  printf '%s' "$text"
}

#######################################
# Main function to combine partition data.
# Collects the QoS limits, partition load, user partitions and partition
# definitions from SLURM and prints them as a single JSON document:
#   {"username":"..","partitions":[{"partitionName":"..","qos":"..",
#     "nodeList":"..","cpusState":{..},"gresTotal":"..","gresUsed":"..",
#     "groupQoSLimit":{..}}, ...]}
# When the user has no partition listed, every partition reported by sinfo is
# used instead.
# Arguments: $1 - user name
# Outputs:   the JSON document on stdout
# Returns:   exits 1 with a message when one of the SLURM queries fails
#######################################
combine_partition_data() {
  local username=$1
  local qos_limits_list partitions_load user_partitions partitions_qos

  # Declaration and assignment are split on purpose: "local var=$(cmd)"
  # returns the status of "local" and hides a failing query, which would then
  # be formatted as if it were data.
  qos_limits_list=$(list_partitions_qos_limits) \
    || error_exit "Failed to get partitions qos limits."
  partitions_load=$(list_all_partitions_load) \
    || error_exit "Failed to get node load information."
  user_partitions=$(list_user_partitions "$username") \
    || error_exit "Failed to get user partitions."
  partitions_qos=$(list_all_partitions_qos) \
    || error_exit "Failed to get partitions qos information."

  # Pre-filter the partition records by the user's partitions. This is only a
  # coarse filter; names are compared exactly further down.
  local user_partitions_grep_list filtered_partitions_qos
  user_partitions_grep_list=$(format_grep_list "$user_partitions")
  # grep exits 1 when nothing matches, which is a valid (empty) result here.
  filtered_partitions_qos=$(
    grep -E -- "$user_partitions_grep_list" <<<"$partitions_qos" || true
  )

  # if user_partitions is empty, set it to all partitions
  if [[ -z "$user_partitions" ]]; then
    user_partitions=$(cut -d'|' -f1 <<<"$partitions_load")
  fi

  # Parse every partition record once ("name|qos" per line) instead of once
  # per user partition.
  local partition_qos
  local partition_qos_table=""
  while IFS= read -r partition_qos; do
    [[ -n "$partition_qos" ]] || continue
    partition_qos_table+="$(parse_partition_qos "$partition_qos")"$'\n'
  done <<<"$filtered_partitions_qos"

  local partitions_array_output=""
  local seen_partitions="|"
  local user_partition partition qos_name limits
  local cpus_state nodes gres gres_in_use
  local qos qos_limits load node_list gres_total gres_used
  local partition_json_output

  # Process each user partition
  while IFS= read -r user_partition; do
    # sinfo marks the default partition with a trailing "*"; compare and
    # report the bare name.
    user_partition=${user_partition%\*}
    # Skip blank names and names that were already emitted.
    if [[ -z "$user_partition" ]]; then
      continue
    fi
    if [[ "$seen_partitions" == *"|$user_partition|"* ]]; then
      continue
    fi
    seen_partitions+="$user_partition|"

    # Get QoS for the partition
    qos=""
    while IFS='|' read -r partition qos_name; do
      if [[ "$partition" == "$user_partition" ]]; then
        qos=$qos_name
        break
      fi
    done <<<"$partition_qos_table"

    # Get QoS limits for the partition
    qos_limits=""
    if [[ -n "$qos" ]]; then
      while IFS='|' read -r qos_name limits; do
        if [[ "$qos_name" == "$qos" ]]; then
          qos_limits=$(parse_qos "$limits")
          break
        fi
      done <<<"$qos_limits_list"
    fi
    if [[ -z "$qos_limits" ]]; then
      qos_limits=$(parse_qos "")
    fi

    # Get load for the partition (the first matching sinfo row wins)
    load=""
    node_list=""
    gres_total=""
    gres_used=""
    while IFS='|' read -r partition cpus_state nodes gres gres_in_use; do
      if [[ "${partition%\*}" == "$user_partition" ]]; then
        load=$(parse_cpus_state "$cpus_state")
        node_list=$nodes
        gres_total=$gres
        gres_used=$gres_in_use
        break
      fi
    done <<<"$partitions_load"
    # Without a sinfo row "cpusState" would be left empty and the whole
    # document would be invalid JSON; emit an object with empty fields.
    if [[ -z "$load" ]]; then
      load=$(parse_cpus_state "")
    fi

    # Drop the "gpu:" prefix from the generic resource descriptions.
    gres_total=${gres_total//gpu:/}
    gres_used=${gres_used//gpu:/}

    # Manually format the combined data as JSON
    partition_json_output="{"
    partition_json_output+="\"partitionName\":\"$(json_escape "$user_partition")\","
    partition_json_output+="\"qos\":\"$(json_escape "$qos")\","
    partition_json_output+="\"nodeList\":\"$(json_escape "$node_list")\","
    partition_json_output+="\"cpusState\":$load,"
    partition_json_output+="\"gresTotal\":\"$(json_escape "$gres_total")\","
    partition_json_output+="\"gresUsed\":\"$(json_escape "$gres_used")\","
    partition_json_output+="\"groupQoSLimit\":$qos_limits"
    partition_json_output+="}"

    # The separator is added only between entries, so nothing has to be
    # trimmed afterwards.
    partitions_array_output+="${partitions_array_output:+,}$partition_json_output"
  done <<<"$user_partitions"

  # Print the complete JSON output
  printf '{"username":"%s","partitions":[%s]}\n' \
    "$(json_escape "$username")" "$partitions_array_output"
}

#######################################
# Main function: validate the argument, check the user and print the report.
# Arguments: $1 - user name
# Outputs:   the JSON report on stdout; diagnostics on stderr
# Returns:   exits 0 on success, 1 on any error
#######################################
main() {
  local username=${1:-}
  if [[ -z "$username" ]]; then
    printf 'Missing username argument.\n' >&2
    exit 1
  fi
  check_user_is_registered "$username"

  combine_partition_data "$username"
  exit 0
}

# Exit on error, undefined variable, or error in pipeline
set -euo pipefail
# Execute the main function.
# NOTE(review): the INPUT_USERNAME reference below appears to double as a
# template placeholder that the server replaces textually before running the
# script, so its spelling must stay exactly as is and it must remain the first
# occurrence in the file. The value is spliced in unescaped - confirm that the
# caller validates it.
export USERNAME="${INPUT_USERNAME}"
main "$USERNAME"
+ + /{option.cpus.max} cpus,{' '} + + {option.gpus.free} + + /{option.gpus.max} gpus + ))} @@ -814,7 +835,11 @@ export function AugmentationForm({ onSubmitHandler, name }: FormProps) { Please select a slurm partition assigned for your user for submitting this job. - + + {userPartitions.isError && + `Error loading partitions: ${userPartitions.error.message}`} + {userPartitions.isLoading && `Searching user partitions...`} + )} /> diff --git a/apps/deepsirius-ui/src/components/workboard/node-component-forms/dataset-form.tsx b/apps/deepsirius-ui/src/components/workboard/node-component-forms/dataset-form.tsx index 4fef344..6f06255 100644 --- a/apps/deepsirius-ui/src/components/workboard/node-component-forms/dataset-form.tsx +++ b/apps/deepsirius-ui/src/components/workboard/node-component-forms/dataset-form.tsx @@ -36,10 +36,10 @@ import { SelectValue, } from '~/components/ui/select'; -import { slurmPartitionOptions } from '~/lib/constants'; +import { api } from '~/utils/api'; const slurmOptions = z.object({ - partition: z.enum(slurmPartitionOptions), + partition: z.string(), }); const powerSizes = ['16', '32', '64', '128', '256', '512', '1024'] as const; const strategies = ['uniform'] as const; @@ -101,6 +101,7 @@ function useDatasetForm( } export function DatasetForm({ onSubmitHandler, name, data }: FormProps) { + const userPartitions = api.job.userPartitions.useQuery(); const form = useDatasetForm(name, data); const { fields, append, remove } = useFieldArray({ name: 'data', @@ -405,16 +406,35 @@ export function DatasetForm({ onSubmitHandler, name, data }: FormProps) { render={({ field }) => ( Slurm Partition - - {slurmPartitionOptions.map((item) => ( - - {item} + {userPartitions.data?.partitions.map((option) => ( + + + {option.partition} + + + + {option.cpus.free} + + /{option.cpus.max} cpus,{' '} + + {option.gpus.free} + + /{option.gpus.max} gpus + ))} @@ -423,7 +443,11 @@ export function DatasetForm({ onSubmitHandler, name, data }: FormProps) { Please select a slurm 
partition assigned for your user for submitting this job. - + + {userPartitions.isError && + `Error loading partitions: ${userPartitions.error.message}`} + {userPartitions.isLoading && `Searching user partitions...`} + )} /> diff --git a/apps/deepsirius-ui/src/components/workboard/node-component-forms/finetune-form.tsx b/apps/deepsirius-ui/src/components/workboard/node-component-forms/finetune-form.tsx index 75356cf..6ecaca4 100644 --- a/apps/deepsirius-ui/src/components/workboard/node-component-forms/finetune-form.tsx +++ b/apps/deepsirius-ui/src/components/workboard/node-component-forms/finetune-form.tsx @@ -22,10 +22,11 @@ import { SelectValue, } from '~/components/ui/select'; import { Switch } from '~/components/ui/switch'; -import { slurmGPUOptions, slurmPartitionOptions } from '~/lib/constants'; +import { slurmGPUOptions } from '~/lib/constants'; +import { api } from '~/utils/api'; const slurmOptions = z.object({ - partition: z.enum(slurmPartitionOptions), + partition: z.string(), nGPU: z.enum(slurmGPUOptions), }); @@ -91,6 +92,7 @@ const lossOpts: FormFieldItems = [ ]; export function FinetuneForm({ onSubmitHandler }: FormProps) { + const userPartitions = api.job.userPartitions.useQuery(); const form = useFinetuneForm(); const onSubmit = () => { @@ -263,20 +265,35 @@ export function FinetuneForm({ onSubmitHandler }: FormProps) { render={({ field }) => ( Slurm Partition - - Slurm Partition - } - /> + - {slurmPartitionOptions.map((item) => ( - - {item} + {userPartitions.data?.partitions.map((option) => ( + + + {option.partition} + + + + {option.cpus.free} + + /{option.cpus.max} cpus,{' '} + + {option.gpus.free} + + /{option.gpus.max} gpus + ))} @@ -285,7 +302,11 @@ export function FinetuneForm({ onSubmitHandler }: FormProps) { Please select a slurm partition assigned for your user for submitting this job. 
- + + {userPartitions.isError && + `Error loading partitions: ${userPartitions.error.message}`} + {userPartitions.isLoading && `Searching user partitions...`} + )} /> diff --git a/apps/deepsirius-ui/src/components/workboard/node-component-forms/inference-form.tsx b/apps/deepsirius-ui/src/components/workboard/node-component-forms/inference-form.tsx index 75526ee..0b26df8 100644 --- a/apps/deepsirius-ui/src/components/workboard/node-component-forms/inference-form.tsx +++ b/apps/deepsirius-ui/src/components/workboard/node-component-forms/inference-form.tsx @@ -24,10 +24,11 @@ import { SelectValue, } from '~/components/ui/select'; import { Switch } from '~/components/ui/switch'; -import { slurmGPUOptions, slurmPartitionOptions } from '~/lib/constants'; +import { slurmGPUOptions } from '~/lib/constants'; +import { api } from '~/utils/api'; const slurmOptions = z.object({ - partition: z.enum(slurmPartitionOptions), + partition: z.string(), nGPU: z.enum(slurmGPUOptions), }); @@ -92,6 +93,7 @@ export function InferenceForm({ outputDir, inputImages, }: InferenceFormProps) { + const userPartitions = api.job.userPartitions.useQuery(); const form = useInferenceForm(outputDir, inputImages); const { fields, append, remove } = useFieldArray({ name: 'inputImages', @@ -296,20 +298,35 @@ export function InferenceForm({ render={({ field }) => ( Slurm Partition - - Slurm Partition - } - /> + - {slurmPartitionOptions.map((item) => ( - - {item} + {userPartitions.data?.partitions.map((option) => ( + + + {option.partition} + + + + {option.cpus.free} + + /{option.cpus.max} cpus,{' '} + + {option.gpus.free} + + /{option.gpus.max} gpus + ))} @@ -318,7 +335,11 @@ export function InferenceForm({ Please select a slurm partition assigned for your user for submitting this job. 
- + + {userPartitions.isError && + `Error loading partitions: ${userPartitions.error.message}`} + {userPartitions.isLoading && `Searching user partitions...`} + )} /> diff --git a/apps/deepsirius-ui/src/components/workboard/node-component-forms/network-form.tsx b/apps/deepsirius-ui/src/components/workboard/node-component-forms/network-form.tsx index 7d546be..5f76fde 100644 --- a/apps/deepsirius-ui/src/components/workboard/node-component-forms/network-form.tsx +++ b/apps/deepsirius-ui/src/components/workboard/node-component-forms/network-form.tsx @@ -22,10 +22,11 @@ import { SelectTrigger, SelectValue, } from '~/components/ui/select'; -import { slurmGPUOptions, slurmPartitionOptions } from '~/lib/constants'; +import { slurmGPUOptions } from '~/lib/constants'; +import { api } from '~/utils/api'; const slurmOptions = z.object({ - partition: z.enum(slurmPartitionOptions), + partition: z.string(), nGPU: z.enum(slurmGPUOptions), }); @@ -122,6 +123,7 @@ export function NetworkForm({ networkTypeName, onSubmitHandler, }: NetworkFormProps) { + const userPartitions = api.job.userPartitions.useQuery(); const form = useNetworkForm({ networkUserLabel, networkTypeName }); const onSubmit = () => { @@ -324,20 +326,35 @@ export function NetworkForm({ render={({ field }) => ( Slurm Partition - - Slurm Partition - } - /> + - {slurmPartitionOptions.map((item) => ( - - {item} + {userPartitions.data?.partitions.map((option) => ( + + + {option.partition} + + + + {option.cpus.free} + + /{option.cpus.max} cpus,{' '} + + {option.gpus.free} + + /{option.gpus.max} gpus + ))} @@ -346,7 +363,11 @@ export function NetworkForm({ Please select a slurm partition assigned for your user for submitting this job. 
- + + {userPartitions.isError && + `Error loading partitions: ${userPartitions.error.message}`} + {userPartitions.isLoading && `Searching user partitions...`} + )} /> diff --git a/apps/deepsirius-ui/src/lib/constants.ts b/apps/deepsirius-ui/src/lib/constants.ts index f7649bd..972159f 100644 --- a/apps/deepsirius-ui/src/lib/constants.ts +++ b/apps/deepsirius-ui/src/lib/constants.ts @@ -1,20 +1,3 @@ -export const slurmPartitionOptions = [ - 'cpu', - 'mnc', - 'imb', - 'mgn', - 'mgn-staff', - 'cnb', - 'cat', - 'ipe', - 'diff', - 'tepui1', - 'tepui2', - 'dev', - 'power', - // 'fake', -] as const; - export const slurmGPUOptions = ['1', '2', '4'] as const; export const checkStatusRefetchInterval = 30000; // 30 seconds diff --git a/apps/deepsirius-ui/src/lib/schemas/user-partitions.ts b/apps/deepsirius-ui/src/lib/schemas/user-partitions.ts new file mode 100644 index 0000000..bcc9ff7 --- /dev/null +++ b/apps/deepsirius-ui/src/lib/schemas/user-partitions.ts @@ -0,0 +1,29 @@ +import { z } from "zod"; + +const cpusStateSchema = z.object({ + allocated: z.string(), + idle: z.string(), + other: z.string(), + total: z.string(), +}); + +const groupQoSLimitSchema = z.object({ + cpu: z.string().optional(), + gpu: z.string().optional(), + mem: z.string().optional(), +}); + +const partitionSchema = z.object({ + partitionName: z.string(), + qos: z.string(), + nodeList: z.string(), + cpusState: cpusStateSchema, + gresTotal: z.string(), + gresUsed: z.string(), + groupQoSLimit: groupQoSLimitSchema, +}); + +export const userPartitionsResponseSchema = z.object({ + username: z.string(), + partitions: z.array(partitionSchema), +}); \ No newline at end of file diff --git a/apps/deepsirius-ui/src/server/api/routers/job.ts b/apps/deepsirius-ui/src/server/api/routers/job.ts index a152778..fabcd88 100644 --- a/apps/deepsirius-ui/src/server/api/routers/job.ts +++ b/apps/deepsirius-ui/src/server/api/routers/job.ts @@ -1,6 +1,24 @@ import { TRPCError } from '@trpc/server'; import { z } from 'zod'; 
import { createTRPCRouter, protectedSSHProcedure } from '~/server/api/trpc'; +import { userPartitionsResponseSchema } from '~/lib/schemas/user-partitions'; +import fs from 'fs/promises'; + +// from the query format State,Submit,Start,End,Elapsed,Partition,NodeList,AllocGRES,NCPUS,Reason,ExitCode +const reportSacctFormatSchema = z.object({ + state: z.string(), + submit: z.string().optional(), + start: z.string().optional(), + end: z.string().optional(), + elapsed: z.string().optional(), + partition: z.string().optional(), + nodeList: z.string().optional(), + allocGRES: z.string().optional(), + nCPUS: z.string().optional(), + reason: z.string().optional(), + exitCode: z.string().optional(), +}); + export const jobRouter = createTRPCRouter({ checkStatus: protectedSSHProcedure @@ -64,4 +82,155 @@ export const jobRouter = createTRPCRouter({ return { cancelStatus: 'CANCELLED' }; }), + report: protectedSSHProcedure + .input( + z.object({ + jobId: z.string(), + }), + ) + .query(async ({ ctx, input }) => { + const jobId = input.jobId; + + const connection = ctx.connection; + + const command = `sacct --format="State,Submit,Start,End,Elapsed,Partition,NodeList,AllocGRES,NCPUS,Reason,ExitCode" --parsable2 --job ${jobId} --noheader`; + const { stdout, stderr } = await connection.execCommand(command); + + if (stderr) { + throw new TRPCError({ + code: "INTERNAL_SERVER_ERROR", + message: stderr, + }); + } + + const data = stdout.trim(); + if (data.length === 0) { + throw new TRPCError({ + code: "NOT_FOUND", + message: "Job data not found", + }); + } + + const firstline = data.split("\n")[0]; + if (!firstline) { + throw new TRPCError({ + code: "INTERNAL_SERVER_ERROR", + message: `Error parsing job data for jobId ${jobId}`, + }); + } + + const [ + state, + submit, + start, + end, + elapsed, + partition, + nodeList, + allocGRES, + nCPUS, + reason, + exitCode, + ] = data.split("|"); + + const report = reportSacctFormatSchema.safeParse({ + state: state?.split(" ")[0] ?? 
state, // the state comes with a suffix that we don't need, so we split it and get the first part + submit, + start, + end, + elapsed, + partition, + nodeList, + allocGRES, + nCPUS, + reason, + exitCode, + }); + + if (report.error) { + throw new TRPCError({ + code: "INTERNAL_SERVER_ERROR", + message: "Error parsing job data:" + report.error.message, + }); + } + + return { + ...report.data + }; + }), + userPartitions: protectedSSHProcedure.query(async ({ ctx }) => { + const connection = ctx.connection; + + const templatePath = "public/templates/user-partitions.sh"; + const scriptTemplate = await fs.readFile(templatePath, "utf-8"); + + const content = scriptTemplate.replace( + "${INPUT_USERNAME}", + ctx.username, + ); + + const { stdout, stderr } = await connection.execCommand(content); + + if (stderr) { + throw new Error(stderr); + } + + if (stdout.trim().length === 0) { + throw new Error("Empty response from user partitions script."); + } + + const parsed = userPartitionsResponseSchema.safeParse( + JSON.parse(stdout.trim()), + ); + if (parsed.error) { + throw new Error(parsed.error.message); + } + + function parseNumberOrUndefined(value: string | null | undefined) { + if (value === null || value === undefined) { + return undefined; + } + if (value === "null") { + return undefined; + } + const parsed = parseInt(value, 10); + if (isNaN(parsed)) { + return undefined; + } + return parsed; + } + + const partitions = parsed.data.partitions.map((partition) => { + // if the groupQoSLimit is set for a limit it overrides the partition limit from cpusState and total gpus from gresTotal + const maxGpus = + parseNumberOrUndefined(partition.groupQoSLimit?.gpu) ?? + parseNumberOrUndefined(partition.gresTotal) ?? + 0; + const maxCpus = + parseNumberOrUndefined(partition.groupQoSLimit?.cpu) ?? + parseNumberOrUndefined(partition.cpusState.total) ?? + 0; + const usedCpus = + parseNumberOrUndefined(partition.cpusState.allocated) ?? 
0; + const usedGpus = parseNumberOrUndefined(partition.gresUsed) ?? 0; + const freeCpus = maxCpus > 0 ? maxCpus - usedCpus : 0; + const freeGpus = maxGpus > 0 ? maxGpus - usedGpus : 0; + + return { + partition: partition.partitionName, + nodeList: partition.nodeList, + cpus: { + free: freeCpus, + max: maxCpus, + }, + gpus: { + free: freeGpus, + max: maxGpus, + }, + }; + }); + + return { partitions }; + }) }); +