-
Notifications
You must be signed in to change notification settings - Fork 5
/
find-best-partition
163 lines (125 loc) · 4.33 KB
/
find-best-partition
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/bin/bash
# find-best-partition: helper around Slurm's scontrol/sbatch.
#   -o set    record the partitions the current user's groups may use
#   -o check  report the estimated wait time of a job on each of them
#   -o help   print the usage lines
#   -f FILE   Slurm submission script

# Group name that is skipped when the user's groups are compared against a
# partition's AllowGroups entry (see the "set" mode).
excludeGrp=rc_admin

# Nothing below works without the Slurm client tools, so stop right away
# when sinfo cannot be located.
if ! hash sinfo 2>/dev/null; then
  echo " --- Error: Slurm is not available in this node."
  exit 1
fi
echo " "
# Command-line parsing:
#   -o MODE  stored in optU (the modes handled below are help, set and check)
#   -f FILE  stored in optF (path of the Slurm submission script)
# Options getopts does not recognise are reported by getopts itself and
# otherwise ignored here.
while getopts o:f: opt; do
  case "$opt" in
    o) optU=$OPTARG ;;
    f) optF=$OPTARG ;;
  esac
done
# "-o help": print one usage line per supported mode.
if [[ "$optU" == help ]]; then
  printf '%s\n' \
    " --- Find and print Slurm configuration: sh find-best-partition -f submit.sh -o set" \
    " --- Check for the best Slurm partition: sh find-best-partition -f submit.sh -o check"
fi
# Slurm submission script named with -f; the "check" mode further down reads it.
submissionScript=$optF

# isGroupAllowed ALLOW_LIST GROUP
#   ALLOW_LIST  value of a partition's AllowGroups field: either the literal
#               ALL or a comma-separated list of group names
#   GROUP       one group name of the current user
# Returns 0 when GROUP may use the partition, 1 otherwise. Names are compared
# as whole list entries, so "rc" does not match "rc_admin" and a list entry
# that merely starts with "ALL" is not treated as the ALL wildcard.
isGroupAllowed() {
  local allowList=$1
  local grp=$2
  [[ -n $grp ]] || return 1
  [[ $allowList == ALL ]] && return 0
  [[ ",$allowList," == *",$grp,"* ]]
}

# "-o set": write the names of all partitions the current user may submit to
# into tmpwdir/allowedParts.txt, one name per line and each name only once.
if [[ $optU = set ]]; then
  if [[ ! $optF ]]; then
    echo " --- Specify Slurm submission script with the -f option."
    exit 1
  fi

  # Start from an empty working directory so stale results are never reused.
  rm -rf tmpwdir
  mkdir -p tmpwdir

  # Dump Slurm partitions and their allowed groups to a file
  scontrol show partition | grep "PartitionName\|AllowGroups" > tmpwdir/slurmPartInfo.txt

  # Sanity check: every PartitionName line is expected to come with exactly
  # one AllowGroups line, i.e. the dump should hold two lines per partition.
  numPar=$(grep -c "PartitionName" tmpwdir/slurmPartInfo.txt)
  nL=$(wc -l < tmpwdir/slurmPartInfo.txt)
  if [ $((nL/2)) -ne "$numPar" ]; then
    echo " --- There is error with number of SLURM partitions."
    echo " --- Number of actual partitions: $numPar"
    echo " --- Number of partitions in the file: $((nL/2))"
  fi

  # Groups associated with the user (space separated)
  grps=$(groups)

  # Walk through the dump line by line: remember the most recent partition
  # name and evaluate it when its AllowGroups line shows up. This does not
  # depend on the lines arriving in strict pairs.
  parName=
  while IFS= read -r line; do
    case $line in
      *PartitionName=*)
        parName=${line#*PartitionName=}
        parName=${parName%% *}
        ;;
      *AllowGroups=*)
        [[ -n $parName ]] || continue
        # Keep only the AllowGroups value itself; other fields on the same
        # line must not take part in the group match.
        allowGrps=${line#*AllowGroups=}
        allowGrps=${allowGrps%% *}
        # Word splitting of $grps is intentional: one word per group name.
        for grpN in $grps; do
          if [[ $grpN != "$excludeGrp" ]] && isGroupAllowed "$allowGrps" "$grpN"; then
            echo "$parName" >> tmpwdir/allowedParts.txt
            # One matching group is enough; stopping here keeps the
            # partition from being listed once per matching group.
            break
          fi
        done
        ;;
    esac
  done < tmpwdir/slurmPartInfo.txt

  rm -f tmpwdir/slurmPartInfo.txt
fi
# findPartitionLine FILE
# Prints the line number of the first "#SBATCH" directive in FILE that selects
# a partition and returns 0; prints nothing and returns 1 when there is none.
# The short form (-p NAME) is looked for first, then the long form
# (--partition NAME or --partition=NAME). Only lines starting with "#SBATCH"
# are considered, so commands such as "mkdir -p" or "srun -p" in the script
# body are never mistaken for the partition directive.
findPartitionLine() {
  local script=$1
  local num
  num=$(grep -n -E '^#SBATCH[[:space:]](.*[[:space:]])?-p[[:space:]]' -- "$script" \
    | head -n 1 | cut -d':' -f1)
  if [[ -z $num ]]; then
    num=$(grep -n -E '^#SBATCH[[:space:]](.*[[:space:]])?--partition[=[:space:]]' -- "$script" \
      | head -n 1 | cut -d':' -f1)
  fi
  [[ -n $num ]] || return 1
  printf '%s\n' "$num"
}

# "-o check": for every partition listed in tmpwdir/allowedParts.txt, write a
# copy of the submission script that targets this partition, ask
# "sbatch --test-only" when the job would start, and print the partitions
# sorted by waiting time in seconds.
if [[ $optU = check ]]; then
  if [[ ! $optF ]]; then
    echo " --- Specify Slurm submission script with the -f option."
    exit 1
  fi
  if [[ ! -r $submissionScript ]]; then
    echo " --- Error: Cannot read Slurm submission script: $submissionScript"
    exit 1
  fi

  # Build the partition list first when it is missing. The script re-runs
  # itself through bash and "$0" because it relies on bash-only syntax and
  # may have been started from another directory.
  if [ ! -e tmpwdir/allowedParts.txt ]; then
    echo " --- First running: sh find-best-partition -f $submissionScript -o set"
    bash "$0" -f "$submissionScript" -o set
  fi
  if [ ! -e tmpwdir/allowedParts.txt ]; then
    echo " --- Error: No allowed Slurm partitions found."
    exit 1
  fi

  # Line of the submission script that holds the partition directive
  lineN=$(findPartitionLine "$submissionScript")
  if [[ -z $lineN ]]; then
    echo " --- Error: Specifiy a default partition in Slurm submission script."
    exit 1
  fi

  rm -f tmpwdir/result.txt tmpwdir/error.log
  echo " "
  errorSt=0

  # Loop over each allowed Slurm partition. awk drops repeated names while
  # keeping the order; word splitting is intentional (one name per word).
  for i in $(awk '!seen[$0]++' tmpwdir/allowedParts.txt); do
    # Copy of the submission script with the partition line swapped.
    # NOTE: the whole line is replaced, so other options written on the same
    # "#SBATCH" line are not carried over.
    subSName=tmpwdir/slurm_$i.sh
    sed "${lineN}s/.*/#SBATCH -p $i/" "$submissionScript" > "$subSName"
    tmpF=${subSName/.sh}_tmp.txt
    timCurrent=$(date +%s)

    # run sbatch with --test-only to get time
    sbatch --test-only "$subSName" > "$tmpF" 2>&1
    sbOut=$(cat "$tmpF")
    if [[ $sbOut = *"error"* ]] || [[ $sbOut = *"failure"* ]]; then
      echo " --- Error using partition: $i"
      echo " " >> tmpwdir/error.log
      echo " --- Error using partition: $i" >> tmpwdir/error.log
      cat "$tmpF" >> tmpwdir/error.log
      errorSt=1
      continue
    fi

    # Presumably the 7th space-separated field of the sbatch message is the
    # start time written as <date>T<time>; the "T" is turned into a blank so
    # that date -d can parse it. TODO confirm against the installed Slurm.
    swT=$(cut -d' ' -f7 "$tmpF")
    swT=${swT/T/" "}
    timEp=$(date -d "$swT" +%s 2>/dev/null)
    # An unparsable time leaves timEp empty; counting it as 0 gives a
    # negative difference, which is reported as an error just below.
    timeDiff=$(( ${timEp:-0} - timCurrent ))
    if [[ $timeDiff -lt 0 ]]; then
      echo " --- Error using partition: $i"
      echo " " >> tmpwdir/error.log
      echo " --- Error using partition: $i" >> tmpwdir/error.log
      cat "$tmpF" >> tmpwdir/error.log
      echo "Current Time: $timCurrent, Partition Run Time: $timEp, Diff: $timeDiff" >> tmpwdir/error.log
      errorSt=1
      continue
    fi
    echo " --- $timeDiff: $i" >> tmpwdir/result.txt
  done

  if [ "$errorSt" = 1 ]; then
    echo " --- Check tmpwdir/error.log for error log"
  fi
  echo " "
  echo " --- Waiting time to run this job on SLURM partitions sorted by time (sec)"
  # result.txt only exists when at least one partition produced an estimate.
  if [ -e tmpwdir/result.txt ]; then
    sort -k2 -n tmpwdir/result.txt
  fi

  # Remove temp files
  rm -f tmpwdir/slurm_*.txt tmpwdir/result.txt
  echo " "
  echo " --- Find SLURM submission scripts inside tmpwdir/ folder"
fi