-
Notifications
You must be signed in to change notification settings - Fork 4
/
run_stage_inr.sh
executable file
·96 lines (91 loc) · 1.9 KB
/
run_stage_inr.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/bin/bash
# Default settings. Each of these can be replaced by a command-line option
# handled in the option loop further down in this script.

# Job topology.
NNODES=1            # forwarded as --nnodes
NPROC_PER_NODE=1    # forwarded as --nproc_per_node
NODE_RANK=0         # forwarded as --node_rank
MASTER_ADDR=master  # base host name; only used when NNODES is greater than 1

# Run parameters.
SEED=0              # forwarded as --seed
EPOCH_TEST=-1       # forwarded as -e

# Space-separated list of extra flags appended to the final command line.
OPTS=
# Consume the positional parameters one at a time.
#   key=value options overwrite the matching setting variable;
#   every other argument (the known bare flags as well as anything
#   unrecognised) is appended to OPTS, separated by a single space.
# When the loop ends no positional parameters are left.
while [ "$#" -gt 0 ]; do
  arg=$1
  shift
  case "${arg}" in
    -nn=* | --n-nodes=*)
      NNODES=${arg#*=}
      ;;
    -np=* | --nproc-per-node=*)
      NPROC_PER_NODE=${arg#*=}
      ;;
    -nr=* | --node-rank=*)
      NODE_RANK=${arg#*=}
      ;;
    --master=*)
      MASTER_ADDR=${arg#*=}
      ;;
    -m=*)
      MODEL_CONFIG=${arg#*=}
      ;;
    -r=*)
      OUTPUT=${arg#*=}
      ;;
    -p=*)
      POSTFIX=${arg#*=}
      ;;
    -e=*)
      EPOCH_TEST=${arg#*=}
      ;;
    -s=*)
      SEED=${arg#*=}
      ;;
    --eval | --resume | --resume-lr-reduction | --fp16_compress)
      # Known bare flags: forwarded unchanged.
      OPTS+=" ${arg}"
      ;;
    *)
      # Anything else is forwarded unchanged as well.
      OPTS+=" ${arg}"
      ;;
  esac
done
# Decide which address is handed to the launcher as --master_addr.
#   NNODES > 1 : "<MASTER_ADDR>.<TASK_GROUP_NAME>", followed by a 60 second
#                pause (presumably to let the other nodes come up - TODO confirm).
#   otherwise  : always "localhost"; any --master value is discarded.
# An unset or empty NNODES is treated as 1, so the single-node branch is
# taken without `[` printing an "integer expression expected" error.
if [ "${NNODES:-1}" -gt 1 ]; then
  MASTER_ADDR="${MASTER_ADDR}.${TASK_GROUP_NAME}"
  sleep 60
else
  MASTER_ADDR=localhost
fi
# Print the chosen address exactly as stored: no word splitting or globbing,
# and no risk of echo treating the value as one of its own options.
printf '%s\n' "${MASTER_ADDR}"
# Run ./src/main_stage_inr.py through `python -m torch.distributed.launch`.
# The options before the script path are given to the launcher module; the
# options after it are given to the script. The master port is fixed at 2901.
#
# Every value is quoted so that a path or name containing whitespace or glob
# characters reaches the program as a single argument.
# ${OPTS} is the one exception: it is a space-joined list of flags and must be
# word-split back into separate arguments.
# shellcheck disable=SC2086
python -m torch.distributed.launch \
  --nproc_per_node="${NPROC_PER_NODE}" \
  --nnodes="${NNODES}" \
  --master_addr="${MASTER_ADDR}" \
  --master_port=2901 \
  --node_rank="${NODE_RANK}" \
  ./src/main_stage_inr.py \
  -m="${MODEL_CONFIG}" \
  -r="${OUTPUT}" \
  -e="${EPOCH_TEST}" \
  -p="${POSTFIX}" \
  --nproc_per_node="${NPROC_PER_NODE}" \
  --nnodes="${NNODES}" \
  --node_rank="${NODE_RANK}" \
  --seed="${SEED}" \
  ${OPTS}