diff --git a/.gitmodules b/.gitmodules
index 9e7334c7..76d73b8a 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,4 @@
 [submodule "benchmarks"]
 	path = benchmarks
-	url = http://github.com/lambdal/benchmarks
+	url = https://github.com/tensorflow/benchmarks.git
+	branch = master
diff --git a/README.md b/README.md
index 8614e1b3..c42c0f34 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ This is the code used for a few of the blog posts on: https://lambdalabs.com/blog
 
 Environment:
 - OS: Ubuntu 18.04
-- TensorFlow version: 1.14.0
+- TensorFlow version: 1.15.3
 - CUDA Version 10.0
 - CUDNN Version 7.6.2
 
@@ -19,15 +19,47 @@ git clone https://github.com/lambdal/lambda-tensorflow-benchmark.git
 
 #### Step Two: Run benchmark with thermal profile
 ```
-TF_XLA_FLAGS=--tf_xla_auto_jit=2 ./batch_benchmark.sh min_num_gpus max_num_gpus num_runs num_batches_per_run thermal_sampling_frequency
+./benchmark.sh -l <min_num_gpus> -h <max_num_gpus> -n <num_runs> -b <num_batches_per_run> -t <thermal_sampling_frequency>
 
 python display_thermal.py path-to-thermal.log --thermal_threshold <thermal_threshold>
 
-# example of benchmarking 4 2080_Ti (all used), 1 run, 200 batches per run, measuring thermal every 2 second. 2080_Ti throttles at 89 C.
-TF_XLA_FLAGS=--tf_xla_auto_jit=2 ./batch_benchmark.sh 4 4 1 200 2
-python display_thermal.py i9-7920X-GeForce_RTX_2080_Ti.logs/resnet152-syn-replicated-fp32-4gpus-32-1-thermal.log --thermal_threshold 89
+# example of benchmarking 4 2080_Ti (all used), 1 run, 100 batches per run, measuring thermal every 2 seconds. 2080_Ti throttles at 89 C.
+./benchmark.sh -l 4 -h 4 -n 1 -b 100 -t 2 -c config_resnet50_replicated_fp32_train_syn
+python display_thermal.py path-to-thermal/1 --thermal_threshold 89
 ```
 
+#### AMD
+
+Follow the setup guidance [here](https://github.com/ROCmSoftwarePlatform/tensorflow-upstream), then:
+
+```
+alias drun='sudo docker run \
+      -it \
+      --network=host \
+      --device=/dev/kfd \
+      --device=/dev/dri \
+      --ipc=host \
+      --shm-size 16G \
+      --group-add video \
+      --cap-add=SYS_PTRACE \
+      --security-opt seccomp=unconfined \
+      -v $HOME/dockerx:/dockerx'
+
+drun rocm/tensorflow:latest
+
+apt install rocm-libs hipcub miopen-hip
+pip3 install --user tensorflow-rocm --upgrade
+pip3 install tensorflow
+
+cd /home/dockerx
+git clone https://github.com/lambdal/lambda-tensorflow-benchmark.git --recursive
+cd lambda-tensorflow-benchmark
+git checkout tf2
+git submodule update --init --recursive
+
+./benchmark.sh -l 1 -h 1 -n 1 -b 100 -t 2 -c config_resnet50_replicated_fp32_train_syn
+```
+
 #### Note
 
 Use large num_batches_per_run for a thorough test.
@@ -38,7 +70,7 @@ Use large num_batches_per_run for a thorough test.
 
 * Input proper gpu_indices (a comma separated list, default 0) and num_iterations (default 10)
 ```
 cd lambda-tensorflow-benchmark
-./benchmark.sh gpu_indices num_iterations
+./benchmark.sh -i <gpu_indices> -n <num_iterations>
 ```
 
 #### Step Three: Report results
@@ -46,15 +78,15 @@ cd lambda-tensorflow-benchmark
 
 * Check the repo directory for folder \<cpu\>-\<gpu\>.logs (generated by benchmark.sh)
 * Use the same num_iterations and gpu_indices for both benchmarking and reporting.
 ```
-./report.sh <cpu>-<gpu>.logs num_iterations gpu_indices
+./report.sh <cpu>-<gpu>.logs
 ```
 
 #### Batch process:
 ```
-TF_XLA_FLAGS=--tf_xla_auto_jit=2 ./batch_benchmark.sh min_num_gpus max_num_gpus num_iterations
+TF_XLA_FLAGS=--tf_xla_auto_jit=2 ./benchmark.sh -l <min_num_gpus> -h <max_num_gpus> -n <num_iterations>
 
-./batch_report.sh <cpu>-<gpu>.logs min_num_gpus max_num_gpus num_iterations
+./report.sh <cpu>-<gpu>.logs
 
 ./gether.sh
 ```
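Since the positional interface is gone, here is the old README example mapped onto the new flags, as a sketch (flag meanings taken from parse_opts in benchmark.sh below; the -c config is the file added by this diff):

```
# Old: TF_XLA_FLAGS=--tf_xla_auto_jit=2 ./batch_benchmark.sh 4 4 1 200 2
#      (min GPUs, max GPUs, runs, batches per run, thermal interval in seconds)
# New: the same run via getopts flags; -c names a config sourced as <name>.sh,
#      and -v can force the GPU vendor ('nvidia' or 'amd') if autodetection fails.
./benchmark.sh -l 4 -h 4 -n 1 -b 200 -t 2 -c config_resnet50_replicated_fp32_train_syn
```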
diff --git a/batch_benchmark.sh b/batch_benchmark.sh
deleted file mode 100755
index 55fdd207..00000000
--- a/batch_benchmark.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash -e
-
-MIN_NUM_GPU=${1:-1}
-MAX_NUM_GPU=${2:-1}
-
-ITERATIONS=${3:-3}
-NUM_BACHES=${4:-100}
-THERMAL_INTERVAL=${5:-1}
-
-join_by() {
-	local IFS="$1"
-	shift
-	echo "$*"
-}
-
-main() {
-	for gpu in `seq ${MAX_NUM_GPU} -1 ${MIN_NUM_GPU}`; do
-		gpus=`seq 0 1 $((gpu-1))`
-		gpus=$(join_by , $gpus)
-		./benchmark.sh $gpus $ITERATIONS $NUM_BACHES $THERMAL_INTERVAL
-	done
-}
-
-
-main "$@"
diff --git a/batch_report.sh b/batch_report.sh
deleted file mode 100755
index c961d91b..00000000
--- a/batch_report.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/bin/bash -e
-
-REPORT_DIR=$1
-
-MIN_NUM_GPU=${2:-0}
-MAX_NUM_GPU=${3:-0}
-
-ITERATIONS=${4:-3}
-
-join_by() {
-	local IFS="$1"
-	shift
-	echo "$*"
-}
-
-
-main() {
-	for gpu in `seq ${MAX_NUM_GPU} -1 ${MIN_NUM_GPU}`; do
-		gpus=`seq 0 1 $((gpu-1))`
-		gpus=$(join_by , $gpus)
-		./report.sh $REPORT_DIR $ITERATIONS $gpus
-	done
-}
-
-
-main "$@"
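For reference, the sweep the deleted wrappers performed is now internal to benchmark.sh; a minimal sketch of the old batch_benchmark.sh behavior expressed through the new interface (illustrative only, not shipped in this diff):

```
#!/bin/sh
# Old batch_benchmark.sh semantics, re-expressed with the new flags.
# benchmark.sh now runs this GPU-count sweep itself via -l/-h, so a
# wrapper like this is no longer needed.
MIN_NUM_GPU=${1:-1}
MAX_NUM_GPU=${2:-1}
ITERATIONS=${3:-3}
NUM_BATCHES=${4:-100}
THERMAL_INTERVAL=${5:-1}

for gpu in $(seq "$MAX_NUM_GPU" -1 "$MIN_NUM_GPU"); do
	# pinning -l and -h to the same value benchmarks exactly $gpu GPUs
	./benchmark.sh -l "$gpu" -h "$gpu" -n "$ITERATIONS" \
		-b "$NUM_BATCHES" -t "$THERMAL_INTERVAL"
done
```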
diff --git a/benchmark.sh b/benchmark.sh
index 46fa131f..c031df97 100755
--- a/benchmark.sh
+++ b/benchmark.sh
@@ -1,37 +1,18 @@
 #!/bin/bash -e
 
-GPU_INDEX=${1:-0}
-IFS=', ' read -r -a gpus <<< "$GPU_INDEX"
-
-ITERATIONS=${2:-100}
-NUM_BATCHES=${3:-100}
-THERMAL_INTERVAL=${4:-1}
-
-MIN_NUM_GPU=${#gpus[@]}
-MAX_NUM_GPU=$MIN_NUM_GPU
-export CUDA_VISIBLE_DEVICES=$GPU_INDEX
-
-SCRIPT_DIR="$(pwd)/benchmarks/scripts/tf_cnn_benchmarks"
-
-CPU_NAME="$(lscpu | awk '/Model name:/ {
-	if ($3" "$4 ~ "AMD Ryzen") print $6;
-	else if ($5 ~ "CPU") print $4;
-	else print $5;
-	exit
-}')"
-
-GPU_NAME="$(nvidia-smi -i 0 --query-gpu=gpu_name --format=csv,noheader 2>/dev/null || echo PLACEHOLDER )"
-GPU_NAME="${GPU_NAME// /_}"
-
-CONFIG_NAME="${CPU_NAME}-${GPU_NAME}"
-echo $CONFIG_NAME
-
-
-DATA_DIR="/home/${USER}/nfs/imagenet_mini"
-LOG_DIR="$(pwd)/${CONFIG_NAME}.logs"
+installed() {
+	command -v "$1" >/dev/null 2>&1
+}
 
-THROUGHPUT="$(mktemp)"
-echo 0 > $THROUGHPUT
+die() {
+	echo "$0: $*" 1>&2
+	exit 1
+}
 
+[ -z "$GPU_VENDOR" ] && if installed nvidia-smi; then
+	GPU_VENDOR='nvidia'
+elif installed rocm-smi; then
+	GPU_VENDOR='amd'
+fi
 
 declare -A DATASET_NAMES=(
 	[resnet50]=imagenet
@@ -51,11 +32,65 @@
 if git submodule status | grep -q ^-; then
 	git submodule update --init --recursive
 fi
 
+# list GPUs by their model name
+lsgpu() {
+	case $GPU_VENDOR in
+	"nvidia") nvidia-smi --query-gpu=gpu_name --format=csv,noheader;;
+	"amd") rocm-smi --showproductname | awk -F'\t' '/Card series/ { print $5 }';;
+	esac
+}
+
 gpu_ram() {
 	# Prints all GPUs' memory in GB
-	nvidia-smi --query-gpu=memory.total --format=csv,noheader | awk '{ printf "%.0f\n", $1 / 1000 }' | head -n1
-	# head -n1 becuase we're assuming all GPUs have the same capacity.
-	# It might be interesting to explore supporting different GPUs in the same machine but not right now
+	if [ $GPU_VENDOR = nvidia ]; then
+		nvidia-smi --query-gpu=memory.total --format=csv,noheader |
+			awk '{ printf "%.0f\n", $1 / 1024 }'
+		# NVidia-SMI reports in MiB.
+		# 1GB = 953.674MiB
+		# 2070 Max-Q      advertised: 8GB  - NVidia-SMI: 7,982MiB  or 8.4GB  or 7.8GiB
+		# GTX Titan       advertised: 12GB - NVidia-SMI: 12,212MiB or 12.8GB or 11.9GiB
+		# Titan RTX       advertised: 24GB - NVidia-SMI: 24,219MiB or 25.4GB or 23.7GiB
+		# Quadro RTX 8000 advertised: 48GB - NVidia-SMI: 48,601MiB or 51.0GB or 47.5GiB
+
+		# awk 'END {printf "%.0f\n", 0.49 }' = 0
+		# awk 'END {printf "%.0f\n", 0.5 }'  = 1
+		# awk 'END {print int(0.49) }'       = 0
+		# awk 'END {print int(0.5) }'        = 0
+	else
+		rocm-smi --showmeminfo vram --csv | sed '/^$/d' |
+			awk -F, 'NR!=1 { printf "%.0f\n", $2 / (1024^3) }'
+	fi | head -n1 # because we're assuming all GPUs have the same capacity.
+}
+
+# returns the appropriate batch size for the model in $1
+# - takes current precision into account
+batch_size() {
+	case "${GPU_RAM}" in
+	'6GB') multiplier=6;;
+	'8GB') multiplier=8;;
+	# 11GB for 2080Ti
+	'11GB'|'12GB') multiplier=12;;
+	'16GB') multiplier=16;;
+	'24GB') multiplier=24;;
+	'32GB') multiplier=32;;
+	# 47GB for Quadro RTX
+	'47GB'|'48GB') multiplier=48;;
+	*)
+		cat 1>&2 <<- EOF
+			Batchsize for VRAM size $GPU_RAM is not optimized.
+			Try adding $GPU_RAM to the case statement in benchmark.sh.
+		EOF
+		exit 1
+		;;
+	esac
+	case "$precision" in
+	fp16) multiplier=$((multiplier * 2));;
+	*);;
+	esac
+
+	eval base=\$$1
+	unrounded=$(echo "($base) * $multiplier" | bc -l)
+	printf '%0.0f\n' "$unrounded"
 }
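To make the new batch-size arithmetic concrete: the base values in config.sh are bc expressions, the multiplier is picked from GPU_RAM, and fp16 doubles it. A standalone check of the resnet50 case (verifiable with any bc):

```
# resnet50 base is '5 + 1/3' (see config.sh below); a 12GB card maps to
# multiplier 12, reproducing the old 12GB table value of 64:
printf '%0.0f\n' "$(echo "(5 + 1/3) * 12" | bc -l)"	# 64

# fp16 doubles the multiplier, so the same card gets:
printf '%0.0f\n' "$(echo "(5 + 1/3) * 24" | bc -l)"	# 128
```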
"$(nvidia-smi \ - --query-gpu=temperature.gpu --format=csv,noheader,nounits | awk '{ printf("%s, ", $0) }')"; do + # timestamp, throughput, temp[, temp[, temp...]] + while printf "%s, %s, %s\n" "$(date +%s)" "$(cat $THROUGHPUT)" "$(gpu_temps)"; do sleep $THERMAL_INTERVAL done } @@ -113,7 +160,7 @@ run_benchmark() { pushd "$SCRIPT_DIR" &> /dev/null # Example: model=alexnet; alexnet=1536 - eval batch_size=\$$model + batch_size="$(batch_size $model)" # Example: syn-replicated-fp32-1gpus outer_dir="${data_mode}-${variable_update}-${precision}-${num_gpus}gpus" @@ -158,52 +205,123 @@ run_benchmark() { run_thermal >> $thermal_log & thermal_loop="$!" # process ID of while loop - # append timestamp to 'images/sec' value to be parsed later - # this could be replaced with an awk script - # awk '/images\/sec/ { printf("%s ", $0); system("date +%s"); next } 1 { print; system(""); }' | - # awk needs the `system("")` call because it will buffer output otherwise - python3 -u tf_cnn_benchmarks.py "${args[@]}" |& + $PYTHON -u tf_cnn_benchmarks.py "${args[@]}" |& while read line; do case "$line" in - *images/sec*) set $line; echo "$3" > "$THROUGHPUT"; echo "$line $(date +%s)";; + # Here's is an example of the line we're looking for: + # 100 images/sec: 95.8 +/- 0.0 (jitter = 0.4) 7.427 1587077440 + # + # Not to be confused with: + # total images/sec: 95.80 1587077440 + # + # We could use Awk here instead of the whileloop but getting it + # to not buffer output doesn't look pretty + # + # We append a timestamp to the line for no reason + [0-9]*images/sec*) + set $line; echo "$3" > "$THROUGHPUT"; echo "$line $(date +%s)"; + nvlink="$(nvidia-smi nvlink -s | wc -l)" + + # Timestamp,CPU Name,Motherboard,GPU Name,GPU Count,Data Mode,Run Mode,Variable Update,XLA,NVlink,Model,Precision,Batch Size,Result + echo "$(date +%s),$CPU_NAME,$BOARD_NAME,$GPU_NAME,$num_gpus,$data_mode,$run_mode,$variable_update,${TF_XLA_FLAGS##*=},$nvlink,$model,$batch_size,$3" \ + >> ${LOG_DIR}/log.csv;; *) echo "$line";; esac done | tee "$throughput_log" kill "$thermal_loop" 2>/dev/null + popd &> /dev/null } run_benchmark_all() { for model in $MODELS; do - for num_gpus in `seq ${MAX_NUM_GPU} -1 ${MIN_NUM_GPU}`; do - for iter in $(seq 1 $ITERATIONS); do - run_benchmark - done + for iter in $(seq 1 $ITERATIONS); do + run_benchmark done - done + done } +parse_opts() { + while getopts "i:l:h:n:b:c:v:t:" opt; do + case "$opt" in + i) GPU_INDEX="$OPTARG";; + l) MIN_NUM_GPU="$OPTARG";; + h) MAX_NUM_GPU="$OPTARG";; + n) ITERATIONS="$OPTARG";; + b) NUM_BATCHES="$OPTARG";; + c) SETTING="$OPTARG";; + v) GPU_VENDOR="$OPTARG";; + t) THERMAL_INTERVAL="$OPTARG";; + *) die "unrecognized option: $opt";; + esac + done + + if [ -z "$GPU_INDEX" ] && [ -z "$MAX_NUM_GPU" ]; then + MAX_NUM_GPU=$(lsgpu | wc -l) + GPU_INDEX="$(seq 0 $((MAX_NUM_GPU-1)) | paste -sd,)" + elif [ -z "$GPU_INDEX" ] && [ -n "$MAX_NUM_GPU" ]; then + GPU_INDEX="$(seq 0 $((MAX_NUM_GPU-1)) | paste -sd,)" + elif [ -n "$GPU_INDEX" ] && [ -z "$MAX_NUM_GPU" ]; then + MAX_NUM_GPU="$(echo "$GPU_INDEX" | awk -F, '{print NF}')" + elif [ "$MAX_NUM_GPU" -gt "$(echo "$GPU_INDEX" | awk -F, '{print NF}')" ]; then + die "maximum number of GPUs($MAX_NUM_GPU) higher than that allowed by GPU indicies($GPU_INDEX)" + fi # after this point, GPU_INDEX & MAX_NUM_GPU are defined + + if [ "$MAX_NUM_GPU" -gt "${MIN_NUM_GPU:="$MAX_NUM_GPU"}" ]; then + die "min number of GPUs($MIN_NUM_GPU) higher max($MAX_NUM_GPU)" + fi +} main() { - mkdir -p "$LOG_DIR" || true + ITERATIONS=100 + NUM_BATCHES=100 + THERMAL_INTERVAL=1 + 
 
 main() {
-	mkdir -p "$LOG_DIR" || true
+	ITERATIONS=100
+	NUM_BATCHES=100
+	THERMAL_INTERVAL=1
+	PYTHON=python3
+	SETTING=config
 
 	GPU_RAM="$(gpu_ram)GB"
-	. config.sh
-	metadata > "$LOG_DIR/metadata"
+	parse_opts "$@"
+
+	case $GPU_VENDOR in
+	"nvidia") export CUDA_VISIBLE_DEVICES=$GPU_INDEX;;
+	"amd") export HIP_VISIBLE_DEVICES=$GPU_INDEX;;
+	esac
+
+	GPU_NAME="$(lsgpu | sort -u)"
+	[ "$(echo "$GPU_NAME" | wc -l)" -eq 1 ] ||
+		die "refusing to run benchmark with different GPU models"
 
-	for run_mode in $RUN_MODE; do
-		for precision in $PRECISION; do
-			for data_mode in $DATA_MODE; do
-				for variable_update in $VARIABLE_UPDATE; do
-					for distortions in true false; do
-						if [ $data_mode = syn ] && $distortions; then
-							# skip distortion for synthetic data
-							:
-						else
-							run_benchmark_all
-						fi
+	CPU_NAME="$(lscpu | sed -En '/Model name:/ { s/^Model name:\s*//; s/\([^)]*\)//g; p }')"
+	BOARD_NAME="$(sed -E 's/^\s+|\s+$//g' /sys/devices/virtual/dmi/id/board_name)"
+	SCRIPT_DIR="$(pwd)/benchmarks/scripts/tf_cnn_benchmarks"
+	CONFIG_NAME="${CPU_NAME// /_}-${GPU_NAME// /_}"
+	DATA_DIR="/home/${USER}/nfs/imagenet_mini"
+	LOG_DIR="$(pwd)/${CONFIG_NAME}.logs"
+	echo $CONFIG_NAME
+	echo 0 > ${THROUGHPUT=$(mktemp)}
+
+	mkdir -p "$LOG_DIR" || true
+	. ${SETTING}".sh"
+
+	metadata > "$LOG_DIR/metadata"
+	$PYTHON -c 'import tensorflow as tf; exit(0 if tf.test.is_gpu_available() else 1)' ||
+		die "either could not import Tensorflow or tf.test.is_gpu_available() returned false - exiting"
+
+	for num_gpus in `seq ${MAX_NUM_GPU} -1 ${MIN_NUM_GPU}`; do
+		for run_mode in $RUN_MODE; do
+			for precision in $PRECISION; do
+				for data_mode in $DATA_MODE; do
+					for variable_update in $VARIABLE_UPDATE; do
+						for distortions in true false; do
+							if [ $data_mode = syn ] && $distortions; then
+								# skip distortion for synthetic data
+								:
+							else
+								run_benchmark_all
+							fi
+						done
 					done
 				done
 			done
diff --git a/benchmarks b/benchmarks
index 8459a23a..aef6daa9 160000
--- a/benchmarks
+++ b/benchmarks
@@ -1 +1 @@
-Subproject commit 8459a23af411ea79968c9af645afdad77b01eeb4
+Subproject commit aef6daa90a467a1fc7ce8395cd0067e5fda1ecff
diff --git a/config.sh b/config.sh
index 778756f8..a35e064a 100755
--- a/config.sh
+++ b/config.sh
@@ -5,60 +5,11 @@
 PRECISION="fp32 fp16"
 RUN_MODE="train inference"
 DATA_MODE="syn"
 
-case "${GPU_RAM:-'12GB'}" in
-	'6GB')
-		resnet50=32
-		resnet152=16
-		inception3=32
-		inception4=8
-		vgg16=32
-		alexnet=256
-		ssd300=16
-		;;
-	'8GB')
-		resnet50=48
-		resnet152=32
-		inception3=48
-		inception4=12
-		vgg16=48
-		alexnet=384
-		ssd300=32
-		;;
-	'12GB')
-		resnet50=64
-		resnet152=32
-		inception3=64
-		inception4=16
-		vgg16=64
-		alexnet=512
-		ssd300=32
-		;;
-	'24GB')
-		resnet50=128
-		resnet152=64
-		inception3=128
-		inception4=32
-		vgg16=128
-		alexnet=1024
-		ssd300=64
-		;;
-	'32GB')
-		resnet50=192
-		resnet152=96
-		inception3=192
-		inception4=48
-		vgg16=192
-		alexnet=1536
-		ssd300=96
-		;;
-	'48GB')
-		resnet50=256
-		resnet152=128
-		inception3=256
-		inception4=64
-		vgg16=256
-		alexnet=2048
-		ssd300=128
-		;;
-	*) echo "Batchsize for VRAM size '$GPU_RAM' not optimized" >&2;;
-esac
+# Base batch size multipliers
+  resnet50='5 + 1/3'
+ resnet152='2 + 2/3'
+inception3='5 + 1/3'
+inception4='1 + 1/3'
+     vgg16='5 + 1/3'
+   alexnet='42 + 2/3'
+    ssd300='2 + 2/3'
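The single set of multipliers replaces six hand-maintained tables, though it is not a perfect round-trip: regenerating resnet50's column shows the 6/12/24/48GB tiers match the deleted table exactly, while the 8GB and 32GB tiers now come out lower than the old hand-tuned 48 and 192. A standalone check:

```
# Regenerate resnet50's row of the deleted table from the new base value.
base='5 + 1/3'	# resnet50, from config.sh above
for m in 6 8 12 24 32 48; do	# multipliers from batch_size()
	printf '%sGB: resnet50=%0.0f\n' "$m" "$(echo "($base) * $m" | bc -l)"
done
# 6GB:32  8GB:43  12GB:64  24GB:128  32GB:171  48GB:256
```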
diff --git a/config_resnet50_replicated_fp32_train_syn.sh b/config_resnet50_replicated_fp32_train_syn.sh
new file mode 100644
index 00000000..d6e0373c
--- /dev/null
+++ b/config_resnet50_replicated_fp32_train_syn.sh
@@ -0,0 +1,15 @@
+#!/bin/sh
+MODELS="resnet50"
+VARIABLE_UPDATE="replicated"
+PRECISION="fp32"
+RUN_MODE="train"
+DATA_MODE="syn"
+
+# Base batch size multipliers
+  resnet50='5 + 1/3'
+ resnet152='2 + 2/3'
+inception3='5 + 1/3'
+inception4='1 + 1/3'
+     vgg16='5 + 1/3'
+   alexnet='42 + 2/3'
+    ssd300='2 + 2/3'
diff --git a/report.sh b/report.sh
index a195dc13..67148c9e 100755
--- a/report.sh
+++ b/report.sh
@@ -60,7 +60,7 @@ for param_dir in */; do
 	:------:|:------:|
 	$(for model_dir in *; do
 		model="$(basename $model_dir)"
-		avg="$(awk '/total images/ { s+=$3 } END { print s/(ARGC-1) }' `find $model_dir/throughput -type f`)"
+		avg="$(awk '!/total/ && /images\/sec/ { s+=$3; c++ } END { print s/c }' `find $model_dir/throughput -type f`)"
 		echo "$model | $avg |"
 	done)
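The report.sh change also alters what gets averaged: the old awk summed the single 'total images/sec' summary per file and divided by the file count (ARGC-1), while the new one averages every per-step reading and counts matches explicitly, so files with zero or multiple summary lines no longer skew the result. A self-contained comparison on fabricated logs (file names and numbers invented for the demo):

```
# Two fake throughput logs in the format tf_cnn_benchmarks emits.
mkdir -p /tmp/report-demo && cd /tmp/report-demo
printf '10 images/sec: 95.8 +/- 0.0\ntotal images/sec: 95.80\n' > run1
printf '10 images/sec: 96.2 +/- 0.0\ntotal images/sec: 96.20\n' > run2

# Old: sum the summary lines, divide by the number of files.
awk '/total images/ { s+=$3 } END { print s/(ARGC-1) }' run1 run2	# 96

# New: average all per-step lines, skipping the summaries.
awk '!/total/ && /images\/sec/ { s+=$3; c++ } END { print s/c }' run1 run2	# 96
```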