From d5a9e584364394ed4b8754e0cccf4cf63a7a0ea2 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 23 Sep 2023 07:46:24 -0400 Subject: [PATCH] `benchmarks`: minor refactor - rename pat var to arg_pat - clarified that we're also compiling stats cache when doing indexed benchmarks - ensured all benchmark support data is created by checking last one created instead of first one - bump from 2.3.0 to 2.3.1 [skip ci] --- scripts/benchmarks.sh | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/scripts/benchmarks.sh b/scripts/benchmarks.sh index 75bcd2974..c78a57cf4 100755 --- a/scripts/benchmarks.sh +++ b/scripts/benchmarks.sh @@ -37,10 +37,10 @@ # It uses the following commands: apply, cat, count, luau, sample, schema, select, snappy, sort, tojsonl # and to xlsx. It's a good example of how qsv can be used to automate data preparation & analysis tasks. -pat="$1" +arg_pat="$1" # the version of this script -bm_version=2.3.0 +bm_version=2.3.1 # configurable variables --------------------------------------- # change as needed to reflect your environment/workloads @@ -136,8 +136,8 @@ function cleanup_files { rm -f extsort_sorted.csv } -# if pat is equal to "help", show usage -if [[ "$pat" == "help" ]]; then +# if arg_pat is equal to "help", show usage +if [[ "$arg_pat" == "help" ]]; then echo "Quicksilver (qsv) Benchmark Script v$bm_version" echo "" echo "Usage: ./benchmarks.sh " @@ -155,9 +155,9 @@ if [[ "$pat" == "help" ]]; then exit fi -# if pat is equal to "reset", download and prepare the benchmark data again +# if arg_pat is equal to "reset", download and prepare the benchmark data again # the results/benchmark_results.csv historical archive will be preserved -if [[ "$pat" == "reset" ]]; then +if [[ "$arg_pat" == "reset" ]]; then rm -f "$datazip" rm -f "$filestem".* rm -f communityboards.csv @@ -173,8 +173,8 @@ if [[ "$pat" == "reset" ]]; then exit fi -# if pat is equal to "clean", clean up temporary files -if [[ "$pat" == "clean" ]]; then +# if arg_pat is equal to "clean", clean up temporary files +if [[ "$arg_pat" == "clean" ]]; then cleanup_files echo "> Temporary files cleaned up..." exit @@ -204,7 +204,7 @@ if [ ! -r communityboards.csv ]; then echo "" fi -if [ ! -r data_to_exclude.csv ]; then +if [ ! -r seachset_patterns.txt ]; then echo "> Preparing benchmark support data..." # create an index so benchmark data preparation commands can run faster "$qsv_bin" index "$data" @@ -263,7 +263,7 @@ function run { local name="$1" shift - if [[ "$name" == *"$pat"* ]]; then + if [[ "$name" == *"$arg_pat"* ]]; then if [ -z "$index" ]; then commands_without_index_name+=("$name") add_command "without_index" "$@" @@ -276,7 +276,7 @@ function run { # --------------------------------------- # Queue commands for benchmarking -# commands with an --index prefix will be benchmarked with an index +# commands with an --index prefix will be benchmarked with an index and a stats cache # template: run run apply_calcconv "$qsv_bin apply calcconv --formatstr \"{Unique Key} meters in miles\" --new-column new_col $data" @@ -460,9 +460,11 @@ for command_no_index in "${commands_without_index[@]}"; do done # --------------------------------------- -# then, run benchmarks with an index +# then, run benchmarks with an index and stats cache # an index enables random access and unlocks multi-threading in several commands -echo "> Benchmarking WITH INDEX..." +# the stats cache enables faster stats computation as it will use the cached stats +# when its valid and available, instead of computing the stats from scratch +echo "> Benchmarking WITH INDEX and STATS CACHE..." if [ "$with_index_count" -gt 0 ]; then echo " Preparing index and stats cache..." @@ -492,7 +494,7 @@ done # --------------------------------------- # Finalize benchmark results. Sort the latest results by version, tstamp & name. # compute and add records per second for each benchmark using qsv's luau command. -# We compute recs_per_sec by dividing 1M (the number of rows in NYC 311 sample data) +# We compute recs_per_sec by dividing the number of rows in the benchmark data # by the mean run time of the three runs. # We then append/concatenate the latest results to benchmark_results.csv - which is # a historical archive, so we can track performance over multiple releases. @@ -503,7 +505,7 @@ echo "" -o results/results_work.csv # compute records per second for each benchmark using luau by dividing rowcount by mean -# we then round the result to a whole number and format with commas for readability +# we then round the result to a whole number luau_cmd="recs_per_sec=( $rowcount / mean); return tonumber(string.format(\"%.0f\",recs_per_sec))" "$qsv_bin" luau map recs_per_sec "$luau_cmd" results/results_work.csv -o results/latest_results.csv @@ -557,7 +559,7 @@ if [ ! -f "results/run_info_history.tsv" ]; then fi # append the run info to latest_run_info.csv -echo -e "$version\t$now\t$now_sec\t$bm_version\t$platform\t$num_cores\t$mem_size\t$qsv_bin\t$kind\t$pat\t$total_count\t$wo_index_count\t$with_index_count\t$warmup_runs\t$benchmark_runs\t$elapsed\t$qsv_envvars\t$raw_version" >>results/latest_run_info.tsv +echo -e "$version\t$now\t$now_sec\t$bm_version\t$platform\t$num_cores\t$mem_size\t$qsv_bin\t$kind\t$arg_pat\t$total_count\t$wo_index_count\t$with_index_count\t$warmup_runs\t$benchmark_runs\t$elapsed\t$qsv_envvars\t$raw_version" >>results/latest_run_info.tsv # now update the run_info_history.tsv "$qsv_bin" cat rowskey results/latest_run_info.tsv results/run_info_history.tsv \