Commit

Merge pull request #1310 from jqnatividad/benchmarks-cleanup

Quicksilver Benchmarks 2.1.1
jqnatividad authored Sep 18, 2023
2 parents 17924a4 + e4f6341 commit 69d2eca
Showing 1 changed file with 82 additions and 85 deletions.
167 changes: 82 additions & 85 deletions scripts/benchmarks.sh
@@ -20,7 +20,7 @@
# it was also designed to be a useful tool for users to benchmark qsv on their own systems,
# so it can be run on hardware and workloads that reflect your requirements/environment.
#
# Make sure you're using a release-optimized `qsv`.
# If you can't use the prebuilt binaries at https://github.com/jqnatividad/qsv/releases/latest,
# build it to have at least the apply, geocode, luau, to and polars features enabled:
# i.e. `cargo build --release --locked -F feature_capable,apply,geocode,luau,to,polars` or
@@ -39,49 +39,78 @@

pat="$1"

# the version of this script
bm_version=2.1.1

# configurable variables - change as needed to reflect your environment/workloads
qsv_bin=qsv
benchmark_data_url=https://raw.githubusercontent.com/wiki/jqnatividad/qsv/files/NYC_311_SR_2010-2020-sample-1M.7z
# where to download the benchmark data compressed file - this could be a zip or 7z file
datazip=NYC_311_SR_2010-2020-sample-1M.7z
# where to store the benchmark data
data=NYC_311_SR_2010-2020-sample-1M.csv
warmup_runs=2
benchmark_runs=3
data_filename=$(basename -- "$data")
filestem="${data_filename%.*}"
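# e.g. with the default $data above, data_filename is "NYC_311_SR_2010-2020-sample-1M.csv"
# and the "%.*" expansion strips the extension, so filestem becomes "NYC_311_SR_2010-2020-sample-1M"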

# check if binaries are installed ---------
# check if qsv is installed
if ! command -v "$qsv_bin" &>/dev/null; then
echo "qsv could not be found"
echo "Please install Quicksilver (qsv) from https://qsv.dathere.com"
exit
fi

# set sevenz_bin to "7z" on Linux/Cygwin and "7zz" on macOS
if [[ "$OSTYPE" == "darwin"* ]]; then
sevenz_bin=7zz
else
sevenz_bin=7z
fi

# check if 7z is installed
if ! command -v "$sevenz_bin" &>/dev/null; then
echo "ERROR: $sevenz_bin could not be found"
echo "Please install 7-Zip v23.01 and above"
exit
fi

# check if hyperfine is installed
if ! command -v hyperfine &>/dev/null; then
echo "ERROR: hyperfine could not be found"
echo "Please install hyperfine v1.17.0 and above"
exit
fi

# qsv version metadata ----------------
# get current version of qsv
raw_version=$("$qsv_bin" --version)
version=$(echo "$raw_version" | cut -d' ' -f2 | cut -d'-' -f1)
# get target platform from version
platform=$(echo "$raw_version" | sed 's/.*(\([a-z0-9_-]*\) compiled with Rust.*/\1/')
# get qsv kind
kind=$(echo "$raw_version" | sed 's/.* \([a-zA-Z]*\)$/\1/')

# get num cores & memory size
if [[ "$OSTYPE" == "darwin"* ]]; then
# macOS
num_cores=$(sysctl -n hw.ncpu)
mem_size=$(sysctl -n hw.memsize)
elif [[ "$OSTYPE" == "linux-gnu"* ]]; then
# Linux
num_cores=$(nproc)
mem_size=$(free -b | awk '/Mem/ {print $7}')
elif [[ "$OSTYPE" == "msys" || "$OSTYPE" == "cygwin" ]]; then
# Windows
num_cores=$(wmic cpu get NumberOfCores | grep -Eo '^[0-9]+')
mem_size=$(wmic OS get FreePhysicalMemory | grep -Eo '[0-9]+')
mem_size=$((mem_size * 1024))
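# note: wmic reports FreePhysicalMemory in kilobytes, hence the conversion to bytes above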
else
echo "Unsupported operating system: $OSTYPE"
exit 1
echo "Unsupported operating system: $OSTYPE"
exit 1
fi

function cleanup_files {
# Clean up temporary files
rm -f "$filestem".csv.*
@@ -146,53 +175,22 @@ SECONDS=0

cleanup_files

if [ ! -r "$data" ]; then
echo "> Downloading Benchmark data..."
curl -sS "$benchmark_data_url" > "$datazip"
curl -sS "$benchmark_data_url" >"$datazip"
"$sevenz_bin" e -y "$datazip"
echo ""
fi

# we get the rowcount, just in case the benchmark data was modified by the user to tailor
# the benchmark to their system/workload. We use the rowcount to compute records per second
rowcount=$("$qsv_bin" count "$data")
printf "Benchmark data rowcount: %'.0f\n" $rowcount
printf "Benchmark data rowcount: %'.0f\n" "$rowcount"
echo ""

if [ ! -r communityboards.csv ]; then
echo "> Downloading community board data..."
curl -sS https://raw.githubusercontent.com/wiki/jqnatividad/qsv/files/communityboards.csv >communityboards.csv
echo ""
fi

@@ -211,11 +209,11 @@ if [ ! -r data_to_exclude.csv ]; then
echo " benchmark_data.jsonl..."
"$qsv_bin" tojsonl "$data" --output benchmark_data.jsonl
echo " benchmark_data.schema.json..."
"$qsv_bin" schema "$data" --stdout > benchmark_data.csv.schema.json
"$qsv_bin" schema "$data" --stdout >benchmark_data.csv.schema.json
echo " benchmark_data.snappy..."
"$qsv_bin" snappy compress "$data" --output benchmark_data.snappy
echo " searchset_patterns.txt..."
printf "homeless\npark\nnoise\n" > searchset_patterns.txt
printf "homeless\npark\nnoise\n" >searchset_patterns.txt
echo ""
fi

@@ -229,8 +227,8 @@ commands_with_index_name=()
function add_command {
local dest_array="$1"
shift
local cmd="$@"
local cmd="$*"

if [[ "$dest_array" == "without_index" ]]; then
commands_without_index+=("$cmd")
else
@@ -242,13 +240,13 @@ function run {
local index=
while true; do
case "$1" in
--index)
index="yes"
shift
;;
*)
break
;;
esac
done

@@ -276,7 +274,7 @@ run apply_datefmt "$qsv_bin apply datefmt \"Created Date\" $data"
run apply_datefmt_multi "$qsv_bin apply datefmt \"Created Date,Closed Date,Due Date\" $data"
run apply_dynfmt "$qsv_bin apply dynfmt --formatstr \"{Created Date} {Complaint Type} - {BBL} {City}\" --new-column new_col $data"
run apply_emptyreplace "$qsv_bin" apply emptyreplace \"Bridge Highway Name\" --replacement Unspecified "$data"
run apply_op_eudex "$qsv_bin apply operations lower,eudex Agency --comparand Queens --new-column Agency_queens_soundex $data"
run apply_op_string "$qsv_bin apply operations lower Agency $data"
run apply_op_similarity "$qsv_bin apply operations lower,simdln Agency --comparand brooklyn --new-column Agency_sim-brooklyn_score $data"
run behead "$qsv_bin" behead "$data"
@@ -352,15 +350,15 @@ run sortcheck_unsorted_all "$qsv_bin" sortcheck --all data_unsorted.csv
run split "$qsv_bin" split --size 50000 split_tempdir "$data"
run --index split_index "$qsv_bin" split --size 50000 split_tempdir "$data"
run --index split_index_j1 "$qsv_bin" split --size 50000 -j 1 split_tempdir "$data"
run sqlp "$qsv_bin" sqlp "$data" -Q '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"'
run sqlp "$qsv_bin" sqlp "$data" -Q '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"'
run sqlp_format_arrow "$qsv_bin" sqlp --format arrow "$data" -Q '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"'
run sqlp_format_json "$qsv_bin" sqlp --format json "$data" -Q '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"'
run sqlp_format_parquet "$qsv_bin" sqlp --format parquet "$data" -Q '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"'
run sqlp_format_parquet_statistics "$qsv_bin" sqlp --format parquet --statistics "$data" -Q '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"'
run sqlp_lowmemory "$qsv_bin" sqlp "$data" -Q --low-memory '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"'
run sqlp_nooptimizations "$qsv_bin" sqlp "$data" -Q --no-optimizations '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"'
run sqlp_tryparsedates "$qsv_bin" sqlp "$data" -Q --try-parsedates '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"'
run sqlp_tryparsedates_inferlen "$qsv_bin" sqlp "$data" -Q --infer-len 10000 --try-parsedates '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"'
run stats "$qsv_bin" stats --force "$data"
run --index stats_index "$qsv_bin" stats --force "$data"
run --index stats_index_j1 "$qsv_bin" stats -j 1 --force "$data"
@@ -388,8 +386,7 @@ run --index validate_no_schema_index "$qsv_bin" validate "$data"
with_index_count=${#commands_with_index[@]}
wo_index_count=${#commands_without_index[@]}
total_count=$((with_index_count + wo_index_count))
printf "> Commands to benchmark: $total_count, w/o index: $wo_index_count, with index: $with_index_count\n\n"

printf "> Commands to benchmark: %s, w/o index: %s, with index: %s\n\n" "$total_count" "$wo_index_count" "$with_index_count"
# ---------------------------------------
# Prepare benchmark results directory

@@ -400,7 +397,7 @@ fi

# Init latest_results.csv. It stores the benchmark results for this run
rm -f results/latest_results.csv
echo "version,tstamp,name,mean,stddev,median,user,system,min,max" > results/latest_results.csv
echo "version,tstamp,name,mean,stddev,median,user,system,min,max" >results/latest_results.csv

# check if the file benchmark_results.csv exists, if it doesn't create it
# by copying the empty latest_results.csv
@@ -432,11 +429,11 @@ for command_no_index in "${commands_without_index[@]}"; do
echo "$name_idx. ${commands_without_index_name[$idx]}"
hyperfine --warmup "$warmup_runs" -i --runs "$benchmark_runs" --export-csv results/hf_result.csv \
"$command_no_index"

# prepend version, tstamp & benchmark name to the hyperfine results
echo "version,tstamp,name" > results/results_work.csv
echo "$version,$now,${commands_without_index_name[$idx]}" >> results/results_work.csv
echo "version,tstamp,name" >results/results_work.csv
echo "$version,$now,${commands_without_index_name[$idx]}" >>results/results_work.csv

# remove the command column from the hyperfine results, we just need the name
"$qsv_bin" select '!command' results/hf_result.csv -o results/hf_result_nocmd.csv

@@ -462,16 +459,16 @@ if [ "$with_index_count" -gt 0 ]; then
rm -f "$data".idx
"$qsv_bin" index "$data"
"$qsv_bin" stats "$data" --everything --infer-dates --force \
--output benchmark_work.stats.csv
fi

idx=0
for command_with_index in "${commands_with_index[@]}"; do
echo "$name_idx. ${commands_with_index_name[$idx]}"
hyperfine --warmup "$warmup_runs" -i --runs "$benchmark_runs" --export-csv results/hf_result.csv \
"$command_with_index"
echo "version,tstamp,name" > results/results_work.csv
echo "$version,$now,${commands_with_index_name[$idx]}" >> results/results_work.csv
echo "version,tstamp,name" >results/results_work.csv
echo "$version,$now,${commands_with_index_name[$idx]}" >>results/results_work.csv
"$qsv_bin" select '!command' results/hf_result.csv -o results/hf_result_nocmd.csv
"$qsv_bin" cat columns results/results_work.csv results/hf_result_nocmd.csv \
-o results/entry.csv
@@ -493,13 +490,13 @@ done
echo ""
# sort the benchmark results by version, tstamp & name
"$qsv_bin" sort --select version,tstamp,name results/latest_results.csv \
-o results/results_work.csv

# compute records per second for each benchmark using luau by dividing rowcount by mean
# we then round the result to a whole number and format with commas for readability
luau_cmd="recs_per_sec=( $rowcount / mean); return numWithCommas(recs_per_sec)"
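# e.g. a 1,000,000-row benchmark file with a 0.5s mean run time works out to 2,000,000 recs_per_sec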
"$qsv_bin" luau --begin file:benchmark_helper.luau map recs_per_sec "$luau_cmd" \
results/results_work.csv -o results/latest_results.csv

# Concatenate the final results of this run to results/benchmark_results.csv
"$qsv_bin" cat rowskey results/latest_results.csv results/benchmark_results.csv \
@@ -516,7 +513,7 @@ elapsed=$SECONDS

# Init latest_run_info.tsv. It stores the benchmark run info for this run
rm -f results/latest_run_info.tsv
echo -e "version\ttstamp\tlogtime\tbm_version\tplatform\tcores\tmem\tkind\targument\ttotal_count\two_index_count\twith_index_count\twarmup_runs\tbenchmark_runs\telapsed_secs\tversion_info" > results/latest_run_info.tsv
echo -e "version\ttstamp\tlogtime\tbm_version\tplatform\tcores\tmem\tkind\targument\ttotal_count\two_index_count\twith_index_count\twarmup_runs\tbenchmark_runs\telapsed_secs\tversion_info" >results/latest_run_info.tsv

# check if the file run_info_history.tsv exists, if it doesn't create it
# by copying the empty latest_run_info.tsv
@@ -525,7 +522,7 @@ if [ ! -f "results/run_info_history.tsv" ]; then
fi

# append the run info to latest_run_info.tsv
echo -e "$version\t$now\t$now_sec\t$bm_version\t$platform\t$num_cores\t$mem_size\t$kind\t$pat\t$total_count\t$wo_index_count\t$with_index_count\t$warmup_runs\t$benchmark_runs\t$elapsed\t$raw_version" >> results/latest_run_info.tsv
echo -e "$version\t$now\t$now_sec\t$bm_version\t$platform\t$num_cores\t$mem_size\t$kind\t$pat\t$total_count\t$wo_index_count\t$with_index_count\t$warmup_runs\t$benchmark_runs\t$elapsed\t$raw_version" >>results/latest_run_info.tsv

# now update the run_info_history.tsv
"$qsv_bin" cat rowskey results/latest_run_info.tsv results/run_info_history.tsv \