Add end-to-end benchmarking scripts
You can invoke these from the root of the repo like:

   ./scripts/benchmark.sh ./tests/data/index/dataset_500/dataset_500_observations.csv

This will:
 - create a precovery database in a temporary directory
 - load the data into the database, measuring the time that takes
 - do a precovery search on 10 orbits, measuring the time that takes
 - print some results to stdout
 - write detailed results to two JSON files
   (precovery_create_benchmark_{timestamp}.json and
   precovery_search_benchmark_{timestamp}.json) in the current
   directory

This code was largely written by GPT-4, with some minor manual tweaks.
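The options must come before the data file (they are parsed with getopts): -d reuses an existing database directory instead of a temporary one, -o sets the healpix order (default 10), and -n sets the number of database-creation iterations (default 8). A hypothetical invocation using all three (the database directory name is just an example):

   ./scripts/benchmark.sh -d /tmp/precovery_db -o 8 -n 4 ./tests/data/index/dataset_500/dataset_500_observations.csv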
spenczar committed Apr 14, 2023
1 parent bd9942c commit d2054b3
Showing 3 changed files with 330 additions and 0 deletions.
50 changes: 50 additions & 0 deletions scripts/benchmark.sh
@@ -0,0 +1,50 @@
#!/bin/bash

# Default healpix order is 10; default number of database-creation iterations is 8
: "${ORDER:=10}"
: "${N_ITER:=8}"

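# Flags:
#   -d <dir>    use an existing database directory (otherwise a temporary one is created and removed)
#   -o <order>  healpix order to use when creating the database
#   -n <iters>  number of times to repeat database creation when timing it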
while getopts ":d:o:n:" opt; do
  case $opt in
    d)
      DB_DIR=$OPTARG
      ;;
    n)
      N_ITER=$OPTARG
      ;;
    o)
      ORDER=$OPTARG
      ;;
    \?)
      echo "Invalid option -$OPTARG" >&2
      exit 1
      ;;
  esac
done

shift $((OPTIND-1))

if [[ -z "${DB_DIR}" ]]; then
  DB_DIR=$(mktemp -d)
  CLEANUP_DB_DIR=true
fi

DATA_FILE=$1

if [[ -z "${DATA_FILE}" ]]; then
echo "Usage: $0 <data file> [-d <database directory>] [-n <order>]"
exit 1
fi

NOW=$(date +"%Y-%m-%d_%H-%M-%S")

echo "Creating precovery database in ${DB_DIR} with ORDER=${ORDER}..."
python scripts/measure_precovery_db_creation_time.py "${DB_DIR}" "${DATA_FILE}" "testdata" -p "${ORDER}" -n "${N_ITER}" -o "precovery_create_benchmark_${NOW}.json"

echo "Running precovery benchmark on ${DB_DIR}..."
python scripts/measure_precover_search_time.py "${DB_DIR}" -o "precovery_search_benchmark_${NOW}.json"

if [[ "${CLEANUP_DB_DIR}" == "true" ]]; then
echo "Cleaning up temporary database directory..."
rm -rf "${DB_DIR}"
fi
140 changes: 140 additions & 0 deletions scripts/measure_precover_search_time.py
@@ -0,0 +1,140 @@
import argparse
import json
import os
import time
from datetime import datetime
from typing import List

import numpy as np
import pandas as pd

from precovery.orbit import EpochTimescale, Orbit
from precovery.precovery_db import FrameCandidate, PrecoveryCandidate, PrecoveryDatabase


def sample_orbits():
    sample_orbits_file = os.path.join(
        os.path.dirname(__file__), "..", "tests", "data", "sample_orbits.csv"
    )
    df = pd.read_csv(sample_orbits_file)
    orbits = []
    for i in range(len(df)):
        orbit = Orbit.keplerian(
            i,
            df["a"].values[i],
            df["e"].values[i],
            df["i"].values[i],
            df["om"].values[i],
            df["w"].values[i],
            df["ma"].values[i],
            df["mjd_tt"].values[i],
            EpochTimescale.TT,
            df["H"].values[i],
            df["G"].values[i],
        )
        orbits.append(orbit)
    return orbits


def measure_precover_performance(
    database_directory: str, orbits: List[Orbit]
) -> tuple[PrecoveryDatabase, List[dict]]:
    db = PrecoveryDatabase.from_dir(database_directory)
    results = []

    for orbit in orbits:
        start_time = time.time()
        precover_results = db.precover(orbit)
        elapsed_time = time.time() - start_time

        precovery_count = sum(
            isinstance(res, PrecoveryCandidate) for res in precover_results
        )
        frame_count = sum(isinstance(res, FrameCandidate) for res in precover_results)

        results.append(
            {
                "elapsed_time": elapsed_time,
                "precovery_count": precovery_count,
                "frame_count": frame_count,
            }
        )

    return db, results


def calculate_statistics(results: List[dict]):
    elapsed_times = [res["elapsed_time"] for res in results]

    mean_time = np.mean(elapsed_times)
    median_time = np.median(elapsed_times)
    p10 = np.percentile(elapsed_times, 10)
    p90 = np.percentile(elapsed_times, 90)
    stdev = np.std(elapsed_times)

    return {
        "mean_time": mean_time,
        "median_time": median_time,
        "p10": p10,
        "p90": p90,
        "stdev": stdev,
    }


def save_results_to_json(
    db: PrecoveryDatabase,
    output_json_file: str,
    results: List[dict],
    database_directory: str,
    run_timestamp: str,
):
    statistics = calculate_statistics(results)

    report = {
        "results": results,
        "statistics": statistics,
        "database_directory": database_directory,
        "run_timestamp": run_timestamp,
        "database": {
            "size_bytes": db.frames.idx.n_bytes(),
            "n_frames": db.frames.idx.n_frames(),
            "dataset_ids": list(db.frames.idx.get_dataset_ids()),
        },
    }

    with open(output_json_file, "w") as f:
        json.dump(report, f, indent=4)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Measure the performance of the precover method on a PrecoveryDatabase."
    )
    parser.add_argument(
        "database_directory", help="Path to the PrecoveryDatabase directory."
    )
    parser.add_argument(
        "-o",
        "--output_json_file",
        default="precover_bench.json",
        help="Path to the output JSON file.",
    )
    args = parser.parse_args()

    orbits_to_test = sample_orbits()[:10]  # Test with 10 sample orbits

    db, results = measure_precover_performance(args.database_directory, orbits_to_test)

    print("\nResults for precover method:")
    for i, result in enumerate(results, start=1):
        print(
            f"Orbit {i}: {result['elapsed_time']:.2f} seconds, "
            f"{result['precovery_count']} PrecoveryCandidates, {result['frame_count']} FrameCandidates"
        )

    run_timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
    save_results_to_json(
        db, args.output_json_file, results, args.database_directory, run_timestamp
    )
    print(f"\nResults saved to JSON file: {args.output_json_file}")
140 changes: 140 additions & 0 deletions scripts/measure_precovery_db_creation_time.py
@@ -0,0 +1,140 @@
import argparse
import json
import shutil
import time
from datetime import datetime

import numpy as np

from precovery.precovery_db import PrecoveryDatabase


def count_lines(file_path):
    with open(file_path) as f:
        return sum(1 for _ in f)


def measure_db_creation_time(database_directory, csv_file_path, dataset_id, nside):
    start_time = time.time()

    db = PrecoveryDatabase.create(database_directory, nside)
    db.frames.add_dataset(dataset_id)
    db.frames.load_csv(csv_file_path, dataset_id)

    elapsed_time = time.time() - start_time
    db.frames.close()

    return elapsed_time


def print_statistics(elapsed_times):
    elapsed_times_array = np.array(elapsed_times)
    print("Summary statistics:")
    print(f" Mean time: {np.mean(elapsed_times_array):.2f} seconds")
    print(f" Median time: {np.median(elapsed_times_array):.2f} seconds")
    print(f" P10 time: {np.percentile(elapsed_times_array, 10):.2f} seconds")
    print(f" P90 time: {np.percentile(elapsed_times_array, 90):.2f} seconds")
    print(f" Standard deviation: {np.std(elapsed_times_array):.2f} seconds")


def save_results_to_json(
    db, output_json_file, elapsed_times, num_lines, input_data_filename, run_timestamp
):
    elapsed_times_array = np.array(elapsed_times)
    results = {
        "input_data_filename": input_data_filename,
        "number_of_rows": num_lines,
        "healpix_nside": db.frames.healpix_nside,
        "run_timestamp": run_timestamp,
        "database_statistics": {
            "number_of_frames": db.frames.idx.n_frames(),
            "database_size_bytes": db.frames.idx.n_bytes(),
            "unique_datasets": list(db.frames.idx.get_dataset_ids()),
        },
        "summary_statistics": {
            "mean_time": np.mean(elapsed_times_array),
            "median_time": np.median(elapsed_times_array),
            "p10_time": np.percentile(elapsed_times_array, 10),
            "p90_time": np.percentile(elapsed_times_array, 90),
            "stdev_time": np.std(elapsed_times_array),
        },
        "execution_times": elapsed_times,
    }

    with open(output_json_file, "w") as json_file:
        json.dump(results, json_file, indent=4)


def main(args):
    elapsed_times = []
    num_lines = count_lines(args.csv_file_path)

    for i in range(args.num_iterations):
        print(f"Running iteration {i + 1}/{args.num_iterations}")
        shutil.rmtree(args.database_directory, ignore_errors=True)
        elapsed_time = measure_db_creation_time(
            args.database_directory,
            args.csv_file_path,
            args.dataset_id,
            2**args.healpixel_order,  # HEALPix nside = 2**order
        )
        elapsed_times.append(elapsed_time)

    run_timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
    if args.output_json_file is not None:
        output_json_file = args.output_json_file
    else:
        output_json_file = "precovery_create_benchmark.json"

    # Load the existing database for gathering statistics
    db = PrecoveryDatabase.from_dir(args.database_directory)

    save_results_to_json(
        db,
        output_json_file,
        elapsed_times,
        num_lines,
        args.csv_file_path,
        run_timestamp,
    )
    print(f"Results saved to {output_json_file}")

    print_statistics(elapsed_times)

    # Close the database instance
    db.frames.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Measure Precovery database creation time."
    )
    parser.add_argument(
        "database_directory", help="Directory path for the Precovery database."
    )
    parser.add_argument("csv_file_path", help="Path to the input CSV file.")
    parser.add_argument("dataset_id", help="Dataset ID for the input data.")
    parser.add_argument(
        "-p", "--healpixel_order", type=int, default=12, help="Healpixel order to use."
    )
    parser.add_argument(
        "-n",
        "--num_iterations",
        type=int,
        default=20,
        help="Number of times to create the database (default: 20).",
    )
    parser.add_argument(
        "-o",
        "--output_json_file",
        help="Output JSON file (default: precovery_create_benchmark.json).",
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="Display verbose output."
    )
    args = parser.parse_args()

    if args.verbose:
        print(f"Running with the following arguments: {args}")

    main(args)
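The creation benchmark takes the database directory, the observations CSV, and a dataset ID as positional arguments, plus -p (healpixel order), -n (iterations), and -o (output JSON file). An illustrative standalone run mirroring what benchmark.sh does (the database directory is just an example):

   python scripts/measure_precovery_db_creation_time.py /tmp/precovery_db ./tests/data/index/dataset_500/dataset_500_observations.csv testdata -p 10 -n 8 -o precovery_create_benchmark.json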
