#!/bin/bash
#
# Benchmark precovery database creation and search.
#
# Usage: benchmark.sh [-d DB_DIR] [-o ORDER] [-n N_ITER] DATA_FILE
#   -d DB_DIR   directory for the precovery database
#               (default: a temporary directory, removed afterwards)
#   -o ORDER    healpix order passed to the creation script (default: 10)
#   -n N_ITER   number of database-creation iterations (default: 8)
#   DATA_FILE   input CSV of observations to load
#
# ORDER and N_ITER may also be supplied via the environment.

set -euo pipefail

# Defaults; pre-set environment variables take precedence.
: "${ORDER:=10}"
: "${N_ITER:=8}"

while getopts ":d:o:n:" opt; do
    case $opt in
        d) DB_DIR=$OPTARG ;;
        n) N_ITER=$OPTARG ;;
        o) ORDER=$OPTARG ;;
        \?)
            echo "Invalid option -$OPTARG" >&2
            exit 1
            ;;
    esac
done

shift $((OPTIND - 1))

# No -d given: use a throwaway directory and remember to clean it up.
if [[ -z "${DB_DIR:-}" ]]; then
    DB_DIR=$(mktemp -d)
    CLEANUP_DB_DIR=true
fi

DATA_FILE=${1:-}

if [[ -z "${DATA_FILE}" ]]; then
    # BUG FIX: the original usage line printed empty option placeholders and
    # omitted both the required DATA_FILE argument and the -o flag.
    echo "Usage: $0 [-d DB_DIR] [-o ORDER] [-n N_ITER] DATA_FILE" >&2
    exit 1
fi

# Timestamp used to make the two output report filenames unique per run.
NOW=$(date +"%Y-%m-%d_%H-%M-%S")

echo "Creating precovery database in ${DB_DIR} with ORDER=${ORDER}..."
python scripts/measure_precovery_db_creation_time.py "${DB_DIR}" "${DATA_FILE}" "testdata" -p "${ORDER}" -n "${N_ITER}" -o "precovery_create_benchmark_${NOW}.json"

echo "Running precovery benchmark on ${DB_DIR}..."
python scripts/measure_precover_search_time.py "${DB_DIR}" -o "precovery_search_benchmark_${NOW}.json"

if [[ "${CLEANUP_DB_DIR:-}" == "true" ]]; then
    echo "Cleaning up temporary database directory..."
    rm -rf "${DB_DIR}"
fi
+ rm -rf "${DB_DIR}" +fi diff --git a/scripts/measure_precover_search_time.py b/scripts/measure_precover_search_time.py new file mode 100644 index 0000000..b450bd4 --- /dev/null +++ b/scripts/measure_precover_search_time.py @@ -0,0 +1,140 @@ +import argparse +import json +import os +import time +from datetime import datetime +from typing import List + +import pandas as pd + +from precovery.orbit import EpochTimescale, Orbit +from precovery.precovery_db import FrameCandidate, PrecoveryCandidate, PrecoveryDatabase + + +def sample_orbits(): + sample_orbits_file = os.path.join( + os.path.dirname(__file__), "..", "tests", "data", "sample_orbits.csv" + ) + df = pd.read_csv(sample_orbits_file) + orbits = [] + for i in range(len(df)): + orbit = Orbit.keplerian( + i, + df["a"].values[i], + df["e"].values[i], + df["i"].values[i], + df["om"].values[i], + df["w"].values[i], + df["ma"].values[i], + df["mjd_tt"].values[i], + EpochTimescale.TT, + df["H"].values[i], + df["G"].values[i], + ) + orbits.append(orbit) + return orbits + + +def measure_precover_performance( + database_directory: str, orbits: List[Orbit] +) -> tuple[PrecoveryDatabase, List[dict]]: + db = PrecoveryDatabase.from_dir(database_directory) + results = [] + + for orbit in orbits: + start_time = time.time() + precover_results = db.precover(orbit) + elapsed_time = time.time() - start_time + + precovery_count = sum( + isinstance(res, PrecoveryCandidate) for res in precover_results + ) + frame_count = sum(isinstance(res, FrameCandidate) for res in precover_results) + + results.append( + { + "elapsed_time": elapsed_time, + "precovery_count": precovery_count, + "frame_count": frame_count, + } + ) + + return db, results + + +def calculate_statistics(results: List[dict]): + import numpy as np + + elapsed_times = [res["elapsed_time"] for res in results] + + mean_time = np.mean(elapsed_times) + median_time = np.median(elapsed_times) + p10 = np.percentile(elapsed_times, 10) + p90 = np.percentile(elapsed_times, 90) + stdev 
= np.std(elapsed_times) + + return { + "mean_time": mean_time, + "median_time": median_time, + "p10": p10, + "p90": p90, + "stdev": stdev, + } + + +def save_results_to_json( + db: PrecoveryDatabase, + output_json_file: str, + results: List[dict], + database_directory: str, + run_timestamp: str, +): + statistics = calculate_statistics(results) + + report = { + "results": results, + "statistics": statistics, + "database_directory": database_directory, + "run_timestamp": run_timestamp, + "database": { + "size_bytes": db.frames.idx.n_bytes(), + "n_frames": db.frames.idx.n_frames(), + "dataset_ids": list(db.frames.idx.get_dataset_ids()), + }, + } + + with open(output_json_file, "w") as f: + json.dump(report, f, indent=4) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Measure the performance of the precover method on a PrecoveryDatabase." + ) + parser.add_argument( + "database_directory", help="Path to the PrecoveryDatabase directory." + ) + parser.add_argument( + "-o", + "--output_json_file", + default="precover_bench.json", + help="Path to the output JSON file.", + ) + args = parser.parse_args() + + orbits_to_test = sample_orbits()[:10] # Test with 10 sample orbits + + db, results = measure_precover_performance(args.database_directory, orbits_to_test) + + print("\nResults for precover method:") + for i, result in enumerate(results, start=1): + print( + f"Orbit {i}: {result['elapsed_time']:.2f} seconds, " + f"{result['precovery_count']} PrecoveryCandidates, {result['frame_count']} FrameCandidates" + ) + + run_timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + save_results_to_json( + db, args.output_json_file, results, args.database_directory, run_timestamp + ) + print(f"\nResults saved to JSON file: {args.output_json_file}") diff --git a/scripts/measure_precovery_db_creation_time.py b/scripts/measure_precovery_db_creation_time.py new file mode 100644 index 0000000..5ad55c3 --- /dev/null +++ 
"""Benchmark PrecoveryDatabase creation time.

Repeatedly builds a precovery database from a CSV file (removing the database
directory between iterations), prints summary timing statistics, and writes a
JSON report including database statistics from the final build.
"""
import argparse
import json
import shutil
import time
from datetime import datetime

import numpy as np

from precovery.precovery_db import PrecoveryDatabase


def count_lines(file_path: str) -> int:
    """Return the number of lines in *file_path* without loading it in memory."""
    with open(file_path) as f:
        return sum(1 for _ in f)


def measure_db_creation_time(
    database_directory: str, csv_file_path: str, dataset_id: str, nside: int
) -> float:
    """Create a database and load the CSV once; return elapsed seconds.

    The close() call is deliberately excluded from the timed region, matching
    the original measurement.
    """
    # perf_counter is monotonic and high-resolution; time.time() can jump if
    # the wall clock is adjusted mid-benchmark.
    start_time = time.perf_counter()

    db = PrecoveryDatabase.create(database_directory, nside)
    db.frames.add_dataset(dataset_id)
    db.frames.load_csv(csv_file_path, dataset_id)

    elapsed_time = time.perf_counter() - start_time
    db.frames.close()

    return elapsed_time


def print_statistics(elapsed_times: list) -> None:
    """Print mean/median/p10/p90/stdev of the iteration times."""
    elapsed_times_array = np.array(elapsed_times)
    print("Summary statistics:")
    print(f"  Mean time: {np.mean(elapsed_times_array):.2f} seconds")
    print(f"  Median time: {np.median(elapsed_times_array):.2f} seconds")
    print(f"  P10 time: {np.percentile(elapsed_times_array, 10):.2f} seconds")
    print(f"  P90 time: {np.percentile(elapsed_times_array, 90):.2f} seconds")
    print(f"  Standard deviation: {np.std(elapsed_times_array):.2f} seconds")


def save_results_to_json(
    db, output_json_file, elapsed_times, num_lines, input_data_filename, run_timestamp
):
    """Write the creation-benchmark report as JSON.

    numpy's mean/median/percentile/std return np.float64, a float subclass,
    so json.dump serializes them directly.
    """
    elapsed_times_array = np.array(elapsed_times)
    results = {
        "input_data_filename": input_data_filename,
        "number_of_rows": num_lines,
        "healpix_nside": db.frames.healpix_nside,
        "run_timestamp": run_timestamp,
        "database_statistics": {
            "number_of_frames": db.frames.idx.n_frames(),
            "database_size_bytes": db.frames.idx.n_bytes(),
            "unique_datasets": list(db.frames.idx.get_dataset_ids()),
        },
        "summary_statistics": {
            "mean_time": np.mean(elapsed_times_array),
            "median_time": np.median(elapsed_times_array),
            "p10_time": np.percentile(elapsed_times_array, 10),
            "p90_time": np.percentile(elapsed_times_array, 90),
            "stdev_time": np.std(elapsed_times_array),
        },
        "execution_times": elapsed_times,
    }

    with open(output_json_file, "w") as json_file:
        json.dump(results, json_file, indent=4)


def main(args):
    """Run the creation benchmark for args.num_iterations iterations."""
    elapsed_times = []
    num_lines = count_lines(args.csv_file_path)

    for i in range(args.num_iterations):
        print(f"Running iteration {i + 1}/{args.num_iterations}")
        # Start each iteration from a clean slate.
        shutil.rmtree(args.database_directory, ignore_errors=True)
        elapsed_time = measure_db_creation_time(
            args.database_directory,
            args.csv_file_path,
            args.dataset_id,
            2**args.healpixel_order,  # healpix nside = 2^order
        )
        elapsed_times.append(elapsed_time)

    run_timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")

    # Reopen the database from the last iteration to gather statistics.
    db = PrecoveryDatabase.from_dir(args.database_directory)

    save_results_to_json(
        db,
        args.output_json_file,
        elapsed_times,
        num_lines,
        args.csv_file_path,
        run_timestamp,
    )
    print(f"Results saved to {args.output_json_file}")

    print_statistics(elapsed_times)

    # Close the database instance
    db.frames.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Measure Precovery database creation time."
    )
    parser.add_argument(
        "database_directory", help="Directory path for the Precovery database."
    )
    parser.add_argument("csv_file_path", help="Path to the input CSV file.")
    parser.add_argument("dataset_id", help="Dataset ID for the input data.")
    parser.add_argument(
        "-p", "--healpixel_order", type=int, default=12, help="Healpixel order to use."
    )
    parser.add_argument(
        "-n",
        "--num_iterations",
        type=int,
        default=20,
        help="Number of times to create the database (default: 20).",
    )
    # BUG FIX: the help text previously claimed "default: results_TIMESTAMP.json"
    # while the code fell back to "precovery_create_benchmark.json". Declare the
    # real default here (same effective value as before) and say so in the help.
    parser.add_argument(
        "-o",
        "--output_json_file",
        default="precovery_create_benchmark.json",
        help="Output JSON file (default: precovery_create_benchmark.json).",
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="Display verbose output."
    )
    args = parser.parse_args()

    if args.verbose:
        print(f"Running with the following arguments: {args}")

    main(args)