diff --git a/README.md b/README.md
index 763cbce2..86571a9a 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ These steps were tested on a fresh repository clone, Ubuntu ??.04.
 ./dependency/install_dependencies.sh
 
 # Compile a custom fork of PostgreSQL, load TPC-H, train the Proto-X agent, and tune.
-./scripts/quickstart.sh postgres path/to/put/pgdata/in tpch 0.01 protox
+./scripts/quickstart.sh postgres dir/to/put/dbdata/in/ tpch 0.01 protox
 ```
 
 ## Overview
diff --git a/benchmark/tpch/cli.py b/benchmark/tpch/cli.py
index d5c8c407..82adeff5 100644
--- a/benchmark/tpch/cli.py
+++ b/benchmark/tpch/cli.py
@@ -21,8 +21,8 @@ def tpch_group(dbgym_cfg: DBGymConfig):
 @tpch_group.command(name="data")
 @click.argument("scale-factor", type=float)
 @click.pass_obj
-# The reason generate-data is separate from create-pgdata is because generate-data is generic
-# to all DBMSs while create-pgdata is specific to Postgres.
+# The reason generate-data is separate from create-dbdata is that generate-data is generic
+# to all DBMSs while create-dbdata is specific to a single DBMS.
 def tpch_data(dbgym_cfg: DBGymConfig, scale_factor: float):
     _clone(dbgym_cfg)
     _generate_data(dbgym_cfg, scale_factor)
diff --git a/dbms/postgres/cli.py b/dbms/postgres/cli.py
index 72d74b4d..f81a877f 100644
--- a/dbms/postgres/cli.py
+++ b/dbms/postgres/cli.py
@@ -1,5 +1,5 @@
 """
-At a high level, this file's goal is to (1) install+build postgres and (2) create pgdata.
+At a high level, this file's goal is to (1) build postgres and (2) create dbdata (aka pgdata).
 On the other hand, the goal of tune.protox.env.util.postgres is to provide helpers to manage
     a Postgres instance during agent tuning.
 util.pg provides helpers used by *both* of the above files (as well as other files).
@@ -13,7 +13,7 @@
 
 from benchmark.tpch.load_info import TpchLoadInfo
 from dbms.load_info_base_class import LoadInfoBaseClass
-from misc.utils import DBGymConfig, conv_inputpath_to_realabspath, link_result, open_and_save, save_file, get_pgdata_tgz_name, default_pgbin_path, WORKSPACE_PATH_PLACEHOLDER, default_pgdata_parent_dpath, is_ssd
+from misc.utils import DBGymConfig, conv_inputpath_to_realabspath, link_result, open_and_save, save_file, get_dbdata_tgz_name, default_pgbin_path, WORKSPACE_PATH_PLACEHOLDER, default_dbdata_parent_dpath, is_ssd
 from util.shell import subprocess_run
 from sqlalchemy import Connection
 from util.pg import SHARED_PRELOAD_LIBRARIES, conn_execute, sql_file_execute, DBGYM_POSTGRES_DBNAME, create_conn, DEFAULT_POSTGRES_PORT, DBGYM_POSTGRES_USER, DBGYM_POSTGRES_PASS, DEFAULT_POSTGRES_DBNAME
@@ -31,7 +31,7 @@ def postgres_group(dbgym_cfg: DBGymConfig):
 
 @postgres_group.command(
     name="build",
-    help="Download and build the Postgres repository and all necessary extensions/shared libraries. Does not create pgdata.",
+    help="Download and build the Postgres repository and all necessary extensions/shared libraries. Does not create dbdata.",
 )
 @click.pass_obj
 @click.option("--rebuild", is_flag=True, help="Include this flag to rebuild Postgres even if it already exists.")
@@ -40,46 +40,46 @@ def postgres_build(dbgym_cfg: DBGymConfig, rebuild: bool):
 
 
 @postgres_group.command(
-    name="pgdata",
-    help="Build a .tgz file of pgdata with various specifications for its contents.",
+    name="dbdata",
+    help="Build a .tgz file of dbdata with various specifications for its contents.",
 )
 @click.pass_obj
 @click.argument("benchmark_name", type=str)
 @click.option("--scale-factor", type=float, default=1)
 @click.option("--pgbin-path", type=Path, default=None, help=f"The path to the bin containing Postgres executables. The default is {default_pgbin_path(WORKSPACE_PATH_PLACEHOLDER)}.")
 @click.option(
-    "--intended-pgdata-hardware",
+    "--intended-dbdata-hardware",
     type=click.Choice(["hdd", "ssd"]),
     default="hdd",
-    help=f"The intended hardware pgdata should be on. Used as a sanity check for --pgdata-parent-dpath.",
+    help=f"The intended hardware dbdata should be on. Used as a sanity check for --dbdata-parent-dpath.",
 )
 @click.option(
-    "--pgdata-parent-dpath",
+    "--dbdata-parent-dpath",
     default=None,
     type=Path,
-    help=f"The path to the parent directory of the pgdata which will be actively tuned. The default is {default_pgdata_parent_dpath(WORKSPACE_PATH_PLACEHOLDER)}.",
+    help=f"The path to the parent directory of the dbdata which will be actively tuned. The default is {default_dbdata_parent_dpath(WORKSPACE_PATH_PLACEHOLDER)}.",
 )
-def postgres_pgdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, intended_pgdata_hardware: str, pgdata_parent_dpath: Path):
+def postgres_dbdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, intended_dbdata_hardware: str, dbdata_parent_dpath: Path):
     # Set args to defaults programmatically (do this before doing anything else in the function)
     if pgbin_path == None:
         pgbin_path = default_pgbin_path(dbgym_cfg.dbgym_workspace_path)
-    if pgdata_parent_dpath == None:
-        pgdata_parent_dpath = default_pgdata_parent_dpath(dbgym_cfg.dbgym_workspace_path)
+    if dbdata_parent_dpath == None:
+        dbdata_parent_dpath = default_dbdata_parent_dpath(dbgym_cfg.dbgym_workspace_path)
 
     # Convert all input paths to absolute paths
     pgbin_path = conv_inputpath_to_realabspath(dbgym_cfg, pgbin_path)
-    pgdata_parent_dpath = conv_inputpath_to_realabspath(dbgym_cfg, pgdata_parent_dpath)
+    dbdata_parent_dpath = conv_inputpath_to_realabspath(dbgym_cfg, dbdata_parent_dpath)
 
     # Check assertions on args
-    if intended_pgdata_hardware == "hdd":
-        assert not is_ssd(pgdata_parent_dpath), f"Intended hardware is HDD but pgdata_parent_dpath ({pgdata_parent_dpath}) is an SSD"
-    elif intended_pgdata_hardware == "ssd":
-        assert is_ssd(pgdata_parent_dpath), f"Intended hardware is SSD but pgdata_parent_dpath ({pgdata_parent_dpath}) is an HDD"
+    if intended_dbdata_hardware == "hdd":
+        assert not is_ssd(dbdata_parent_dpath), f"Intended hardware is HDD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an SSD"
+    elif intended_dbdata_hardware == "ssd":
+        assert is_ssd(dbdata_parent_dpath), f"Intended hardware is SSD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an HDD"
     else:
         assert False
 
-    # Create pgdata
-    _create_pgdata(dbgym_cfg, benchmark_name, scale_factor, pgbin_path, pgdata_parent_dpath)
+    # Create dbdata
+    _create_dbdata(dbgym_cfg, benchmark_name, scale_factor, pgbin_path, dbdata_parent_dpath)
 
 
 def _get_pgbin_symlink_path(dbgym_cfg: DBGymConfig) -> Path:
@@ -108,52 +108,52 @@ def _build_repo(dbgym_cfg: DBGymConfig, rebuild):
     dbms_postgres_logger.info(f"Set up repo in {expected_repo_symlink_dpath}")
 
 
-def _create_pgdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, pgdata_parent_dpath: Path) -> None:
+def _create_dbdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, dbdata_parent_dpath: Path) -> None:
     """
-    I chose *not* for this function to skip by default if pgdata_tgz_symlink_path already exists. This
+    I chose *not* for this function to skip by default if dbdata_tgz_symlink_path already exists. This
     is because, while the generated data is deterministic given benchmark_name and scale_factor, any
-    change in the _create_pgdata() function would result in a different pgdata. Since _create_pgdata()
+    change in the _create_dbdata() function would result in a different dbdata. Since _create_dbdata()
     may change somewhat frequently, I decided to get rid of the footgun of having changes to
-    _create_pgdata() not propagate to [pgdata].tgz by default.
+    _create_dbdata() not propagate to [dbdata].tgz by default.
     """
-    # It's ok for the pgdata/ directory to be temporary. It just matters that the .tgz is saved in a safe place.
-    pgdata_dpath = pgdata_parent_dpath / "pgdata_being_created"
-    # We might be reusing the same pgdata_parent_dpath, so delete pgdata_dpath if it already exists
-    if pgdata_dpath.exists():
-        shutil.rmtree(pgdata_dpath)
+    # It's ok for the dbdata/ directory to be temporary. It just matters that the .tgz is saved in a safe place.
+    dbdata_dpath = dbdata_parent_dpath / "dbdata_being_created"
+    # We might be reusing the same dbdata_parent_dpath, so delete dbdata_dpath if it already exists
+    if dbdata_dpath.exists():
+        shutil.rmtree(dbdata_dpath)
 
     # Call initdb.
     # Save any script we call from pgbin_symlink_dpath because they are dependencies generated from another task run.
     save_file(dbgym_cfg, pgbin_path / "initdb")
-    subprocess_run(f'./initdb -D "{pgdata_dpath}"', cwd=pgbin_path)
+    subprocess_run(f'./initdb -D "{dbdata_dpath}"', cwd=pgbin_path)
 
-    # Start Postgres (all other pgdata setup requires postgres to be started).
+    # Start Postgres (all other dbdata setup requires postgres to be started).
     # Note that subprocess_run() never returns when running "pg_ctl start", so I'm using subprocess.run() instead.
-    start_postgres(dbgym_cfg, pgbin_path, pgdata_dpath)
+    start_postgres(dbgym_cfg, pgbin_path, dbdata_dpath)
 
     # Set up Postgres.
-    _generic_pgdata_setup(dbgym_cfg)
-    _load_benchmark_into_pgdata(dbgym_cfg, benchmark_name, scale_factor)
+    _generic_dbdata_setup(dbgym_cfg)
+    _load_benchmark_into_dbdata(dbgym_cfg, benchmark_name, scale_factor)
 
     # Stop Postgres so that we don't "leak" processes.
-    stop_postgres(dbgym_cfg, pgbin_path, pgdata_dpath)
+    stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath)
 
     # Create .tgz file.
-    # Note that you can't pass "[pgdata].tgz" as an arg to cur_task_runs_data_path() because that would create "[pgdata].tgz" as a dir.
-    pgdata_tgz_real_fpath = dbgym_cfg.cur_task_runs_data_path(
+    # Note that you can't pass "[dbdata].tgz" as an arg to cur_task_runs_data_path() because that would create "[dbdata].tgz" as a dir.
+    dbdata_tgz_real_fpath = dbgym_cfg.cur_task_runs_data_path(
         mkdir=True
-    ) / get_pgdata_tgz_name(benchmark_name, scale_factor)
-    # We need to cd into pgdata_dpath so that the tar file does not contain folders for the whole path of pgdata_dpath.
-    subprocess_run(f"tar -czf {pgdata_tgz_real_fpath} .", cwd=pgdata_dpath)
+    ) / get_dbdata_tgz_name(benchmark_name, scale_factor)
+    # We need to cd into dbdata_dpath so that the tar file does not contain folders for the whole path of dbdata_dpath.
+    subprocess_run(f"tar -czf {dbdata_tgz_real_fpath} .", cwd=dbdata_dpath)
 
     # Create symlink.
-    # Only link at the end so that the link only ever points to a complete pgdata.
-    pgdata_tgz_symlink_path = link_result(dbgym_cfg, pgdata_tgz_real_fpath)
-    dbms_postgres_logger.info(f"Created pgdata in {pgdata_tgz_symlink_path}")
+    # Only link at the end so that the link only ever points to a complete dbdata.
+    dbdata_tgz_symlink_path = link_result(dbgym_cfg, dbdata_tgz_real_fpath)
+    dbms_postgres_logger.info(f"Created dbdata in {dbdata_tgz_symlink_path}")
 
 
-def _generic_pgdata_setup(dbgym_cfg: DBGymConfig):
+def _generic_dbdata_setup(dbgym_cfg: DBGymConfig):
     # get necessary vars
     pgbin_real_dpath = _get_pgbin_symlink_path(dbgym_cfg).resolve()
     assert pgbin_real_dpath.exists()
@@ -181,15 +181,15 @@ def _generic_pgdata_setup(dbgym_cfg: DBGymConfig):
         cwd=pgbin_real_dpath,
     )
 
-    # Create the dbgym database. since one pgdata dir maps to one benchmark, all benchmarks will use the same database
-    # as opposed to using databases named after the benchmark
+    # Create the dbgym database. Since one dbdata dir maps to one benchmark, all benchmarks will use the same database
+    # as opposed to using databases named after the benchmark.
     subprocess_run(
         f"./psql -c \"create database {DBGYM_POSTGRES_DBNAME} with owner = '{dbgym_pguser}'\" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost",
         cwd=pgbin_real_dpath,
     )
 
 
-def _load_benchmark_into_pgdata(
+def _load_benchmark_into_dbdata(
     dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float
 ):
     with create_conn(use_psycopg=False) as conn:
@@ -197,13 +197,13 @@ def _load_benchmark_into_pgdata(
             load_info = TpchLoadInfo(dbgym_cfg, scale_factor)
         else:
             raise AssertionError(
-                f"_load_benchmark_into_pgdata(): the benchmark of name {benchmark_name} is not implemented"
+                f"_load_benchmark_into_dbdata(): the benchmark of name {benchmark_name} is not implemented"
             )
 
-    _load_into_pgdata(dbgym_cfg, conn, load_info)
+    _load_into_dbdata(dbgym_cfg, conn, load_info)
 
 
-def _load_into_pgdata(dbgym_cfg: DBGymConfig, conn: Connection, load_info: LoadInfoBaseClass):
+def _load_into_dbdata(dbgym_cfg: DBGymConfig, conn: Connection, load_info: LoadInfoBaseClass):
     sql_file_execute(dbgym_cfg, conn, load_info.get_schema_fpath())
 
     # truncate all tables first before even loading a single one
@@ -222,21 +222,21 @@ def _load_into_pgdata(dbgym_cfg: DBGymConfig, conn: Connection, load_info: LoadI
         sql_file_execute(dbgym_cfg, conn, constraints_fpath)
 
 
-def start_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, pgdata_dpath: Path) -> None:
-    _start_or_stop_postgres(dbgym_cfg, pgbin_path, pgdata_dpath, True)
+def start_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path) -> None:
+    _start_or_stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath, True)
 
 
-def stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, pgdata_dpath: Path) -> None:
-    _start_or_stop_postgres(dbgym_cfg, pgbin_path, pgdata_dpath, False)
+def stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path) -> None:
+    _start_or_stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath, False)
 
 
-def _start_or_stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, pgdata_dpath: Path, is_start: bool) -> None:
+def _start_or_stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path, is_start: bool) -> None:
     # They should be absolute paths and should exist
     assert pgbin_path.is_absolute() and pgbin_path.exists()
-    assert pgdata_dpath.is_absolute() and pgdata_dpath.exists()
+    assert dbdata_dpath.is_absolute() and dbdata_dpath.exists()
     # The inputs may be symlinks so we need to resolve them first
     pgbin_real_dpath = pgbin_path.resolve()
-    pgdata_dpath = pgdata_dpath.resolve()
+    dbdata_dpath = dbdata_dpath.resolve()
     pgport = DEFAULT_POSTGRES_PORT
     save_file(dbgym_cfg, pgbin_real_dpath / "pg_ctl")
@@ -244,7 +244,7 @@ def _start_or_stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, pgdata_dpa
         # We use subprocess.run() because subprocess_run() never returns when running "pg_ctl start".
         # The reason subprocess_run() never returns is because pg_ctl spawns a postgres process so .poll() always returns None.
         # On the other hand, subprocess.run() does return normally, like calling `./pg_ctl` on the command line would do.
-        result = subprocess.run(f"./pg_ctl -D \"{pgdata_dpath}\" -o '-p {pgport}' start", cwd=pgbin_real_dpath, shell=True)
+        result = subprocess.run(f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' start", cwd=pgbin_real_dpath, shell=True)
         result.check_returncode()
     else:
-        subprocess_run(f"./pg_ctl -D \"{pgdata_dpath}\" -o '-p {pgport}' stop", cwd=pgbin_real_dpath)
\ No newline at end of file
+        subprocess_run(f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' stop", cwd=pgbin_real_dpath)
\ No newline at end of file
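
For readers skimming the rename, the flow `_create_dbdata()` implements is unchanged: initdb into a scratch directory, start Postgres, run the generic setup plus the benchmark load, stop Postgres, then archive. A minimal sketch of that flow, assuming hypothetical `pgbin`/`dbdata_parent`/`out_tgz` paths rather than the repo's DBGymConfig plumbing:

    import shutil
    import subprocess
    from pathlib import Path

    def create_dbdata_sketch(pgbin: Path, dbdata_parent: Path, out_tgz: Path) -> None:
        dbdata = dbdata_parent / "dbdata_being_created"
        if dbdata.exists():
            shutil.rmtree(dbdata)  # initdb requires a missing or empty target dir
        subprocess.run(["./initdb", "-D", str(dbdata)], cwd=pgbin, check=True)
        # pg_ctl (unlike exec'ing `postgres` directly) returns once the server is up.
        subprocess.run(["./pg_ctl", "-D", str(dbdata), "start"], cwd=pgbin, check=True)
        # ... generic setup and the benchmark load would happen here ...
        subprocess.run(["./pg_ctl", "-D", str(dbdata), "stop"], cwd=pgbin, check=True)
        # Tar from *inside* dbdata so the archive stores relative paths only.
        subprocess.run(["tar", "-czf", str(out_tgz), "."], cwd=dbdata, check=True)

Tarring with `cwd=dbdata` is what later lets the restore path implant the archive directly into a fresh directory instead of recreating the snapshot's absolute path.
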
- subprocess_run(f"tar -czf {pgdata_tgz_real_fpath} .", cwd=pgdata_dpath) + ) / get_dbdata_tgz_name(benchmark_name, scale_factor) + # We need to cd into dbdata_dpath so that the tar file does not contain folders for the whole path of dbdata_dpath. + subprocess_run(f"tar -czf {dbdata_tgz_real_fpath} .", cwd=dbdata_dpath) # Create symlink. - # Only link at the end so that the link only ever points to a complete pgdata. - pgdata_tgz_symlink_path = link_result(dbgym_cfg, pgdata_tgz_real_fpath) - dbms_postgres_logger.info(f"Created pgdata in {pgdata_tgz_symlink_path}") + # Only link at the end so that the link only ever points to a complete dbdata. + dbdata_tgz_symlink_path = link_result(dbgym_cfg, dbdata_tgz_real_fpath) + dbms_postgres_logger.info(f"Created dbdata in {dbdata_tgz_symlink_path}") -def _generic_pgdata_setup(dbgym_cfg: DBGymConfig): +def _generic_dbdata_setup(dbgym_cfg: DBGymConfig): # get necessary vars pgbin_real_dpath = _get_pgbin_symlink_path(dbgym_cfg).resolve() assert pgbin_real_dpath.exists() @@ -181,15 +181,15 @@ def _generic_pgdata_setup(dbgym_cfg: DBGymConfig): cwd=pgbin_real_dpath, ) - # Create the dbgym database. since one pgdata dir maps to one benchmark, all benchmarks will use the same database - # as opposed to using databases named after the benchmark + # Create the dbgym database. Since one dbdata dir maps to one benchmark, all benchmarks will use the same database + # as opposed to using databases named after the benchmark. subprocess_run( f"./psql -c \"create database {DBGYM_POSTGRES_DBNAME} with owner = '{dbgym_pguser}'\" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost", cwd=pgbin_real_dpath, ) -def _load_benchmark_into_pgdata( +def _load_benchmark_into_dbdata( dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float ): with create_conn(use_psycopg=False) as conn: @@ -197,13 +197,13 @@ def _load_benchmark_into_pgdata( load_info = TpchLoadInfo(dbgym_cfg, scale_factor) else: raise AssertionError( - f"_load_benchmark_into_pgdata(): the benchmark of name {benchmark_name} is not implemented" + f"_load_benchmark_into_dbdata(): the benchmark of name {benchmark_name} is not implemented" ) - _load_into_pgdata(dbgym_cfg, conn, load_info) + _load_into_dbdata(dbgym_cfg, conn, load_info) -def _load_into_pgdata(dbgym_cfg: DBGymConfig, conn: Connection, load_info: LoadInfoBaseClass): +def _load_into_dbdata(dbgym_cfg: DBGymConfig, conn: Connection, load_info: LoadInfoBaseClass): sql_file_execute(dbgym_cfg, conn, load_info.get_schema_fpath()) # truncate all tables first before even loading a single one @@ -222,21 +222,21 @@ def _load_into_pgdata(dbgym_cfg: DBGymConfig, conn: Connection, load_info: LoadI sql_file_execute(dbgym_cfg, conn, constraints_fpath) -def start_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, pgdata_dpath: Path) -> None: - _start_or_stop_postgres(dbgym_cfg, pgbin_path, pgdata_dpath, True) +def start_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path) -> None: + _start_or_stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath, True) -def stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, pgdata_dpath: Path) -> None: - _start_or_stop_postgres(dbgym_cfg, pgbin_path, pgdata_dpath, False) +def stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path) -> None: + _start_or_stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath, False) -def _start_or_stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, pgdata_dpath: Path, is_start: bool) -> None: +def _start_or_stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, 
dbdata_dpath: Path, is_start: bool) -> None: # They should be absolute paths and should exist assert pgbin_path.is_absolute() and pgbin_path.exists() - assert pgdata_dpath.is_absolute() and pgdata_dpath.exists() + assert dbdata_dpath.is_absolute() and dbdata_dpath.exists() # The inputs may be symlinks so we need to resolve them first pgbin_real_dpath = pgbin_path.resolve() - pgdata_dpath = pgdata_dpath.resolve() + dbdata_dpath = dbdata_dpath.resolve() pgport = DEFAULT_POSTGRES_PORT save_file(dbgym_cfg, pgbin_real_dpath / "pg_ctl") @@ -244,7 +244,7 @@ def _start_or_stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, pgdata_dpa # We use subprocess.run() because subprocess_run() never returns when running "pg_ctl start". # The reason subprocess_run() never returns is because pg_ctl spawns a postgres process so .poll() always returns None. # On the other hand, subprocess.run() does return normally, like calling `./pg_ctl` on the command line would do. - result = subprocess.run(f"./pg_ctl -D \"{pgdata_dpath}\" -o '-p {pgport}' start", cwd=pgbin_real_dpath, shell=True) + result = subprocess.run(f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' start", cwd=pgbin_real_dpath, shell=True) result.check_returncode() else: - subprocess_run(f"./pg_ctl -D \"{pgdata_dpath}\" -o '-p {pgport}' stop", cwd=pgbin_real_dpath) \ No newline at end of file + subprocess_run(f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' stop", cwd=pgbin_real_dpath) \ No newline at end of file diff --git a/experiments/load_per_machine_envvars.sh b/experiments/load_per_machine_envvars.sh index 905c6c01..b9772d3c 100644 --- a/experiments/load_per_machine_envvars.sh +++ b/experiments/load_per_machine_envvars.sh @@ -2,9 +2,9 @@ host=$(hostname) if [ "$host" == "dev4" ]; then - export PGDATA_PARENT_DPATH=/mnt/nvme1n1/phw2/dbgym_tmp/ + export DBDATA_PARENT_DPATH=/mnt/nvme1n1/phw2/dbgym_tmp/ elif [ "$host" == "dev6" ]; then - export PGDATA_PARENT_DPATH=/mnt/nvme0n1/phw2/dbgym_tmp/ + export DBDATA_PARENT_DPATH=/mnt/nvme0n1/phw2/dbgym_tmp/ else echo "Did not recognize host \"$host\"" exit 1 diff --git a/experiments/protox_tpch_sf0point1/main.sh b/experiments/protox_tpch_sf0point1/main.sh index 876791ec..480f28ca 100755 --- a/experiments/protox_tpch_sf0point1/main.sh +++ b/experiments/protox_tpch_sf0point1/main.sh @@ -3,12 +3,12 @@ set -euxo pipefail SCALE_FACTOR=0.1 -INTENDED_PGDATA_HARDWARE=ssd +INTENDED_DBDATA_HARDWARE=ssd . ./experiments/load_per_machine_envvars.sh -echo $PGDATA_PARENT_DPATH +echo $DBDATA_PARENT_DPATH # space for testing. 
uncomment this to run individual commands from the script (copy pasting is harder because there are envvars) -# python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 4 --max-concurrent 4 --workload-timeout 100 --query-timeout 15 --tune-duration-during-hpo 0.1 --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH +# python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 4 --max-concurrent 4 --workload-timeout 100 --query-timeout 15 --tune-duration-during-hpo 0.1 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH python3 task.py tune protox agent tune tpch --scale-factor $SCALE_FACTOR --tune-duration-during-tune 0.2 python3 task.py tune protox agent replay tpch --scale-factor $SCALE_FACTOR exit 0 @@ -19,15 +19,15 @@ python3 task.py benchmark tpch workload --scale-factor $SCALE_FACTOR # postgres python3 task.py dbms postgres build -python3 task.py dbms postgres pgdata tpch --scale-factor $SCALE_FACTOR --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH +python3 task.py dbms postgres dbdata tpch --scale-factor $SCALE_FACTOR --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH exit 0 # embedding -python3 task.py tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH # long datagen so that train doesn't crash +python3 task.py tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH # long datagen so that train doesn't crash python3 task.py tune protox embedding train tpch --scale-factor $SCALE_FACTOR --iterations-per-epoch 1 --num-points-to-sample 1 --num-batches 1 --batch-size 64 --start-epoch 15 --num-samples 4 --train-max-concurrent 4 --num-curate 2 # agent -python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 4 --max-concurrent 4 --workload-timeout 100 --query-timeout 15 --tune-duration-during-hpo 1 --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH --build-space-good-for-boot +python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 4 --max-concurrent 4 --workload-timeout 100 --query-timeout 15 --tune-duration-during-hpo 1 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH --build-space-good-for-boot python3 task.py tune protox agent tune tpch --scale-factor $SCALE_FACTOR python3 task.py tune protox agent replay tpch --scale-factor $SCALE_FACTOR diff --git a/experiments/protox_tpch_sf10/main.sh b/experiments/protox_tpch_sf10/main.sh index 43a29f2e..62814340 100755 --- a/experiments/protox_tpch_sf10/main.sh +++ b/experiments/protox_tpch_sf10/main.sh @@ -3,11 +3,11 @@ set -euxo pipefail SCALE_FACTOR=10 -INTENDED_PGDATA_HARDWARE=ssd +INTENDED_DBDATA_HARDWARE=ssd . ./experiments/load_per_machine_envvars.sh # space for testing. 
uncomment this to run individual commands from the script (copy pasting is harder because there are envvars) -python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --max-concurrent 4 --tune-duration-during-hpo 4 --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH --build-space-good-for-boot +python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --max-concurrent 4 --tune-duration-during-hpo 4 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH --build-space-good-for-boot # python3 task.py tune protox agent tune tpch --scale-factor $SCALE_FACTOR --tune-duration-during-tune 4 # python3 task.py tune protox agent tune tpch --scale-factor $SCALE_FACTOR --enable-boot-during-tune --tune-duration-during-tune 4 # python3 task.py tune protox agent replay tpch --scale-factor $SCALE_FACTOR @@ -20,12 +20,12 @@ python3 task.py benchmark tpch workload --scale-factor $SCALE_FACTOR # postgres python3 task.py dbms postgres build -python3 task.py dbms postgres pgdata tpch --scale-factor $SCALE_FACTOR --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH +python3 task.py dbms postgres dbdata tpch --scale-factor $SCALE_FACTOR --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH # embedding -python3 task.py tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH +python3 task.py tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH python3 task.py tune protox embedding train tpch --scale-factor $SCALE_FACTOR --train-max-concurrent 10 # agent -python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --max-concurrent 4 --tune-duration-during-hpo 4 --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH --build-space-good-for-boot +python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --max-concurrent 4 --tune-duration-during-hpo 4 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH --build-space-good-for-boot python3 task.py tune protox agent tune tpch --scale-factor $SCALE_FACTOR diff --git a/misc/utils.py b/misc/utils.py index 7be57610..fb1dbde4 100644 --- a/misc/utils.py +++ b/misc/utils.py @@ -48,8 +48,8 @@ def get_scale_factor_string(scale_factor: float | str) -> str: else: return str(scale_factor).replace(".", "point") -def get_pgdata_tgz_name(benchmark_name: str, scale_factor: float) -> str: - return f"{benchmark_name}_sf{get_scale_factor_string(scale_factor)}_pristine_pgdata.tgz" +def get_dbdata_tgz_name(benchmark_name: str, scale_factor: float) -> str: + return f"{benchmark_name}_sf{get_scale_factor_string(scale_factor)}_pristine_dbdata.tgz" # Other parameters @@ -134,15 +134,15 @@ def get_pgdata_tgz_name(benchmark_name: str, scale_factor: float) -> str: / "data" / (workload_name + ".link") ) -default_pristine_pgdata_snapshot_path = ( +default_pristine_dbdata_snapshot_path = ( lambda workspace_path, benchmark_name, scale_factor: get_symlinks_path_from_workspace_path( workspace_path ) / "dbgym_dbms_postgres" / "data" - / (get_pgdata_tgz_name(benchmark_name, scale_factor) + ".link") + / 
(get_dbdata_tgz_name(benchmark_name, scale_factor) + ".link") ) -default_pgdata_parent_dpath = ( +default_dbdata_parent_dpath = ( lambda workspace_path: get_tmp_path_from_workspace_path( workspace_path ) @@ -202,8 +202,8 @@ def __init__(self, config_path): ) self.dbgym_symlinks_path.mkdir(parents=True, exist_ok=True) # tmp is a workspace for this run only - # one use for it is to place the unzipped pgdata - # there's no need to save the actual pgdata dir in run_*/ because we just save a symlink to + # one use for it is to place the unzipped dbdata + # there's no need to save the actual dbdata dir in run_*/ because we just save a symlink to # the .tgz file we unzipped self.dbgym_tmp_path = get_tmp_path_from_workspace_path(self.dbgym_workspace_path) if self.dbgym_tmp_path.exists(): diff --git a/scripts/pat_test.sh b/scripts/pat_test.sh index a21ef809..4353a5dc 100755 --- a/scripts/pat_test.sh +++ b/scripts/pat_test.sh @@ -3,11 +3,11 @@ set -euxo pipefail SCALE_FACTOR=0.01 -INTENDED_PGDATA_HARDWARE=ssd +INTENDED_DBDATA_HARDWARE=ssd . ./experiments/load_per_machine_envvars.sh # space for testing. uncomment this to run individual commands from the script (copy pasting is harder because there are envvars) -python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 2 --max-concurrent 2 --workload-timeout 15 --query-timeout 1 --tune-duration-during-hpo 0.01 --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH --build-space-good-for-boot +python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 2 --max-concurrent 2 --workload-timeout 15 --query-timeout 1 --tune-duration-during-hpo 0.01 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH --build-space-good-for-boot python3 task.py tune protox agent tune tpch --scale-factor $SCALE_FACTOR --tune-duration-during-tune 0.02 python3 task.py tune protox agent replay tpch --scale-factor $SCALE_FACTOR exit 0 @@ -18,16 +18,16 @@ python3 task.py benchmark tpch workload --scale-factor $SCALE_FACTOR # postgres python3 task.py dbms postgres build -python3 task.py dbms postgres pgdata tpch --scale-factor $SCALE_FACTOR --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH +python3 task.py dbms postgres dbdata tpch --scale-factor $SCALE_FACTOR --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH exit 0 # embedding -# python3 task.py tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --default-sample-limit 64 --file-limit 64 --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH # short datagen for testing -python3 task.py tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH # long datagen so that train doesn't crash +# python3 task.py tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --default-sample-limit 64 --file-limit 64 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH # short datagen for testing +python3 task.py tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH # long datagen so that train doesn't crash python3 task.py tune 
protox embedding train tpch --scale-factor $SCALE_FACTOR --iterations-per-epoch 1 --num-points-to-sample 1 --num-batches 1 --batch-size 64 --start-epoch 15 --num-samples 4 --train-max-concurrent 4 --num-curate 2 # agent -python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 2 --max-concurrent 2 --workload-timeout 15 --query-timeout 1 --tune-duration-during-hpo 0.01 --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH --build-space-good-for-boot +python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 2 --max-concurrent 2 --workload-timeout 15 --query-timeout 1 --tune-duration-during-hpo 0.01 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH --build-space-good-for-boot python3 task.py tune protox agent tune tpch --scale-factor $SCALE_FACTOR python3 task.py tune protox agent replay tpch --scale-factor $SCALE_FACTOR diff --git a/scripts/quickstart.sh b/scripts/quickstart.sh index d12f4ca3..847fae4d 100644 --- a/scripts/quickstart.sh +++ b/scripts/quickstart.sh @@ -3,11 +3,11 @@ set -euxo pipefail DBMS=$1 -PGDATA_PARENT_DPATH=$2 +DBDATA_PARENT_DPATH=$2 BENCHMARK=$3 SCALE_FACTOR=$4 AGENT=$5 -INTENDED_PGDATA_HARDWARE=ssd +INTENDED_DBDATA_HARDWARE=ssd # Benchmark python3 task.py benchmark $BENCHMARK data $SCALE_FACTOR @@ -15,11 +15,11 @@ python3 task.py benchmark $BENCHMARK workload --scale-factor $SCALE_FACTOR # DBMS python3 task.py dbms $DBMS build -python3 task.py dbms $DBMS pgdata tpch --scale-factor $SCALE_FACTOR --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH +python3 task.py dbms $DBMS dbdata tpch --scale-factor $SCALE_FACTOR --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH # Tune -python3 task.py tune $AGENT embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH # long datagen so that train doesn't crash +python3 task.py tune $AGENT embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH # long datagen so that train doesn't crash python3 task.py tune $AGENT embedding train tpch --scale-factor $SCALE_FACTOR --iterations-per-epoch 1 --num-points-to-sample 1 --num-batches 1 --batch-size 64 --start-epoch 15 --num-samples 4 --train-max-concurrent 4 --num-curate 2 -python3 task.py tune $AGENT agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 2 --max-concurrent 2 --workload-timeout 15 --query-timeout 1 --tune-duration-during-hpo 0.01 --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH --build-space-good-for-boot +python3 task.py tune $AGENT agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 2 --max-concurrent 2 --workload-timeout 15 --query-timeout 1 --tune-duration-during-hpo 0.01 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH --build-space-good-for-boot python3 task.py tune $AGENT agent tune tpch --scale-factor $SCALE_FACTOR python3 task.py tune $AGENT agent replay tpch --scale-factor $SCALE_FACTOR diff --git a/scripts/wan_test.sh b/scripts/wan_test.sh index 2e2c9f19..a700dd31 100755 --- a/scripts/wan_test.sh +++ b/scripts/wan_test.sh @@ -10,7 +10,7 @@ python3 task.py benchmark tpch 
generate-data 1 python3 task.py benchmark tpch generate-workload queries_15721_15723 15721 15723 # Create tpch_sf1.tgz -python3 task.py dbms postgres pgdata tpch --scale-factor 1 +python3 task.py dbms postgres dbdata tpch --scale-factor 1 # Run Proto-X python3 task.py dbms postgres start diff --git a/task.py b/task.py index 6aa61a2f..c20cdf62 100644 --- a/task.py +++ b/task.py @@ -6,7 +6,7 @@ from benchmark.cli import benchmark_group from dbms.cli import dbms_group -from misc.utils import DBGymConfig, is_ssd +from misc.utils import DBGymConfig from tune.cli import tune_group # TODO(phw2): save commit, git diff, and run command diff --git a/tune/protox/agent/build_trial.py b/tune/protox/agent/build_trial.py index 58e1aeb7..53e782a5 100644 --- a/tune/protox/agent/build_trial.py +++ b/tune/protox/agent/build_trial.py @@ -158,8 +158,8 @@ def _build_utilities( pg_conn = PostgresConn( dbgym_cfg=dbgym_cfg, pgport=pgport, - pristine_pgdata_snapshot_fpath=Path(hpo_params["pgconn_info"]["pristine_pgdata_snapshot_path"]), - pgdata_parent_dpath=Path(hpo_params["pgconn_info"]["pgdata_parent_dpath"]), + pristine_dbdata_snapshot_fpath=Path(hpo_params["pgconn_info"]["pristine_dbdata_snapshot_path"]), + dbdata_parent_dpath=Path(hpo_params["pgconn_info"]["dbdata_parent_dpath"]), pgbin_path=Path(hpo_params["pgconn_info"]["pgbin_path"]), enable_boot=enable_boot, boot_config_fpath=hpo_params["boot_config_fpath"][str(tuning_mode)], diff --git a/tune/protox/agent/coerce_config.py b/tune/protox/agent/coerce_config.py index 3c19900c..db8f06eb 100644 --- a/tune/protox/agent/coerce_config.py +++ b/tune/protox/agent/coerce_config.py @@ -35,8 +35,8 @@ def coerce_config(dbgym_cfg: DBGymConfig, space: dict[str, Any], hpo_params: dic "pgport": 5432, "pguser": "admin", "pgpass": "", - "pristine_pgdata_snapshot_path": "/mnt/nvme0n1/wz2/noisepage/pgdata", - "pgdata_parent_dpath": "/mnt/nvme0n1/wz2/noisepage/", + "pristine_dbdata_snapshot_path": "/mnt/nvme0n1/wz2/noisepage/pgdata", + "dbdata_parent_dpath": "/mnt/nvme0n1/wz2/noisepage/", "pgbin_path": "/mnt/nvme0n1/wz2/noisepage/", }, "benchmark_config": benchmark_config, diff --git a/tune/protox/agent/hpo.py b/tune/protox/agent/hpo.py index 8aea2033..bc3d8432 100644 --- a/tune/protox/agent/hpo.py +++ b/tune/protox/agent/hpo.py @@ -22,22 +22,22 @@ from ray.train import SyncConfig from tune.protox.agent.build_trial import build_trial -from misc.utils import DEFAULT_BOOT_CONFIG_FPATH, DEFAULT_WORKLOAD_TIMEOUT, DBGymConfig, TuningMode, link_result, open_and_save, restart_ray, conv_inputpath_to_realabspath, default_pristine_pgdata_snapshot_path, default_workload_path, default_embedder_path, default_benchmark_config_path, default_benchbase_config_path, WORKSPACE_PATH_PLACEHOLDER, BENCHMARK_NAME_PLACEHOLDER, WORKLOAD_NAME_PLACEHOLDER, SCALE_FACTOR_PLACEHOLDER, DEFAULT_SYSKNOBS_PATH, default_pgbin_path, workload_name_fn, default_pgdata_parent_dpath, default_hpoed_agent_params_fname, is_ssd +from misc.utils import DEFAULT_BOOT_CONFIG_FPATH, DEFAULT_WORKLOAD_TIMEOUT, DBGymConfig, TuningMode, link_result, open_and_save, restart_ray, conv_inputpath_to_realabspath, default_pristine_dbdata_snapshot_path, default_workload_path, default_embedder_path, default_benchmark_config_path, default_benchbase_config_path, WORKSPACE_PATH_PLACEHOLDER, BENCHMARK_NAME_PLACEHOLDER, WORKLOAD_NAME_PLACEHOLDER, SCALE_FACTOR_PLACEHOLDER, DEFAULT_SYSKNOBS_PATH, default_pgbin_path, workload_name_fn, default_dbdata_parent_dpath, default_hpoed_agent_params_fname, is_ssd METRIC_NAME = "Best Metric" class 
AgentHPOArgs: - def __init__(self, benchmark_name, workload_name, embedder_path, benchmark_config_path, benchbase_config_path, sysknobs_path, pristine_pgdata_snapshot_path, pgdata_parent_dpath, pgbin_path, workload_path, seed, agent, max_concurrent, num_samples, tune_duration_during_hpo, workload_timeout, query_timeout, enable_boot_during_hpo, boot_config_fpath_during_hpo, build_space_good_for_boot): + def __init__(self, benchmark_name, workload_name, embedder_path, benchmark_config_path, benchbase_config_path, sysknobs_path, pristine_dbdata_snapshot_path, dbdata_parent_dpath, pgbin_path, workload_path, seed, agent, max_concurrent, num_samples, tune_duration_during_hpo, workload_timeout, query_timeout, enable_boot_during_hpo, boot_config_fpath_during_hpo, build_space_good_for_boot): self.benchmark_name = benchmark_name self.workload_name = workload_name self.embedder_path = embedder_path self.benchmark_config_path = benchmark_config_path self.benchbase_config_path = benchbase_config_path self.sysknobs_path = sysknobs_path - self.pristine_pgdata_snapshot_path = pristine_pgdata_snapshot_path - self.pgdata_parent_dpath = pgdata_parent_dpath + self.pristine_dbdata_snapshot_path = pristine_dbdata_snapshot_path + self.dbdata_parent_dpath = dbdata_parent_dpath self.pgbin_path = pgbin_path self.workload_path = workload_path self.seed = seed @@ -90,28 +90,22 @@ def __init__(self, benchmark_name, workload_name, embedder_path, benchmark_confi help=f"The path to the file configuring the space of system knobs the tuner can tune.", ) @click.option( - "--pristine-pgdata-snapshot-path", + "--pristine-dbdata-snapshot-path", default=None, type=Path, - help=f"The path to the .tgz snapshot of the pgdata directory to use as a starting point for tuning. The default is {default_pristine_pgdata_snapshot_path(WORKSPACE_PATH_PLACEHOLDER, BENCHMARK_NAME_PLACEHOLDER, SCALE_FACTOR_PLACEHOLDER)}.", + help=f"The path to the .tgz snapshot of the dbdata directory to use as a starting point for tuning. The default is {default_pristine_dbdata_snapshot_path(WORKSPACE_PATH_PLACEHOLDER, BENCHMARK_NAME_PLACEHOLDER, SCALE_FACTOR_PLACEHOLDER)}.", ) @click.option( - "--pristine-pgdata-snapshot-path", - default=None, - type=Path, - help=f"The path to the .tgz snapshot of the pgdata directory to use as a starting point for tuning. The default is {default_pristine_pgdata_snapshot_path(WORKSPACE_PATH_PLACEHOLDER, BENCHMARK_NAME_PLACEHOLDER, SCALE_FACTOR_PLACEHOLDER)}.", -) -@click.option( - "--intended-pgdata-hardware", + "--intended-dbdata-hardware", type=click.Choice(["hdd", "ssd"]), default="hdd", - help=f"The intended hardware pgdata should be on. Used as a sanity check for --pgdata-parent-dpath.", + help=f"The intended hardware dbdata should be on. Used as a sanity check for --dbdata-parent-dpath.", ) @click.option( - "--pgdata-parent-dpath", + "--dbdata-parent-dpath", default=None, type=Path, - help=f"The path to the parent directory of the pgdata which will be actively tuned. The default is {default_pgdata_parent_dpath(WORKSPACE_PATH_PLACEHOLDER)}.", + help=f"The path to the parent directory of the dbdata which will be actively tuned. 
The default is {default_dbdata_parent_dpath(WORKSPACE_PATH_PLACEHOLDER)}.", ) @click.option( "--pgbin-path", @@ -198,9 +192,9 @@ def hpo( benchmark_config_path, benchbase_config_path, sysknobs_path, - pristine_pgdata_snapshot_path, - intended_pgdata_hardware, - pgdata_parent_dpath, + pristine_dbdata_snapshot_path, + intended_dbdata_hardware, + dbdata_parent_dpath, pgbin_path, workload_path, seed, @@ -222,10 +216,10 @@ def hpo( benchmark_config_path = default_benchmark_config_path(benchmark_name) if benchbase_config_path == None: benchbase_config_path = default_benchbase_config_path(benchmark_name) - if pristine_pgdata_snapshot_path == None: - pristine_pgdata_snapshot_path = default_pristine_pgdata_snapshot_path(dbgym_cfg.dbgym_workspace_path, benchmark_name, scale_factor) - if pgdata_parent_dpath == None: - pgdata_parent_dpath = default_pgdata_parent_dpath(dbgym_cfg.dbgym_workspace_path) + if pristine_dbdata_snapshot_path == None: + pristine_dbdata_snapshot_path = default_pristine_dbdata_snapshot_path(dbgym_cfg.dbgym_workspace_path, benchmark_name, scale_factor) + if dbdata_parent_dpath == None: + dbdata_parent_dpath = default_dbdata_parent_dpath(dbgym_cfg.dbgym_workspace_path) if pgbin_path == None: pgbin_path = default_pgbin_path(dbgym_cfg.dbgym_workspace_path) if workload_path == None: @@ -238,22 +232,22 @@ def hpo( benchmark_config_path = conv_inputpath_to_realabspath(dbgym_cfg, benchmark_config_path) benchbase_config_path = conv_inputpath_to_realabspath(dbgym_cfg, benchbase_config_path) sysknobs_path = conv_inputpath_to_realabspath(dbgym_cfg, sysknobs_path) - pristine_pgdata_snapshot_path = conv_inputpath_to_realabspath(dbgym_cfg, pristine_pgdata_snapshot_path) - pgdata_parent_dpath = conv_inputpath_to_realabspath(dbgym_cfg, pgdata_parent_dpath) + pristine_dbdata_snapshot_path = conv_inputpath_to_realabspath(dbgym_cfg, pristine_dbdata_snapshot_path) + dbdata_parent_dpath = conv_inputpath_to_realabspath(dbgym_cfg, dbdata_parent_dpath) pgbin_path = conv_inputpath_to_realabspath(dbgym_cfg, pgbin_path) workload_path = conv_inputpath_to_realabspath(dbgym_cfg, workload_path) boot_config_fpath_during_hpo = conv_inputpath_to_realabspath(dbgym_cfg, boot_config_fpath_during_hpo) # Check assertions on args - if intended_pgdata_hardware == "hdd": - assert not is_ssd(pgdata_parent_dpath), f"Intended hardware is HDD but pgdata_parent_dpath ({pgdata_parent_dpath}) is an SSD" - elif intended_pgdata_hardware == "ssd": - assert is_ssd(pgdata_parent_dpath), f"Intended hardware is SSD but pgdata_parent_dpath ({pgdata_parent_dpath}) is an HDD" + if intended_dbdata_hardware == "hdd": + assert not is_ssd(dbdata_parent_dpath), f"Intended hardware is HDD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an SSD" + elif intended_dbdata_hardware == "ssd": + assert is_ssd(dbdata_parent_dpath), f"Intended hardware is SSD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an HDD" else: assert False # Create args object - hpo_args = AgentHPOArgs(benchmark_name, workload_name, embedder_path, benchmark_config_path, benchbase_config_path, sysknobs_path, pristine_pgdata_snapshot_path, pgdata_parent_dpath, pgbin_path, workload_path, seed, agent, max_concurrent, num_samples, tune_duration_during_hpo, workload_timeout, query_timeout, enable_boot_during_hpo, boot_config_fpath_during_hpo, build_space_good_for_boot) + hpo_args = AgentHPOArgs(benchmark_name, workload_name, embedder_path, benchmark_config_path, benchbase_config_path, sysknobs_path, pristine_dbdata_snapshot_path, dbdata_parent_dpath, pgbin_path, workload_path, 
seed, agent, max_concurrent, num_samples, tune_duration_during_hpo, workload_timeout, query_timeout, enable_boot_during_hpo, boot_config_fpath_during_hpo, build_space_good_for_boot) _tune_hpo(dbgym_cfg, hpo_args) @@ -606,8 +600,8 @@ def _tune_hpo(dbgym_cfg: DBGymConfig, hpo_args: AgentHPOArgs) -> None: hpo_args.workload_path, embedder_path, pgconn_info={ - "pristine_pgdata_snapshot_path": hpo_args.pristine_pgdata_snapshot_path, - "pgdata_parent_dpath": hpo_args.pgdata_parent_dpath, + "pristine_dbdata_snapshot_path": hpo_args.pristine_dbdata_snapshot_path, + "dbdata_parent_dpath": hpo_args.dbdata_parent_dpath, "pgbin_path": hpo_args.pgbin_path, }, benchbase_config=benchbase_config, diff --git a/tune/protox/embedding/datagen.py b/tune/protox/embedding/datagen.py index e1be247a..3e4889c8 100644 --- a/tune/protox/embedding/datagen.py +++ b/tune/protox/embedding/datagen.py @@ -23,14 +23,14 @@ conv_inputpath_to_realabspath, default_benchmark_config_path, default_workload_path, - default_pristine_pgdata_snapshot_path, + default_pristine_dbdata_snapshot_path, default_pgbin_path, traindata_fname, link_result, open_and_save, save_file, workload_name_fn, - default_pgdata_parent_dpath, + default_dbdata_parent_dpath, is_ssd, ) from tune.protox.embedding.loss import COST_COLUMNS @@ -69,22 +69,22 @@ @click.option("--pgbin-path", type=Path, default=None, help=f"The path to the bin containing Postgres executables. The default is {default_pgbin_path(WORKSPACE_PATH_PLACEHOLDER)}.") # TODO(phw2): need to run pgtune before gathering data @click.option( - "--pristine-pgdata-snapshot-path", + "--pristine-dbdata-snapshot-path", default=None, type=Path, - help=f"The path to the .tgz snapshot of the pgdata directory to build an embedding space over. The default is {default_pristine_pgdata_snapshot_path(WORKSPACE_PATH_PLACEHOLDER, BENCHMARK_NAME_PLACEHOLDER, SCALE_FACTOR_PLACEHOLDER)}.", + help=f"The path to the .tgz snapshot of the dbdata directory to build an embedding space over. The default is {default_pristine_dbdata_snapshot_path(WORKSPACE_PATH_PLACEHOLDER, BENCHMARK_NAME_PLACEHOLDER, SCALE_FACTOR_PLACEHOLDER)}.", ) @click.option( - "--intended-pgdata-hardware", + "--intended-dbdata-hardware", type=click.Choice(["hdd", "ssd"]), default="hdd", - help=f"The intended hardware pgdata should be on. Used as a sanity check for --pgdata-parent-dpath.", + help=f"The intended hardware dbdata should be on. Used as a sanity check for --dbdata-parent-dpath.", ) @click.option( - "--pgdata-parent-dpath", + "--dbdata-parent-dpath", default=None, type=Path, - help=f"The path to the parent directory of the pgdata which will be actively tuned. The default is {default_pristine_pgdata_snapshot_path(WORKSPACE_PATH_PLACEHOLDER, BENCHMARK_NAME_PLACEHOLDER, SCALE_FACTOR_PLACEHOLDER)}.", + help=f"The path to the parent directory of the dbdata which will be actively tuned. 
The default is {default_pristine_dbdata_snapshot_path(WORKSPACE_PATH_PLACEHOLDER, BENCHMARK_NAME_PLACEHOLDER, SCALE_FACTOR_PLACEHOLDER)}.", ) @click.option( "--benchmark-config-path", @@ -154,9 +154,9 @@ def datagen( query_subset, scale_factor, pgbin_path, - pristine_pgdata_snapshot_path, - intended_pgdata_hardware, - pgdata_parent_dpath, + pristine_dbdata_snapshot_path, + intended_dbdata_hardware, + dbdata_parent_dpath, benchmark_config_path, workload_path, seed, @@ -191,12 +191,12 @@ def datagen( ) if pgbin_path == None: pgbin_path = default_pgbin_path(dbgym_cfg.dbgym_workspace_path) - if pristine_pgdata_snapshot_path == None: - pristine_pgdata_snapshot_path = default_pristine_pgdata_snapshot_path( + if pristine_dbdata_snapshot_path == None: + pristine_dbdata_snapshot_path = default_pristine_dbdata_snapshot_path( dbgym_cfg.dbgym_workspace_path, benchmark_name, scale_factor ) - if pgdata_parent_dpath == None: - pgdata_parent_dpath = default_pgdata_parent_dpath(dbgym_cfg.dbgym_workspace_path) + if dbdata_parent_dpath == None: + dbdata_parent_dpath = default_dbdata_parent_dpath(dbgym_cfg.dbgym_workspace_path) if max_concurrent == None: max_concurrent = os.cpu_count() if seed == None: @@ -206,14 +206,14 @@ def datagen( workload_path = conv_inputpath_to_realabspath(dbgym_cfg, workload_path) benchmark_config_path = conv_inputpath_to_realabspath(dbgym_cfg, benchmark_config_path) pgbin_path = conv_inputpath_to_realabspath(dbgym_cfg, pgbin_path) - pristine_pgdata_snapshot_path = conv_inputpath_to_realabspath(dbgym_cfg, pristine_pgdata_snapshot_path) - pgdata_parent_dpath = conv_inputpath_to_realabspath(dbgym_cfg, pgdata_parent_dpath) + pristine_dbdata_snapshot_path = conv_inputpath_to_realabspath(dbgym_cfg, pristine_dbdata_snapshot_path) + dbdata_parent_dpath = conv_inputpath_to_realabspath(dbgym_cfg, dbdata_parent_dpath) # Check assertions on args - if intended_pgdata_hardware == "hdd": - assert not is_ssd(pgdata_parent_dpath), f"Intended hardware is HDD but pgdata_parent_dpath ({pgdata_parent_dpath}) is an SSD" - elif intended_pgdata_hardware == "ssd": - assert is_ssd(pgdata_parent_dpath), f"Intended hardware is SSD but pgdata_parent_dpath ({pgdata_parent_dpath}) is an HDD" + if intended_dbdata_hardware == "hdd": + assert not is_ssd(dbdata_parent_dpath), f"Intended hardware is HDD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an SSD" + elif intended_dbdata_hardware == "ssd": + assert is_ssd(dbdata_parent_dpath), f"Intended hardware is SSD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an HDD" else: assert False @@ -238,7 +238,7 @@ def datagen( # Group args together to reduce the # of parameters we pass into functions # I chose to group them into separate objects instead because it felt hacky to pass a giant args object into every function generic_args = EmbeddingDatagenGenericArgs( - benchmark_name, workload_name, scale_factor, benchmark_config_path, seed, workload_path, pristine_pgdata_snapshot_path, pgdata_parent_dpath + benchmark_name, workload_name, scale_factor, benchmark_config_path, seed, workload_path, pristine_dbdata_snapshot_path, dbdata_parent_dpath ) dir_gen_args = EmbeddingDirGenArgs( leading_col_tbls, @@ -252,31 +252,31 @@ def datagen( # run all steps start_time = time.time() - pgdata_dpath = untar_snapshot(dbgym_cfg, generic_args.pristine_pgdata_snapshot_path, generic_args.pgdata_parent_dpath) + dbdata_dpath = untar_snapshot(dbgym_cfg, generic_args.pristine_dbdata_snapshot_path, generic_args.dbdata_parent_dpath) pgbin_path = 
default_pgbin_path(dbgym_cfg.dbgym_workspace_path) - start_postgres(dbgym_cfg, pgbin_path, pgdata_dpath) + start_postgres(dbgym_cfg, pgbin_path, dbdata_dpath) _gen_traindata_dir(dbgym_cfg, generic_args, dir_gen_args) _combine_traindata_dir_into_parquet(dbgym_cfg, generic_args, file_gen_args) datagen_duration = time.time() - start_time with open(f"{dbgym_cfg.dbgym_this_run_path}/datagen_time.txt", "w") as f: f.write(f"{datagen_duration}") - stop_postgres(dbgym_cfg, pgbin_path, pgdata_dpath) + stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath) -def untar_snapshot(dbgym_cfg: DBGymConfig, pgdata_snapshot_fpath: Path, pgdata_parent_dpath: Path) -> Path: +def untar_snapshot(dbgym_cfg: DBGymConfig, dbdata_snapshot_fpath: Path, dbdata_parent_dpath: Path) -> Path: # It should be an absolute path and it should exist - assert pgdata_snapshot_fpath.is_absolute() and pgdata_snapshot_fpath.exists(), f"untar_snapshot(): pgdata_snapshot_fpath ({pgdata_snapshot_fpath}) either doesn't exist or is not absolute" + assert dbdata_snapshot_fpath.is_absolute() and dbdata_snapshot_fpath.exists(), f"untar_snapshot(): dbdata_snapshot_fpath ({dbdata_snapshot_fpath}) either doesn't exist or is not absolute" # It may be a symlink so we need to resolve them first - pgdata_snapshot_real_fpath = pgdata_snapshot_fpath.resolve() - save_file(dbgym_cfg, pgdata_snapshot_real_fpath) - pgdata_dpath = pgdata_parent_dpath / "pgdata" - # Make the parent dir and the pgdata dir. Note how we require that pgdata_dpath does not exist while it's ok if the parent does. - pgdata_parent_dpath.mkdir(parents=True, exist_ok=True) - if pgdata_dpath.exists(): - shutil.rmtree(pgdata_dpath) - pgdata_dpath.mkdir(parents=False, exist_ok=False) - subprocess_run(f"tar -xzf {pgdata_snapshot_real_fpath} -C {pgdata_dpath}") - return pgdata_dpath + dbdata_snapshot_real_fpath = dbdata_snapshot_fpath.resolve() + save_file(dbgym_cfg, dbdata_snapshot_real_fpath) + dbdata_dpath = dbdata_parent_dpath / "dbdata" + # Make the parent dir and the dbdata dir. Note how we require that dbdata_dpath does not exist while it's ok if the parent does. 
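
(Note that the `--dbdata-parent-dpath` help string above originally pointed at `default_pristine_dbdata_snapshot_path`, a copy-paste slip; it is corrected here to `default_dbdata_parent_dpath`, matching the same option in dbms/postgres/cli.py and hpo.py.)

The `--intended-dbdata-hardware` sanity check appears in three CLIs and leans entirely on `misc.utils.is_ssd`, whose implementation this diff doesn't show. A plausible sketch of such a check on Linux (an assumption about the actual implementation, based on the kernel's rotational flag):

    import os
    from pathlib import Path

    def is_ssd_sketch(path: Path) -> bool:
        # Find the block device backing `path`, then read the kernel's
        # rotational flag: "0" means SSD/NVMe, "1" means a spinning disk.
        dev = os.stat(path).st_dev
        sys_dpath = Path(f"/sys/dev/block/{os.major(dev)}:{os.minor(dev)}").resolve()
        rotational = sys_dpath / "queue" / "rotational"
        if not rotational.exists():
            # Partitions expose the flag on their parent device.
            rotational = sys_dpath.parent / "queue" / "rotational"
        return rotational.read_text().strip() == "0"
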
diff --git a/tune/protox/env/pg_env.py b/tune/protox/env/pg_env.py
index 62fa92b8..92236519 100644
--- a/tune/protox/env/pg_env.py
+++ b/tune/protox/env/pg_env.py
@@ -220,8 +220,8 @@ def step_before_execution(self, action: HolonAction) -> Tuple[bool, EnvInfoDict]
         # Get the prior state.
         prior_state = copy.deepcopy(self.state_container)
         # Save the old configuration file.
-        old_conf_path = f"{self.pg_conn.pgdata_dpath}/postgresql.auto.conf"
-        conf_path = f"{self.pg_conn.pgdata_dpath}/postgresql.auto.old"
+        old_conf_path = f"{self.pg_conn.dbdata_dpath}/postgresql.auto.conf"
+        conf_path = f"{self.pg_conn.dbdata_dpath}/postgresql.auto.old"
         local["cp"][old_conf_path, conf_path].run()
 
         # Figure out what we have to change to get to the new configuration.
@@ -421,8 +421,8 @@ def attempt_checkpoint(conn_str: str) -> None:
     def close(self) -> None:
         self.pg_conn.shutdown_postgres()
         # This file may not be in [workspace]/tmp/, so it's important to delete it
-        local["rm"]["-rf", self.pg_conn.pgdata_dpath].run()
+        local["rm"]["-rf", self.pg_conn.dbdata_dpath].run()
         # Even though these files get deleted because [workspace]/tmp/ gets deleted,
         # we'll just delete them here anyways because why not
-        local["rm"]["-f", self.pg_conn.checkpoint_pgdata_snapshot_fpath].run()
-        local["rm"]["-f", f"{self.pg_conn.checkpoint_pgdata_snapshot_fpath}.tmp"].run()
+        local["rm"]["-f", self.pg_conn.checkpoint_dbdata_snapshot_fpath].run()
+        local["rm"]["-f", f"{self.pg_conn.checkpoint_dbdata_snapshot_fpath}.tmp"].run()
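
The `postgresql.auto.conf` dance in `step_before_execution` is easy to miss in a pure-rename hunk: the env copies the live auto.conf aside as `postgresql.auto.old` before `pg_conn` rewrites it, so a failed step can still be diagnosed against the prior knob configuration. The same pattern without plumbum, as a sketch (the function name and standalone form are illustrative, not the repo's helper):

    import shutil
    from pathlib import Path

    def stash_auto_conf(dbdata_dpath: Path) -> Path:
        # Equivalent of local["cp"][old_conf_path, conf_path].run():
        # keep the previous auto-generated knob file around before overwriting it.
        old_conf = dbdata_dpath / "postgresql.auto.conf"
        stash = dbdata_dpath / "postgresql.auto.old"
        shutil.copyfile(old_conf, stash)
        return stash
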
""" import os @@ -28,8 +28,8 @@ def __init__( self, dbgym_cfg: DBGymConfig, pgport: int, - pristine_pgdata_snapshot_fpath: Path, - pgdata_parent_dpath: Path, + pristine_dbdata_snapshot_fpath: Path, + dbdata_parent_dpath: Path, pgbin_path: Union[str, Path], connect_timeout: int, enable_boot: bool, @@ -46,20 +46,20 @@ def __init__( self.log_step = 0 self.logger = logger - # All the paths related to pgdata - # pristine_pgdata_snapshot_fpath is the .tgz snapshot that represents the starting state + # All the paths related to dbdata + # pristine_dbdata_snapshot_fpath is the .tgz snapshot that represents the starting state # of the database (with the default configuration). It is generated by a call to # `python tune.py dbms postgres ...` and should not be overwritten. - self.pristine_pgdata_snapshot_fpath = pristine_pgdata_snapshot_fpath - # checkpoint_pgdata_snapshot_fpath is the .tgz snapshot that represents the current + self.pristine_dbdata_snapshot_fpath = pristine_dbdata_snapshot_fpath + # checkpoint_dbdata_snapshot_fpath is the .tgz snapshot that represents the current # state of the database as it is being tuned. It is generated while tuning and is # discarded once tuning is completed. - self.checkpoint_pgdata_snapshot_fpath = dbgym_cfg.dbgym_tmp_path / "checkpoint_pgdata.tgz" - # pgdata_parent_dpath is the parent directory of the pgdata that is *actively being tuned*. - # Setting this lets us control the hardware device pgdata is built on (e.g. HDD vs. SSD). - self.pgdata_parent_dpath = pgdata_parent_dpath - # pgdata_dpath is the pgdata that is *actively being tuned* - self.pgdata_dpath = self.pgdata_parent_dpath / f"pgdata{self.pgport}" + self.checkpoint_dbdata_snapshot_fpath = dbgym_cfg.dbgym_tmp_path / "checkpoint_dbdata.tgz" + # dbdata_parent_dpath is the parent directory of the dbdata that is *actively being tuned*. + # Setting this lets us control the hardware device dbdata is built on (e.g. HDD vs. SSD). + self.dbdata_parent_dpath = dbdata_parent_dpath + # dbdata_dpath is the dbdata that is *actively being tuned* + self.dbdata_dpath = self.dbdata_parent_dpath / f"dbdata{self.pgport}" self._conn: Optional[psycopg.Connection[Any]] = None @@ -92,13 +92,13 @@ def move_log(self) -> None: def shutdown_postgres(self) -> None: """Shuts down postgres.""" self.disconnect() - if not Path(self.pgdata_dpath).exists(): + if not Path(self.dbdata_dpath).exists(): return while True: self.logger.get_logger(__name__).debug("Shutting down postgres...") _, stdout, stderr = local[f"{self.pgbin_path}/pg_ctl"][ - "stop", "--wait", "-t", "180", "-D", self.pgdata_dpath + "stop", "--wait", "-t", "180", "-D", self.dbdata_dpath ].run(retcode=None) time.sleep(1) self.logger.get_logger(__name__).debug( @@ -115,7 +115,7 @@ def shutdown_postgres(self) -> None: DBGYM_POSTGRES_DBNAME, ].run(retcode=None) - exists = (Path(self.pgdata_dpath) / "postmaster.pid").exists() + exists = (Path(self.dbdata_dpath) / "postmaster.pid").exists() if not exists and retcode != 0: break @@ -127,7 +127,7 @@ def start_with_changes( save_checkpoint: bool = False, ) -> bool: """ - This function assumes that some snapshot has already been untarred into self.pgdata_dpath + This function assumes that some snapshot has already been untarred into self.dbdata_dpath """ # Install the new configuration changes. if conf_changes is not None: @@ -135,11 +135,11 @@ def start_with_changes( # This way of doing it works for both single or multiple libraries. 
An example of a way # that *doesn't* work is `f"shared_preload_libraries='"{SHARED_PRELOAD_LIBRARIES}"'"` conf_changes.append(f"shared_preload_libraries='{SHARED_PRELOAD_LIBRARIES}'") - pgdata_auto_conf_path = self.pgdata_dpath / "postgresql.auto.conf" - with open(pgdata_auto_conf_path, "w") as f: + dbdata_auto_conf_path = self.dbdata_dpath / "postgresql.auto.conf" + with open(dbdata_auto_conf_path, "w") as f: f.write("\n".join(conf_changes)) save_auto_conf_path = self.dbgym_cfg.cur_task_runs_data_path(".", mkdir=True) / "postgresql.auto.conf" - local["cp"][pgdata_auto_conf_path, save_auto_conf_path].run() + local["cp"][dbdata_auto_conf_path, save_auto_conf_path].run() link_result(self.dbgym_cfg, save_auto_conf_path) # Start postgres instance. @@ -151,14 +151,14 @@ def start_with_changes( "cf", # We append .tmp so that if we fail in the *middle* of running tar, we # still have the previous checkpoint available to us - f"{self.checkpoint_pgdata_snapshot_fpath}.tmp", + f"{self.checkpoint_dbdata_snapshot_fpath}.tmp", "-C", - parent_dir(self.pgdata_dpath), - self.pgdata_dpath, + parent_dir(self.dbdata_dpath), + self.dbdata_dpath, ].run() # Make sure the PID lock file doesn't exist. - pid_lock = Path(f"{self.pgdata_dpath}/postmaster.pid") + pid_lock = Path(f"{self.dbdata_dpath}/postmaster.pid") assert not pid_lock.exists() if dump_page_cache: @@ -170,7 +170,7 @@ def start_with_changes( # Try starting up. retcode, stdout, stderr = local[f"{self.pgbin_path}/pg_ctl"][ "-D", - self.pgdata_dpath, + self.dbdata_dpath, "--wait", "-t", "180", @@ -241,7 +241,7 @@ def start_with_changes( # Move the temporary over since we now know the temporary can load. if save_checkpoint: - shutil.move(f"{self.pgdata_dpath}.tgz.tmp", f"{self.pgdata_dpath}.tgz") + shutil.move(f"{self.dbdata_dpath}.tgz.tmp", f"{self.dbdata_dpath}.tgz") return True @@ -332,29 +332,29 @@ def cancel_fn(conn_str: str) -> None: return 0, None def restore_pristine_snapshot(self): - self._restore_snapshot(self.pristine_pgdata_snapshot_fpath) + self._restore_snapshot(self.pristine_dbdata_snapshot_fpath) def restore_checkpointed_snapshot(self): - self._restore_snapshot(self.checkpoint_pgdata_snapshot_fpath) + self._restore_snapshot(self.checkpoint_dbdata_snapshot_fpath) @time_record("restore") def _restore_snapshot( - self, pgdata_snapshot_path: Path, + self, dbdata_snapshot_path: Path, ) -> bool: self.shutdown_postgres() - local["rm"]["-rf", self.pgdata_dpath].run() - local["mkdir"]["-m", "0700", "-p", self.pgdata_dpath].run() + local["rm"]["-rf", self.dbdata_dpath].run() + local["mkdir"]["-m", "0700", "-p", self.dbdata_dpath].run() - # Strip the "pgdata" so we can implant directly into the target pgdata_dpath. - assert pgdata_snapshot_path.exists() + # Strip the "dbdata" so we can implant directly into the target dbdata_dpath. + assert dbdata_snapshot_path.exists() local["tar"][ - "xf", pgdata_snapshot_path, "-C", self.pgdata_dpath, "--strip-components", "1" + "xf", dbdata_snapshot_path, "-C", self.dbdata_dpath, "--strip-components", "1" ].run() # Imprint the required port. ( (local["echo"][f"port={self.pgport}"]) - >> f"{self.pgdata_dpath}/postgresql.conf" + >> f"{self.dbdata_dpath}/postgresql.conf" )() return self.start_with_changes(conf_changes=None)
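
Finally, the restore path: `_restore_snapshot` re-implants a .tgz into a fresh dbdata directory and pins the port so several tuned instances (one per `pgport`) can coexist on the same machine. A condensed sketch of the same steps using the standard library instead of plumbum (paths and port are assumed inputs):

    import subprocess
    from pathlib import Path

    def restore_snapshot_sketch(snapshot_tgz: Path, dbdata_dpath: Path, pgport: int) -> None:
        # Wipe and recreate the active dbdata dir; Postgres refuses to start
        # unless the data directory's permissions are sufficiently restrictive.
        subprocess.run(["rm", "-rf", str(dbdata_dpath)], check=True)
        subprocess.run(["mkdir", "-m", "0700", "-p", str(dbdata_dpath)], check=True)
        # --strip-components 1 drops the archive's leading path component so the
        # snapshot's contents land directly inside dbdata_dpath.
        subprocess.run(
            ["tar", "-xf", str(snapshot_tgz), "-C", str(dbdata_dpath),
             "--strip-components", "1"],
            check=True,
        )
        # Imprint the required port, mirroring the `echo >> postgresql.conf` above.
        with open(dbdata_dpath / "postgresql.conf", "a") as f:
            f.write(f"\nport={pgport}\n")
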