Skip to content

Commit

Permalink
Factored PostgresConn out of Proto-X (#44)
Browse files Browse the repository at this point in the history
**Summary**: PostgresConn is now fully independent of Proto-X.

**Demo**:
![Screenshot 2024-10-30 at 12 13 02](https://github.com/user-attachments/assets/7f730149-ee70-4cd4-aa20-74f8f7e5d0bb)

**Details**:
* It is in the `tune/env/` folder and can be used by other agents. Read
that file for how it differs from `dbms/` and from `util/pg.py`.
* I wrote integration tests testing the basic functionality of
PostgresConn.
  • Loading branch information
wangpatrick57 authored Oct 30, 2024
1 parent 6f27be5 commit 2faecff
Show file tree
Hide file tree
Showing 25 changed files with 413 additions and 206 deletions.
19 changes: 16 additions & 3 deletions .github/workflows/tests_ci.yml → .github/workflows/tests_ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,27 @@ jobs:
./scripts/mypy.sh
- name: Run unit tests
# Unit tests are defined as tests which don't require any external systems to be running.
run: |
. "$HOME/.cargo/env"
python scripts/run_unit_tests.py
./scripts/run_unit_tests.sh
- name: Run integration tests
# Delete the workspace. Run once with a clean workspace. Run again from the existing workspace.
# Integration tests do require external systems to be running (most commonly a database instance).
# Unlike end-to-end tests though, they test a specific module in a detailed manner, much like a unit test does.
#
# We set `INTENDED_DBDATA_HARDWARE` so that it's seen when `integtest_pg_conn.py` executes `./tune/env/set_up_env_integtests.sh`.
run: |
. "$HOME/.cargo/env"
export INTENDED_DBDATA_HARDWARE=ssd
./scripts/run_integration_tests.sh
- name: Run end-to-end tests
# End-to-end tests are like integration tests in that they require external systems to be running.
# Unlike integration tests though, they don't perform detailed checks for any individual module.
#
# Note that we need to run with a non-root user in order to start Postgres. This is configured in the .yaml
# file for our self-hosted GHA runners.
run: |
. "$HOME/.cargo/env"
python -m scripts.run_protox_integration_test ssd
python -m scripts.run_protox_e2e_test ssd
File renamed without changes.
2 changes: 2 additions & 0 deletions dbms/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# This folder contains code for building DBMSs.
# It should not be confused with code that uses DBMSs (e.g. those in tune/env/).
5 changes: 5 additions & 0 deletions dbms/postgres/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,11 @@ def _load_into_dbdata(
sql_file_execute(dbgym_cfg, conn, constraints_fpath)


# The start and stop functions slightly duplicate functionality from pg_conn.py. However, I chose to do it this way
# because what the `dbms` CLI needs in terms of starting and stopping Postgres is much simpler than what an agent
# that is tuning the database needs. Because these functions are so simple, I think it's okay to leave them here
# even though they are a little redundant. It seems better than making `dbms` depend on the behavior of the
# tuning environment.
def start_postgres(
dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path
) -> None:
Expand Down
File renamed without changes.
3 changes: 3 additions & 0 deletions scripts/e2e_test_dbgym_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
dbgym_workspace_path: ../dbgym_e2etest_workspace
boot_redis_port: 7379
ray_gcs_port: 7380
3 changes: 0 additions & 3 deletions scripts/integtest_dbgym_config.yaml

This file was deleted.

2 changes: 2 additions & 0 deletions scripts/run_integration_tests.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/bash
python -m scripts.run_tests "integtest_*.py"
177 changes: 177 additions & 0 deletions scripts/run_protox_e2e_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
import os
import shutil
import subprocess
import sys
from enum import Enum, auto
from pathlib import Path

import yaml

from util.pg import get_is_postgres_running
from util.workspace import (
default_embedder_path,
default_hpoed_agent_params_path,
default_pristine_dbdata_snapshot_path,
default_replay_data_fpath,
default_repo_path,
default_tables_path,
default_traindata_path,
default_tuning_steps_dpath,
default_workload_path,
workload_name_fn,
)

# Be careful when changing these constants. In some places, the E2E test is hardcoded to work for these specific constants.
DBMS = "postgres"
AGENT = "protox"
BENCHMARK = "tpch"
SCALE_FACTOR = 0.01
E2ETEST_DBGYM_CONFIG_FPATH = Path("scripts/e2e_test_dbgym_config.yaml")


def get_workspace_dpath(config_fpath: Path) -> Path:
    """Return the workspace directory path declared in the given dbgym config file.

    Raises:
        KeyError: if the config file does not define `dbgym_workspace_path`.
    """
    with open(config_fpath, "r") as file:
        config = yaml.safe_load(file)
    # Fail loudly with a clear message if the key is missing instead of letting
    # Path(None) raise a confusing TypeError.
    workspace_dpath = config.get("dbgym_workspace_path")
    if workspace_dpath is None:
        raise KeyError(f"dbgym_workspace_path is not defined in {config_fpath}")
    return Path(workspace_dpath)


def clear_workspace(workspace_dpath: Path) -> None:
    """Delete the E2E-test workspace directory, if it exists.

    Guards against ever removing the real "../dbgym_workspace" directory,
    which would destroy the user's actual workspace.
    """
    if not workspace_dpath.exists():
        return
    real_workspace_dpath = Path("../dbgym_workspace")
    if real_workspace_dpath.exists():
        assert not workspace_dpath.samefile(
            real_workspace_dpath
        ), "YOU MAY BE ABOUT TO DELETE YOUR ACTUAL WORKSPACE"
    shutil.rmtree(workspace_dpath)


class Stage(Enum):
    """The sequential stages of the Proto-X end-to-end pipeline, in execution order."""

    Tables = 1
    Workload = 2
    DBRepo = 3
    DBData = 4
    EmbeddingData = 5
    EmbeddingModel = 6
    TuneHPO = 7
    TuneTune = 8
    Replay = 9


# When debugging the E2E test, this gives you an easy way of turning off certain stages
# to speed up your iteration cycle.
#
# I made this slightly convoluted system because you can't just naively comment out a big
# chunk of code with all the stages you don't want to run: many stages define variables
# that are used by future stages, and those definitions can't be commented out.
#
# One useful debugging workflow is to run all stages up until a point, make a copy of that
# workspace, and then rerun the test as many times as you want starting from that copy.
ALL_STAGES = set(Stage)
# This is a set and not a list because the order of stages is already pre-defined. This just defines what not to skip.
STAGES_TO_RUN = ALL_STAGES


if __name__ == "__main__":
intended_dbdata_hardware = sys.argv[1] if len(sys.argv) > 1 else "hdd"

# Set the config file so that we use resources that don't conflict with normal usage (e.g. a different workspace, different ports, etc.).
os.environ["DBGYM_CONFIG_PATH"] = str(E2ETEST_DBGYM_CONFIG_FPATH)

# Clear the E2E testing workspace so we always run the test with a clean slate.
workspace_dpath = get_workspace_dpath(E2ETEST_DBGYM_CONFIG_FPATH)
clear_workspace(workspace_dpath)

# Make other checks that we have a clean slate for testing.
assert not get_is_postgres_running()

# Run the full Proto-X training pipeline, asserting things along the way
# Setup (workload and database)
tables_dpath = default_tables_path(workspace_dpath, BENCHMARK, SCALE_FACTOR)
if Stage.Tables in STAGES_TO_RUN:
assert not tables_dpath.exists()
subprocess.run(
f"python task.py benchmark {BENCHMARK} data {SCALE_FACTOR}".split(),
check=True,
)
assert tables_dpath.exists()

workload_name = workload_name_fn(SCALE_FACTOR, 15721, 15721, "all")
workload_dpath = default_workload_path(workspace_dpath, BENCHMARK, workload_name)
if Stage.Workload in STAGES_TO_RUN:
assert not workload_dpath.exists()
subprocess.run(
f"python task.py benchmark {BENCHMARK} workload --scale-factor {SCALE_FACTOR}".split(),
check=True,
)
assert workload_dpath.exists()

repo_dpath = default_repo_path(workspace_dpath)
if Stage.DBRepo in STAGES_TO_RUN:
assert not repo_dpath.exists()
subprocess.run(f"python task.py dbms {DBMS} build".split(), check=True)
assert repo_dpath.exists()

pristine_dbdata_snapshot_fpath = default_pristine_dbdata_snapshot_path(
workspace_dpath, BENCHMARK, SCALE_FACTOR
)
if Stage.DBData in STAGES_TO_RUN:
assert not pristine_dbdata_snapshot_fpath.exists()
subprocess.run(
f"python task.py dbms {DBMS} dbdata {BENCHMARK} --scale-factor {SCALE_FACTOR} --intended-dbdata-hardware {intended_dbdata_hardware}".split(),
check=True,
)
assert pristine_dbdata_snapshot_fpath.exists()

# Tuning (embedding, HPO, and actual tuning)
traindata_dpath = default_traindata_path(workspace_dpath, BENCHMARK, workload_name)
if Stage.EmbeddingData in STAGES_TO_RUN:
assert not traindata_dpath.exists()
subprocess.run(
f"python task.py tune {AGENT} embedding datagen {BENCHMARK} --scale-factor {SCALE_FACTOR} --override-sample-limits lineitem,32768 --intended-dbdata-hardware {intended_dbdata_hardware}".split(),
check=True,
)
assert traindata_dpath.exists()

embedder_dpath = default_embedder_path(workspace_dpath, BENCHMARK, workload_name)
if Stage.EmbeddingModel in STAGES_TO_RUN:
assert not embedder_dpath.exists()
subprocess.run(
f"python task.py tune {AGENT} embedding train {BENCHMARK} --scale-factor {SCALE_FACTOR} --iterations-per-epoch 1 --num-points-to-sample 1 --num-batches 1 --batch-size 64 --start-epoch 15 --num-samples 4 --train-max-concurrent 4 --num-curate 2".split(),
check=True,
)
assert embedder_dpath.exists()

hpoed_agent_params_fpath = default_hpoed_agent_params_path(
workspace_dpath, BENCHMARK, workload_name
)
if Stage.TuneHPO in STAGES_TO_RUN:
assert not hpoed_agent_params_fpath.exists()
subprocess.run(
f"python task.py tune {AGENT} agent hpo {BENCHMARK} --scale-factor {SCALE_FACTOR} --num-samples 2 --max-concurrent 2 --workload-timeout 15 --query-timeout 1 --tune-duration-during-hpo 0.01 --intended-dbdata-hardware {intended_dbdata_hardware}".split(),
check=True,
)
assert hpoed_agent_params_fpath.exists()

tuning_steps_dpath = default_tuning_steps_dpath(
workspace_dpath, BENCHMARK, workload_name, False
)
if Stage.TuneTune in STAGES_TO_RUN:
assert not tuning_steps_dpath.exists()
subprocess.run(
f"python task.py tune {AGENT} agent tune {BENCHMARK} --scale-factor {SCALE_FACTOR}".split(),
check=True,
)
assert tuning_steps_dpath.exists()

# Post-training (replay and analysis)
replay_data_fpath = default_replay_data_fpath(
workspace_dpath, BENCHMARK, workload_name, False
)
if Stage.Replay in STAGES_TO_RUN:
assert not replay_data_fpath.exists()
subprocess.run(
f"python3 task.py tune {AGENT} agent replay {BENCHMARK} --scale-factor {SCALE_FACTOR}".split(),
check=True,
)
assert replay_data_fpath.exists()

# Clear it at the end as well to avoid leaving artifacts.
clear_workspace(workspace_dpath)
Loading

0 comments on commit 2faecff

Please sign in to comment.