From a6c60615abd97450f1c4d942a91e1dd8ae23e717 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sat, 21 Dec 2024 20:34:06 -0500 Subject: [PATCH 1/6] update tables url (verified manually) --- benchmark/job/cli.py | 6 ++---- dependencies/requirements.txt | 1 - scripts/run_protox_e2e_test.py | 2 +- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py index be2feb01..3f6fd6d7 100644 --- a/benchmark/job/cli.py +++ b/benchmark/job/cli.py @@ -12,8 +12,7 @@ link_result, ) -# JOB_TABLES_URL = "https://homepages.cwi.nl/~boncz/job/imdb.tgz" # This link stopped working for me -JOB_TABLES_URL = "https://drive.google.com/uc?id=19m0zDpphAw0Bu9Irr_ta9EGr5k85hiN1" +JOB_TABLES_URL = "https://event.cwi.nl/da/job/imdb.tgz" JOB_QUERY_NAMES = [ "1a", "1b", @@ -177,8 +176,7 @@ def _download_job_data(dbgym_cfg: DBGymConfig) -> None: logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloading: {expected_symlink_dpath}") real_data_path = dbgym_cfg.cur_task_runs_data_path(mkdir=True) - # subprocess_run(f"curl -O {JOB_TABLES_URL}", cwd=real_data_path) # This is if we're using a non-Google-Drive link - subprocess_run(f"gdown {JOB_TABLES_URL}", cwd=real_data_path) + subprocess_run(f"curl -O {JOB_TABLES_URL}", cwd=real_data_path) job_data_dpath = dbgym_cfg.cur_task_runs_data_path( default_tables_dname(DEFAULT_SCALE_FACTOR), mkdir=True ) diff --git a/dependencies/requirements.txt b/dependencies/requirements.txt index e1252f80..6c0cb4b7 100644 --- a/dependencies/requirements.txt +++ b/dependencies/requirements.txt @@ -135,4 +135,3 @@ Werkzeug==3.0.1 wrapt==1.14.1 zipp==3.17.0 streamlit==1.39.0 -gdown==5.2.0 \ No newline at end of file diff --git a/scripts/run_protox_e2e_test.py b/scripts/run_protox_e2e_test.py index 7d25374c..f86af1e5 100644 --- a/scripts/run_protox_e2e_test.py +++ b/scripts/run_protox_e2e_test.py @@ -200,5 +200,5 @@ def run_e2e_for_benchmark(benchmark_name: str, intended_dbdata_hardware: str) -> # Set the config file so that we use resources that don't conflict with normal usage (e.g. a different workspace, different ports, etc.). os.environ["DBGYM_CONFIG_PATH"] = str(E2ETEST_DBGYM_CONFIG_FPATH) - run_e2e_for_benchmark("tpch", intended_dbdata_hardware) + # run_e2e_for_benchmark("tpch", intended_dbdata_hardware) run_e2e_for_benchmark("job", intended_dbdata_hardware) From 86731552e350b40a5e7110af03ee53d46579c8c4 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sat, 21 Dec 2024 20:43:52 -0500 Subject: [PATCH 2/6] refactored download and untar into function --- benchmark/job/cli.py | 51 +++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py index 3f6fd6d7..ce463747 100644 --- a/benchmark/job/cli.py +++ b/benchmark/job/cli.py @@ -164,27 +164,12 @@ def job_workload( def _download_job_data(dbgym_cfg: DBGymConfig) -> None: - expected_symlink_dpath = ( - dbgym_cfg.cur_symlinks_data_path(mkdir=True) - / f"{default_tables_dname(DEFAULT_SCALE_FACTOR)}.link" - ) - if expected_symlink_dpath.exists(): - logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Skipping download: {expected_symlink_dpath}" - ) - return - - logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloading: {expected_symlink_dpath}") - real_data_path = dbgym_cfg.cur_task_runs_data_path(mkdir=True) - subprocess_run(f"curl -O {JOB_TABLES_URL}", cwd=real_data_path) - job_data_dpath = dbgym_cfg.cur_task_runs_data_path( - default_tables_dname(DEFAULT_SCALE_FACTOR), mkdir=True + _download_and_untar_dir( + dbgym_cfg, + JOB_TABLES_URL, + "imdb.tgz", + default_tables_dname(DEFAULT_SCALE_FACTOR), ) - subprocess_run("tar -zxvf ../imdb.tgz", cwd=job_data_dpath) - subprocess_run(f"rm imdb.tgz", cwd=real_data_path) - symlink_dpath = link_result(dbgym_cfg, job_data_dpath) - assert expected_symlink_dpath.samefile(symlink_dpath) - logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloaded: {expected_symlink_dpath}") def _clone_job_queries(dbgym_cfg: DBGymConfig) -> None: @@ -207,6 +192,32 @@ def _clone_job_queries(dbgym_cfg: DBGymConfig) -> None: logging.getLogger(DBGYM_LOGGER_NAME).info(f"Cloned: {expected_symlink_dpath}") +def _download_and_untar_dir( + dbgym_cfg: DBGymConfig, + download_url: str, + download_tarred_fname: str, + untarred_dname: str, +) -> None: + expected_symlink_dpath = ( + dbgym_cfg.cur_symlinks_data_path(mkdir=True) / f"{untarred_dname}.link" + ) + if expected_symlink_dpath.exists(): + logging.getLogger(DBGYM_LOGGER_NAME).info( + f"Skipping download: {expected_symlink_dpath}" + ) + return + + logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloading: {expected_symlink_dpath}") + real_data_path = dbgym_cfg.cur_task_runs_data_path(mkdir=True) + subprocess_run(f"curl -O {download_url}", cwd=real_data_path) + untarred_data_dpath = dbgym_cfg.cur_task_runs_data_path(untarred_dname, mkdir=True) + subprocess_run(f"tar -zxvf ../{download_tarred_fname}", cwd=untarred_data_dpath) + subprocess_run(f"rm {download_tarred_fname}", cwd=real_data_path) + symlink_dpath = link_result(dbgym_cfg, untarred_data_dpath) + assert expected_symlink_dpath.samefile(symlink_dpath) + logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloaded: {expected_symlink_dpath}") + + def _generate_job_workload( dbgym_cfg: DBGymConfig, query_subset: str, From 731db402418df306b02fd006fc7c68a138ddefd1 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sat, 21 Dec 2024 20:46:07 -0500 Subject: [PATCH 3/6] now downloading job queries from https://event.cwi.nl/da/job/ --- benchmark/job/cli.py | 23 ++++------------------- benchmark/job/job_schema.sql | 4 ++-- 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py index ce463747..38d5f31f 100644 --- a/benchmark/job/cli.py +++ b/benchmark/job/cli.py @@ -13,6 +13,7 @@ ) JOB_TABLES_URL = "https://event.cwi.nl/da/job/imdb.tgz" +JOB_QUERIES_URL = "https://event.cwi.nl/da/job/job.tgz" JOB_QUERY_NAMES = [ "1a", "1b", @@ -159,7 +160,7 @@ def job_workload( dbgym_cfg: DBGymConfig, query_subset: str, scale_factor: float ) -> None: assert scale_factor == DEFAULT_SCALE_FACTOR - _clone_job_queries(dbgym_cfg) + _download_job_queries(dbgym_cfg) _generate_job_workload(dbgym_cfg, query_subset) @@ -172,24 +173,8 @@ def _download_job_data(dbgym_cfg: DBGymConfig) -> None: ) -def _clone_job_queries(dbgym_cfg: DBGymConfig) -> None: - expected_symlink_dpath = ( - dbgym_cfg.cur_symlinks_build_path(mkdir=True) / "job-queries.link" - ) - if expected_symlink_dpath.exists(): - logging.getLogger(DBGYM_LOGGER_NAME).info( - f"Skipping clone: {expected_symlink_dpath}" - ) - return - - logging.getLogger(DBGYM_LOGGER_NAME).info(f"Cloning: {expected_symlink_dpath}") - real_build_path = dbgym_cfg.cur_task_runs_build_path(mkdir=True) - subprocess_run( - f"./clone_job_queries.sh {real_build_path}", cwd=dbgym_cfg.cur_source_path() - ) - symlink_dpath = link_result(dbgym_cfg, real_build_path / "job-queries") - assert expected_symlink_dpath.samefile(symlink_dpath) - logging.getLogger(DBGYM_LOGGER_NAME).info(f"Cloned: {expected_symlink_dpath}") +def _download_job_queries(dbgym_cfg: DBGymConfig) -> None: + _download_and_untar_dir(dbgym_cfg, JOB_QUERIES_URL, "job.tgz", "job-queries") def _download_and_untar_dir( diff --git a/benchmark/job/job_schema.sql b/benchmark/job/job_schema.sql index 9b665278..64293af1 100644 --- a/benchmark/job/job_schema.sql +++ b/benchmark/job/job_schema.sql @@ -1,5 +1,5 @@ --- Copied over from https://github.com/wangpatrick57/job-queries/blob/master/schema.sql (from an older commit) --- We copied it over so that we have control over the schema, not job-queries. +-- Copied over from https://event.cwi.nl/da/job/job.tgz. +-- We copied it over so that we have control over the schema. CREATE TABLE aka_name ( id integer NOT NULL PRIMARY KEY, person_id integer NOT NULL, From e649d23c0699825384d4d6c53ba941611478cab5 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 22 Dec 2024 14:47:27 -0500 Subject: [PATCH 4/6] now getting queries from data instead of build --- benchmark/job/cli.py | 6 ++++-- scripts/run_protox_e2e_test.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py index 38d5f31f..37e38058 100644 --- a/benchmark/job/cli.py +++ b/benchmark/job/cli.py @@ -129,6 +129,7 @@ "33b", "33c", ] +JOB_QUERIES_DNAME = "job-queries" @click.group(name="job") @@ -174,7 +175,7 @@ def _download_job_data(dbgym_cfg: DBGymConfig) -> None: def _download_job_queries(dbgym_cfg: DBGymConfig) -> None: - _download_and_untar_dir(dbgym_cfg, JOB_QUERIES_URL, "job.tgz", "job-queries") + _download_and_untar_dir(dbgym_cfg, JOB_QUERIES_URL, "job.tgz", JOB_QUERIES_DNAME) def _download_and_untar_dir( @@ -230,7 +231,8 @@ def _generate_job_workload( with open(real_dpath / "order.txt", "w") as f: for qname in query_names: sql_fpath = ( - dbgym_cfg.cur_symlinks_build_path(mkdir=True) / ("job-queries.link") + dbgym_cfg.cur_symlinks_data_path(mkdir=True) + / (f"{JOB_QUERIES_DNAME}.link") ).resolve() / f"{qname}.sql" assert ( sql_fpath.exists() diff --git a/scripts/run_protox_e2e_test.py b/scripts/run_protox_e2e_test.py index f86af1e5..7d25374c 100644 --- a/scripts/run_protox_e2e_test.py +++ b/scripts/run_protox_e2e_test.py @@ -200,5 +200,5 @@ def run_e2e_for_benchmark(benchmark_name: str, intended_dbdata_hardware: str) -> # Set the config file so that we use resources that don't conflict with normal usage (e.g. a different workspace, different ports, etc.). os.environ["DBGYM_CONFIG_PATH"] = str(E2ETEST_DBGYM_CONFIG_FPATH) - # run_e2e_for_benchmark("tpch", intended_dbdata_hardware) + run_e2e_for_benchmark("tpch", intended_dbdata_hardware) run_e2e_for_benchmark("job", intended_dbdata_hardware) From 041feb1b06ada9f1b619a600e29d2cc689d030ab Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 22 Dec 2024 19:32:02 -0500 Subject: [PATCH 5/6] added untarred_original_dname option to _download_and_untar_dir --- benchmark/job/cli.py | 32 +++++++++++++++++++++++++++++--- scripts/run_protox_e2e_test.py | 2 +- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py index 37e38058..a3ef2bdd 100644 --- a/benchmark/job/cli.py +++ b/benchmark/job/cli.py @@ -1,4 +1,5 @@ import logging +from typing import Optional import click @@ -175,7 +176,13 @@ def _download_job_data(dbgym_cfg: DBGymConfig) -> None: def _download_job_queries(dbgym_cfg: DBGymConfig) -> None: - _download_and_untar_dir(dbgym_cfg, JOB_QUERIES_URL, "job.tgz", JOB_QUERIES_DNAME) + _download_and_untar_dir( + dbgym_cfg, + JOB_QUERIES_URL, + "job.tgz", + JOB_QUERIES_DNAME, + untarred_original_dname="job", + ) def _download_and_untar_dir( @@ -183,7 +190,14 @@ def _download_and_untar_dir( download_url: str, download_tarred_fname: str, untarred_dname: str, + untarred_original_dname: Optional[str] = None, ) -> None: + """ + Some .tgz files are built from a directory while others are built from the contents of + the directory. If the .tgz file we're untarring is built from a directory, it will have + an "original" directory name. If this is the case, you should set + `untarred_original_dname` to ensure that it gets renamed to `untarred_dname`. + """ expected_symlink_dpath = ( dbgym_cfg.cur_symlinks_data_path(mkdir=True) / f"{untarred_dname}.link" ) @@ -196,8 +210,20 @@ def _download_and_untar_dir( logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloading: {expected_symlink_dpath}") real_data_path = dbgym_cfg.cur_task_runs_data_path(mkdir=True) subprocess_run(f"curl -O {download_url}", cwd=real_data_path) - untarred_data_dpath = dbgym_cfg.cur_task_runs_data_path(untarred_dname, mkdir=True) - subprocess_run(f"tar -zxvf ../{download_tarred_fname}", cwd=untarred_data_dpath) + untarred_data_dpath = dbgym_cfg.cur_task_runs_data_path(untarred_dname) + + if untarred_original_dname is not None: + assert not untarred_data_dpath.exists() + subprocess_run(f"tar -zxvf {download_tarred_fname}", cwd=real_data_path) + assert (real_data_path / untarred_original_dname).exists() + subprocess_run( + f"mv {untarred_original_dname} {untarred_dname}", cwd=real_data_path + ) + else: + untarred_data_dpath.mkdir(parents=True, exist_ok=False) + subprocess_run(f"tar -zxvf ../{download_tarred_fname}", cwd=untarred_data_dpath) + + assert untarred_data_dpath.exists() subprocess_run(f"rm {download_tarred_fname}", cwd=real_data_path) symlink_dpath = link_result(dbgym_cfg, untarred_data_dpath) assert expected_symlink_dpath.samefile(symlink_dpath) diff --git a/scripts/run_protox_e2e_test.py b/scripts/run_protox_e2e_test.py index 7d25374c..262fc4ef 100644 --- a/scripts/run_protox_e2e_test.py +++ b/scripts/run_protox_e2e_test.py @@ -200,5 +200,5 @@ def run_e2e_for_benchmark(benchmark_name: str, intended_dbdata_hardware: str) -> # Set the config file so that we use resources that don't conflict with normal usage (e.g. a different workspace, different ports, etc.). os.environ["DBGYM_CONFIG_PATH"] = str(E2ETEST_DBGYM_CONFIG_FPATH) - run_e2e_for_benchmark("tpch", intended_dbdata_hardware) + # run_e2e_for_benchmark("tpch", intended_dbdata_hardware) # TODO: Uncomment this run_e2e_for_benchmark("job", intended_dbdata_hardware) From 5abd53b5a188dc3a32a754f3eaa0e53dd74cc607 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 22 Dec 2024 19:36:02 -0500 Subject: [PATCH 6/6] uncommented tpch e2e --- scripts/run_protox_e2e_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/run_protox_e2e_test.py b/scripts/run_protox_e2e_test.py index 262fc4ef..7d25374c 100644 --- a/scripts/run_protox_e2e_test.py +++ b/scripts/run_protox_e2e_test.py @@ -200,5 +200,5 @@ def run_e2e_for_benchmark(benchmark_name: str, intended_dbdata_hardware: str) -> # Set the config file so that we use resources that don't conflict with normal usage (e.g. a different workspace, different ports, etc.). os.environ["DBGYM_CONFIG_PATH"] = str(E2ETEST_DBGYM_CONFIG_FPATH) - # run_e2e_for_benchmark("tpch", intended_dbdata_hardware) # TODO: Uncomment this + run_e2e_for_benchmark("tpch", intended_dbdata_hardware) run_e2e_for_benchmark("job", intended_dbdata_hardware)