From a6c60615abd97450f1c4d942a91e1dd8ae23e717 Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sat, 21 Dec 2024 20:34:06 -0500
Subject: [PATCH 1/6] Switch JOB tables URL to event.cwi.nl and drop gdown (verified manually)

---
 benchmark/job/cli.py           | 6 ++----
 dependencies/requirements.txt  | 1 -
 scripts/run_protox_e2e_test.py | 2 +-
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py
index be2feb01..3f6fd6d7 100644
--- a/benchmark/job/cli.py
+++ b/benchmark/job/cli.py
@@ -12,8 +12,7 @@
     link_result,
 )
 
-# JOB_TABLES_URL = "https://homepages.cwi.nl/~boncz/job/imdb.tgz" # This link stopped working for me
-JOB_TABLES_URL = "https://drive.google.com/uc?id=19m0zDpphAw0Bu9Irr_ta9EGr5k85hiN1"
+JOB_TABLES_URL = "https://event.cwi.nl/da/job/imdb.tgz"
 JOB_QUERY_NAMES = [
     "1a",
     "1b",
@@ -177,8 +176,7 @@ def _download_job_data(dbgym_cfg: DBGymConfig) -> None:
 
     logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloading: {expected_symlink_dpath}")
     real_data_path = dbgym_cfg.cur_task_runs_data_path(mkdir=True)
-    # subprocess_run(f"curl -O {JOB_TABLES_URL}", cwd=real_data_path) # This is if we're using a non-Google-Drive link
-    subprocess_run(f"gdown {JOB_TABLES_URL}", cwd=real_data_path)
+    subprocess_run(f"curl -O {JOB_TABLES_URL}", cwd=real_data_path)
     job_data_dpath = dbgym_cfg.cur_task_runs_data_path(
         default_tables_dname(DEFAULT_SCALE_FACTOR), mkdir=True
     )
diff --git a/dependencies/requirements.txt b/dependencies/requirements.txt
index e1252f80..6c0cb4b7 100644
--- a/dependencies/requirements.txt
+++ b/dependencies/requirements.txt
@@ -135,4 +135,3 @@ Werkzeug==3.0.1
 wrapt==1.14.1
 zipp==3.17.0
 streamlit==1.39.0
-gdown==5.2.0
\ No newline at end of file
diff --git a/scripts/run_protox_e2e_test.py b/scripts/run_protox_e2e_test.py
index 7d25374c..f86af1e5 100644
--- a/scripts/run_protox_e2e_test.py
+++ b/scripts/run_protox_e2e_test.py
@@ -200,5 +200,5 @@ def run_e2e_for_benchmark(benchmark_name: str, intended_dbdata_hardware: str) ->
     # Set the config file so that we use resources that don't conflict with normal usage (e.g. a different workspace, different ports, etc.).
     os.environ["DBGYM_CONFIG_PATH"] = str(E2ETEST_DBGYM_CONFIG_FPATH)
 
-    run_e2e_for_benchmark("tpch", intended_dbdata_hardware)
+    # run_e2e_for_benchmark("tpch", intended_dbdata_hardware)
     run_e2e_for_benchmark("job", intended_dbdata_hardware)

From 86731552e350b40a5e7110af03ee53d46579c8c4 Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sat, 21 Dec 2024 20:43:52 -0500
Subject: [PATCH 2/6] refactored download and untar into function

---
 benchmark/job/cli.py | 51 +++++++++++++++++++++++++++-----------------
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py
index 3f6fd6d7..ce463747 100644
--- a/benchmark/job/cli.py
+++ b/benchmark/job/cli.py
@@ -164,27 +164,12 @@ def job_workload(
 
 
 def _download_job_data(dbgym_cfg: DBGymConfig) -> None:
-    expected_symlink_dpath = (
-        dbgym_cfg.cur_symlinks_data_path(mkdir=True)
-        / f"{default_tables_dname(DEFAULT_SCALE_FACTOR)}.link"
-    )
-    if expected_symlink_dpath.exists():
-        logging.getLogger(DBGYM_LOGGER_NAME).info(
-            f"Skipping download: {expected_symlink_dpath}"
-        )
-        return
-
-    logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloading: {expected_symlink_dpath}")
-    real_data_path = dbgym_cfg.cur_task_runs_data_path(mkdir=True)
-    subprocess_run(f"curl -O {JOB_TABLES_URL}", cwd=real_data_path)
-    job_data_dpath = dbgym_cfg.cur_task_runs_data_path(
-        default_tables_dname(DEFAULT_SCALE_FACTOR), mkdir=True
+    _download_and_untar_dir(
+        dbgym_cfg,
+        JOB_TABLES_URL,
+        "imdb.tgz",
+        default_tables_dname(DEFAULT_SCALE_FACTOR),
     )
-    subprocess_run("tar -zxvf ../imdb.tgz", cwd=job_data_dpath)
-    subprocess_run(f"rm imdb.tgz", cwd=real_data_path)
-    symlink_dpath = link_result(dbgym_cfg, job_data_dpath)
-    assert expected_symlink_dpath.samefile(symlink_dpath)
-    logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloaded: {expected_symlink_dpath}")
 
 
 def _clone_job_queries(dbgym_cfg: DBGymConfig) -> None:
@@ -207,6 +192,32 @@ def _clone_job_queries(dbgym_cfg: DBGymConfig) -> None:
     logging.getLogger(DBGYM_LOGGER_NAME).info(f"Cloned: {expected_symlink_dpath}")
 
 
+def _download_and_untar_dir(
+    dbgym_cfg: DBGymConfig,
+    download_url: str,
+    download_tarred_fname: str,
+    untarred_dname: str,
+) -> None:
+    expected_symlink_dpath = (
+        dbgym_cfg.cur_symlinks_data_path(mkdir=True) / f"{untarred_dname}.link"
+    )
+    if expected_symlink_dpath.exists():
+        logging.getLogger(DBGYM_LOGGER_NAME).info(
+            f"Skipping download: {expected_symlink_dpath}"
+        )
+        return
+
+    logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloading: {expected_symlink_dpath}")
+    real_data_path = dbgym_cfg.cur_task_runs_data_path(mkdir=True)
+    subprocess_run(f"curl -O {download_url}", cwd=real_data_path)
+    untarred_data_dpath = dbgym_cfg.cur_task_runs_data_path(untarred_dname, mkdir=True)
+    subprocess_run(f"tar -zxvf ../{download_tarred_fname}", cwd=untarred_data_dpath)
+    subprocess_run(f"rm {download_tarred_fname}", cwd=real_data_path)
+    symlink_dpath = link_result(dbgym_cfg, untarred_data_dpath)
+    assert expected_symlink_dpath.samefile(symlink_dpath)
+    logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloaded: {expected_symlink_dpath}")
+
+
 def _generate_job_workload(
     dbgym_cfg: DBGymConfig,
     query_subset: str,

From 731db402418df306b02fd006fc7c68a138ddefd1 Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sat, 21 Dec 2024 20:46:07 -0500
Subject: [PATCH 3/6] now downloading job queries from
 https://event.cwi.nl/da/job/

---
 benchmark/job/cli.py         | 23 ++++-------------------
 benchmark/job/job_schema.sql |  4 ++--
 2 files changed, 6 insertions(+), 21 deletions(-)

diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py
index ce463747..38d5f31f 100644
--- a/benchmark/job/cli.py
+++ b/benchmark/job/cli.py
@@ -13,6 +13,7 @@
 )
 
 JOB_TABLES_URL = "https://event.cwi.nl/da/job/imdb.tgz"
+JOB_QUERIES_URL = "https://event.cwi.nl/da/job/job.tgz"
 JOB_QUERY_NAMES = [
     "1a",
     "1b",
@@ -159,7 +160,7 @@ def job_workload(
     dbgym_cfg: DBGymConfig, query_subset: str, scale_factor: float
 ) -> None:
     assert scale_factor == DEFAULT_SCALE_FACTOR
-    _clone_job_queries(dbgym_cfg)
+    _download_job_queries(dbgym_cfg)
     _generate_job_workload(dbgym_cfg, query_subset)
 
 
@@ -172,24 +173,8 @@ def _download_job_data(dbgym_cfg: DBGymConfig) -> None:
     )
 
 
-def _clone_job_queries(dbgym_cfg: DBGymConfig) -> None:
-    expected_symlink_dpath = (
-        dbgym_cfg.cur_symlinks_build_path(mkdir=True) / "job-queries.link"
-    )
-    if expected_symlink_dpath.exists():
-        logging.getLogger(DBGYM_LOGGER_NAME).info(
-            f"Skipping clone: {expected_symlink_dpath}"
-        )
-        return
-
-    logging.getLogger(DBGYM_LOGGER_NAME).info(f"Cloning: {expected_symlink_dpath}")
-    real_build_path = dbgym_cfg.cur_task_runs_build_path(mkdir=True)
-    subprocess_run(
-        f"./clone_job_queries.sh {real_build_path}", cwd=dbgym_cfg.cur_source_path()
-    )
-    symlink_dpath = link_result(dbgym_cfg, real_build_path / "job-queries")
-    assert expected_symlink_dpath.samefile(symlink_dpath)
-    logging.getLogger(DBGYM_LOGGER_NAME).info(f"Cloned: {expected_symlink_dpath}")
+def _download_job_queries(dbgym_cfg: DBGymConfig) -> None:
+    _download_and_untar_dir(dbgym_cfg, JOB_QUERIES_URL, "job.tgz", "job-queries")
 
 
 def _download_and_untar_dir(
diff --git a/benchmark/job/job_schema.sql b/benchmark/job/job_schema.sql
index 9b665278..64293af1 100644
--- a/benchmark/job/job_schema.sql
+++ b/benchmark/job/job_schema.sql
@@ -1,5 +1,5 @@
--- Copied over from https://github.com/wangpatrick57/job-queries/blob/master/schema.sql (from an older commit)
--- We copied it over so that we have control over the schema, not job-queries.
+-- Copied over from https://event.cwi.nl/da/job/job.tgz.
+-- We copied it over so that we have control over the schema.
 CREATE TABLE aka_name (
     id integer NOT NULL PRIMARY KEY,
     person_id integer NOT NULL,

From e649d23c0699825384d4d6c53ba941611478cab5 Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sun, 22 Dec 2024 14:47:27 -0500
Subject: [PATCH 4/6] Read JOB queries from the data symlink dir instead of build

---
 benchmark/job/cli.py           | 6 ++++--
 scripts/run_protox_e2e_test.py | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py
index 38d5f31f..37e38058 100644
--- a/benchmark/job/cli.py
+++ b/benchmark/job/cli.py
@@ -129,6 +129,7 @@
     "33b",
     "33c",
 ]
+JOB_QUERIES_DNAME = "job-queries"
 
 
 @click.group(name="job")
@@ -174,7 +175,7 @@ def _download_job_data(dbgym_cfg: DBGymConfig) -> None:
 
 
 def _download_job_queries(dbgym_cfg: DBGymConfig) -> None:
-    _download_and_untar_dir(dbgym_cfg, JOB_QUERIES_URL, "job.tgz", "job-queries")
+    _download_and_untar_dir(dbgym_cfg, JOB_QUERIES_URL, "job.tgz", JOB_QUERIES_DNAME)
 
 
 def _download_and_untar_dir(
@@ -230,7 +231,8 @@ def _generate_job_workload(
     with open(real_dpath / "order.txt", "w") as f:
         for qname in query_names:
             sql_fpath = (
-                dbgym_cfg.cur_symlinks_build_path(mkdir=True) / ("job-queries.link")
+                dbgym_cfg.cur_symlinks_data_path(mkdir=True)
+                / (f"{JOB_QUERIES_DNAME}.link")
             ).resolve() / f"{qname}.sql"
             assert (
                 sql_fpath.exists()
diff --git a/scripts/run_protox_e2e_test.py b/scripts/run_protox_e2e_test.py
index f86af1e5..7d25374c 100644
--- a/scripts/run_protox_e2e_test.py
+++ b/scripts/run_protox_e2e_test.py
@@ -200,5 +200,5 @@ def run_e2e_for_benchmark(benchmark_name: str, intended_dbdata_hardware: str) ->
     # Set the config file so that we use resources that don't conflict with normal usage (e.g. a different workspace, different ports, etc.).
     os.environ["DBGYM_CONFIG_PATH"] = str(E2ETEST_DBGYM_CONFIG_FPATH)
 
-    # run_e2e_for_benchmark("tpch", intended_dbdata_hardware)
+    run_e2e_for_benchmark("tpch", intended_dbdata_hardware)
     run_e2e_for_benchmark("job", intended_dbdata_hardware)

From 041feb1b06ada9f1b619a600e29d2cc689d030ab Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sun, 22 Dec 2024 19:32:02 -0500
Subject: [PATCH 5/6] added untarred_original_dname option to
 _download_and_untar_dir

---
 benchmark/job/cli.py           | 32 +++++++++++++++++++++++++++++---
 scripts/run_protox_e2e_test.py |  2 +-
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py
index 37e38058..a3ef2bdd 100644
--- a/benchmark/job/cli.py
+++ b/benchmark/job/cli.py
@@ -1,4 +1,5 @@
 import logging
+from typing import Optional
 
 import click
 
@@ -175,7 +176,13 @@ def _download_job_data(dbgym_cfg: DBGymConfig) -> None:
 
 
 def _download_job_queries(dbgym_cfg: DBGymConfig) -> None:
-    _download_and_untar_dir(dbgym_cfg, JOB_QUERIES_URL, "job.tgz", JOB_QUERIES_DNAME)
+    _download_and_untar_dir(
+        dbgym_cfg,
+        JOB_QUERIES_URL,
+        "job.tgz",
+        JOB_QUERIES_DNAME,
+        untarred_original_dname="job",
+    )
 
 
 def _download_and_untar_dir(
@@ -183,7 +190,14 @@ def _download_and_untar_dir(
     download_url: str,
     download_tarred_fname: str,
     untarred_dname: str,
+    untarred_original_dname: Optional[str] = None,
 ) -> None:
+    """
+    Some .tgz archives wrap their files in a single top-level directory, while others
+    hold the files directly at the archive root. If the archive we're untarring has such
+    a top-level ("original") directory, set `untarred_original_dname` to its name so
+    that it gets renamed to `untarred_dname` after extraction.
+    """
     expected_symlink_dpath = (
         dbgym_cfg.cur_symlinks_data_path(mkdir=True) / f"{untarred_dname}.link"
     )
@@ -196,8 +210,20 @@ def _download_and_untar_dir(
     logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloading: {expected_symlink_dpath}")
     real_data_path = dbgym_cfg.cur_task_runs_data_path(mkdir=True)
     subprocess_run(f"curl -O {download_url}", cwd=real_data_path)
-    untarred_data_dpath = dbgym_cfg.cur_task_runs_data_path(untarred_dname, mkdir=True)
-    subprocess_run(f"tar -zxvf ../{download_tarred_fname}", cwd=untarred_data_dpath)
+    untarred_data_dpath = dbgym_cfg.cur_task_runs_data_path(untarred_dname)
+
+    if untarred_original_dname is not None:
+        assert not untarred_data_dpath.exists()
+        subprocess_run(f"tar -zxvf {download_tarred_fname}", cwd=real_data_path)
+        assert (real_data_path / untarred_original_dname).exists()
+        subprocess_run(
+            f"mv {untarred_original_dname} {untarred_dname}", cwd=real_data_path
+        )
+    else:
+        untarred_data_dpath.mkdir(parents=True, exist_ok=False)
+        subprocess_run(f"tar -zxvf ../{download_tarred_fname}", cwd=untarred_data_dpath)
+
+    assert untarred_data_dpath.exists()
     subprocess_run(f"rm {download_tarred_fname}", cwd=real_data_path)
     symlink_dpath = link_result(dbgym_cfg, untarred_data_dpath)
     assert expected_symlink_dpath.samefile(symlink_dpath)
diff --git a/scripts/run_protox_e2e_test.py b/scripts/run_protox_e2e_test.py
index 7d25374c..262fc4ef 100644
--- a/scripts/run_protox_e2e_test.py
+++ b/scripts/run_protox_e2e_test.py
@@ -200,5 +200,5 @@ def run_e2e_for_benchmark(benchmark_name: str, intended_dbdata_hardware: str) ->
     # Set the config file so that we use resources that don't conflict with normal usage (e.g. a different workspace, different ports, etc.).
     os.environ["DBGYM_CONFIG_PATH"] = str(E2ETEST_DBGYM_CONFIG_FPATH)
 
-    run_e2e_for_benchmark("tpch", intended_dbdata_hardware)
+    # run_e2e_for_benchmark("tpch", intended_dbdata_hardware) # TODO: Uncomment this
     run_e2e_for_benchmark("job", intended_dbdata_hardware)

From 5abd53b5a188dc3a32a754f3eaa0e53dd74cc607 Mon Sep 17 00:00:00 2001
From: Patrick Wang <wang.patrick57@gmail.com>
Date: Sun, 22 Dec 2024 19:36:02 -0500
Subject: [PATCH 6/6] uncommented tpch e2e

---
 scripts/run_protox_e2e_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/run_protox_e2e_test.py b/scripts/run_protox_e2e_test.py
index 262fc4ef..7d25374c 100644
--- a/scripts/run_protox_e2e_test.py
+++ b/scripts/run_protox_e2e_test.py
@@ -200,5 +200,5 @@ def run_e2e_for_benchmark(benchmark_name: str, intended_dbdata_hardware: str) ->
     # Set the config file so that we use resources that don't conflict with normal usage (e.g. a different workspace, different ports, etc.).
     os.environ["DBGYM_CONFIG_PATH"] = str(E2ETEST_DBGYM_CONFIG_FPATH)
 
-    # run_e2e_for_benchmark("tpch", intended_dbdata_hardware) # TODO: Uncomment this
+    run_e2e_for_benchmark("tpch", intended_dbdata_hardware)
     run_e2e_for_benchmark("job", intended_dbdata_hardware)