Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated JOB to CWI's new link #50

Merged
merged 6 commits into from
Dec 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 54 additions & 32 deletions benchmark/job/cli.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
from typing import Optional

import click

Expand All @@ -12,8 +13,8 @@
link_result,
)

# JOB_TABLES_URL = "https://homepages.cwi.nl/~boncz/job/imdb.tgz" # This link stopped working for me
JOB_TABLES_URL = "https://drive.google.com/uc?id=19m0zDpphAw0Bu9Irr_ta9EGr5k85hiN1"
JOB_TABLES_URL = "https://event.cwi.nl/da/job/imdb.tgz"
JOB_QUERIES_URL = "https://event.cwi.nl/da/job/job.tgz"
JOB_QUERY_NAMES = [
"1a",
"1b",
Expand Down Expand Up @@ -129,6 +130,7 @@
"33b",
"33c",
]
JOB_QUERIES_DNAME = "job-queries"


@click.group(name="job")
Expand Down Expand Up @@ -160,14 +162,44 @@ def job_workload(
dbgym_cfg: DBGymConfig, query_subset: str, scale_factor: float
) -> None:
assert scale_factor == DEFAULT_SCALE_FACTOR
_clone_job_queries(dbgym_cfg)
_download_job_queries(dbgym_cfg)
_generate_job_workload(dbgym_cfg, query_subset)


def _download_job_data(dbgym_cfg: DBGymConfig) -> None:
    """Download and untar the IMDB table dump used by the JOB benchmark.

    Delegates to `_download_and_untar_dir`; the tarball's contents are placed
    under the default tables directory for the default scale factor.
    """
    _download_and_untar_dir(
        dbgym_cfg,
        download_url=JOB_TABLES_URL,
        download_tarred_fname="imdb.tgz",
        untarred_dname=default_tables_dname(DEFAULT_SCALE_FACTOR),
    )


def _download_job_queries(dbgym_cfg: DBGymConfig) -> None:
    """Download and untar the JOB query set.

    The job.tgz archive untars into a directory named "job", so we pass
    `untarred_original_dname` to have it renamed to `JOB_QUERIES_DNAME`.
    """
    _download_and_untar_dir(
        dbgym_cfg,
        download_url=JOB_QUERIES_URL,
        download_tarred_fname="job.tgz",
        untarred_dname=JOB_QUERIES_DNAME,
        untarred_original_dname="job",
    )


def _download_and_untar_dir(
    dbgym_cfg: DBGymConfig,
    download_url: str,
    download_tarred_fname: str,
    untarred_dname: str,
    untarred_original_dname: Optional[str] = None,
) -> None:
    """Download a .tgz archive, untar it into the task-run data dir, and symlink it.

    Some .tgz files are built from a directory while others are built from the
    contents of the directory. If the .tgz file we're untarring is built from a
    directory, it will have an "original" directory name. If this is the case,
    you should set `untarred_original_dname` to ensure that it gets renamed to
    `untarred_dname`.

    The function is idempotent: if the expected symlink already exists, the
    download is skipped entirely.
    """
    expected_symlink_dpath = (
        dbgym_cfg.cur_symlinks_data_path(mkdir=True) / f"{untarred_dname}.link"
    )
    if expected_symlink_dpath.exists():
        # Already downloaded on a previous run; nothing to do.
        # NOTE(review): the exact skip message was elided in the diff view —
        # confirm against the merged file.
        logging.getLogger(DBGYM_LOGGER_NAME).info(
            f"Skipping download: {expected_symlink_dpath}"
        )
        return

    logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloading: {expected_symlink_dpath}")
    real_data_path = dbgym_cfg.cur_task_runs_data_path(mkdir=True)
    subprocess_run(f"curl -O {download_url}", cwd=real_data_path)
    untarred_data_dpath = dbgym_cfg.cur_task_runs_data_path(untarred_dname)

    if untarred_original_dname is not None:
        # The archive contains a top-level directory; untar in place and rename
        # that directory to the canonical name.
        assert not untarred_data_dpath.exists()
        subprocess_run(f"tar -zxvf {download_tarred_fname}", cwd=real_data_path)
        assert (real_data_path / untarred_original_dname).exists()
        subprocess_run(
            f"mv {untarred_original_dname} {untarred_dname}", cwd=real_data_path
        )
    else:
        # The archive contains bare files; create the target directory and
        # untar directly into it.
        untarred_data_dpath.mkdir(parents=True, exist_ok=False)
        subprocess_run(f"tar -zxvf ../{download_tarred_fname}", cwd=untarred_data_dpath)

    assert untarred_data_dpath.exists()
    # Remove the tarball now that its contents are extracted.
    subprocess_run(f"rm {download_tarred_fname}", cwd=real_data_path)
    symlink_dpath = link_result(dbgym_cfg, untarred_data_dpath)
    assert expected_symlink_dpath.samefile(symlink_dpath)
    logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloaded: {expected_symlink_dpath}")


def _generate_job_workload(
Expand Down Expand Up @@ -236,7 +257,8 @@ def _generate_job_workload(
with open(real_dpath / "order.txt", "w") as f:
for qname in query_names:
sql_fpath = (
dbgym_cfg.cur_symlinks_build_path(mkdir=True) / ("job-queries.link")
dbgym_cfg.cur_symlinks_data_path(mkdir=True)
/ (f"{JOB_QUERIES_DNAME}.link")
).resolve() / f"{qname}.sql"
assert (
sql_fpath.exists()
Expand Down
4 changes: 2 additions & 2 deletions benchmark/job/job_schema.sql
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-- Copied over from https://github.com/wangpatrick57/job-queries/blob/master/schema.sql (from an older commit)
-- We copied it over so that we have control over the schema, not job-queries.
-- Copied over from https://event.cwi.nl/da/job/job.tgz.
-- We copied it over so that we have control over the schema.
CREATE TABLE aka_name (
id integer NOT NULL PRIMARY KEY,
person_id integer NOT NULL,
Expand Down
1 change: 0 additions & 1 deletion dependencies/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -135,4 +135,3 @@ Werkzeug==3.0.1
wrapt==1.14.1
zipp==3.17.0
streamlit==1.39.0
gdown==5.2.0
Loading