Skip to content

Commit

Permalink
refactored everything to use linkname helpers
Browse files Browse the repository at this point in the history
  • Loading branch information
wangpatrick57 committed Dec 30, 2024
1 parent f5c89e7 commit e6ed566
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 20 deletions.
5 changes: 3 additions & 2 deletions benchmark/job/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
get_tables_dirname,
get_workload_dirname,
get_workload_suffix,
name_to_linkname,
)

from benchmark.constants import DEFAULT_SCALE_FACTOR
Expand Down Expand Up @@ -255,7 +256,7 @@ def _generate_job_workload(
get_workload_suffix("job", query_subset=query_subset),
)
expected_workload_symlink_path = dbgym_workspace.dbgym_cur_symlinks_path / (
workload_name + ".link"
name_to_linkname(workload_name)
)
if expected_workload_symlink_path.exists():
logging.getLogger(DBGYM_LOGGER_NAME).info(
Expand All @@ -281,7 +282,7 @@ def _generate_job_workload(

with open(workload_path / "order.txt", "w") as f:
queries_parent_path = dbgym_workspace.dbgym_cur_symlinks_path / (
JOB_QUERIES_DNAME + ".link"
name_to_linkname(JOB_QUERIES_DNAME)
)

for qname in query_names:
Expand Down
11 changes: 6 additions & 5 deletions benchmark/tpch/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
get_workload_suffix,
get_workload_symlink_path,
linkname_to_name,
name_to_linkname,
)

from benchmark.constants import DEFAULT_SCALE_FACTOR
Expand Down Expand Up @@ -98,7 +99,7 @@ def _get_queries_dirname(seed: int, scale_factor: float) -> str:

def _clone_tpch_kit(dbgym_workspace: DBGymWorkspace) -> None:
expected_symlink_path = dbgym_workspace.dbgym_cur_symlinks_path / (
TPCH_KIT_DIRNAME + ".link"
name_to_linkname(TPCH_KIT_DIRNAME)
)
if expected_symlink_path.exists():
logging.getLogger(DBGYM_LOGGER_NAME).info(
Expand All @@ -122,14 +123,14 @@ def _generate_tpch_queries(
dbgym_workspace: DBGymWorkspace, seed_start: int, seed_end: int, scale_factor: float
) -> None:
tpch_kit_path = dbgym_workspace.dbgym_cur_symlinks_path / (
TPCH_KIT_DIRNAME + ".link"
name_to_linkname(TPCH_KIT_DIRNAME)
)
logging.getLogger(DBGYM_LOGGER_NAME).info(
f"Generating queries: [{seed_start}, {seed_end}]"
)
for seed in range(seed_start, seed_end + 1):
expected_queries_symlink_path = dbgym_workspace.dbgym_cur_symlinks_path / (
_get_queries_dirname(seed, scale_factor) + ".link"
name_to_linkname(_get_queries_dirname(seed, scale_factor))
)
if expected_queries_symlink_path.exists():
continue
Expand All @@ -155,7 +156,7 @@ def _generate_tpch_queries(

def _generate_tpch_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None:
tpch_kit_path = dbgym_workspace.dbgym_cur_symlinks_path / (
TPCH_KIT_DIRNAME + ".link"
name_to_linkname(TPCH_KIT_DIRNAME)
)
expected_tables_symlink_path = get_tables_symlink_path(
dbgym_workspace.dbgym_workspace_path, "tpch", scale_factor
Expand Down Expand Up @@ -225,7 +226,7 @@ def _generate_tpch_workload(
with open(workload_path / "order.txt", "w") as f:
for seed in range(seed_start, seed_end + 1):
queries_parent_path = dbgym_workspace.dbgym_cur_symlinks_path / (
_get_queries_dirname(seed, scale_factor) + ".link"
name_to_linkname(_get_queries_dirname(seed, scale_factor))
)

for qname in query_names:
Expand Down
16 changes: 10 additions & 6 deletions gymlib_package/gymlib/symlinks_paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def get_tables_symlink_path(
workspace_path
/ SYMLINKS_DNAME
/ DBGYM_APP_NAME
/ (get_tables_dirname(benchmark, scale_factor) + ".link")
/ name_to_linkname(get_tables_dirname(benchmark, scale_factor))
)


Expand All @@ -61,7 +61,7 @@ def get_workload_symlink_path(
workspace_path
/ SYMLINKS_DNAME
/ DBGYM_APP_NAME
/ (get_workload_dirname(benchmark, scale_factor, suffix) + ".link")
/ name_to_linkname(get_workload_dirname(benchmark, scale_factor, suffix))
)


Expand All @@ -80,16 +80,20 @@ def get_dbdata_tgz_symlink_path(
workspace_path
/ SYMLINKS_DNAME
/ DBGYM_APP_NAME
/ (get_dbdata_tgz_filename(benchmark_name, scale_factor) + ".link")
/ name_to_linkname(get_dbdata_tgz_filename(benchmark_name, scale_factor))
)


# TODO: refactor stuff to use this
def is_linkname(name: str) -> bool:
assert not name.endswith(".link.link")
return name.endswith(".link")


def name_to_linkname(name: str) -> str:
assert not name.endswith(".link")
assert not is_linkname(name)
return f"{name}.link"


def linkname_to_name(linkname: str) -> str:
assert linkname.endswith(".link")
assert is_linkname(linkname)
return linkname[: -len(".link")]
17 changes: 10 additions & 7 deletions util/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from typing import IO, Any, Optional

import yaml
from gymlib.symlinks_paths import is_linkname, name_to_linkname

from util.log import DBGYM_LOGGER_NAME

Expand Down Expand Up @@ -143,9 +144,9 @@ def link_result(
link_name = custom_link_name
else:
if os.path.isfile(result_path):
link_name = basename_of_path(result_path) + ".link"
link_name = name_to_linkname(basename_of_path(result_path))
elif os.path.isdir(result_path):
link_name = basename_of_path(result_path) + ".link"
link_name = name_to_linkname(basename_of_path(result_path))
else:
raise AssertionError("result_path must be either a file or dir")

Expand All @@ -156,8 +157,8 @@ def link_result(
# Note that in a multi-threaded setting, this might remove one created by a process in the same run,
# meaning it's not "old" by our definition of "old". However, we'll always end up with a symlink
# file of the current run regardless of the order of threads.
assert link_name.endswith(".link") and not link_name.endswith(
".link.link"
assert is_linkname(
link_name
), f'link_name ({link_name}) should end with ".link"'
symlink_path = symlink_parent_path / link_name
try_remove_file(symlink_path)
Expand Down Expand Up @@ -210,7 +211,7 @@ def save_file(self, path: Path) -> None:
parent_path = parent_path_of_path(path)
if parent_path.samefile(run_path):
fname = basename_of_path(path)
symlink_path = self.dbgym_this_run_path / (fname + ".link")
symlink_path = self.dbgym_this_run_path / name_to_linkname(fname)
try_remove_file(symlink_path)
try_create_symlink(path, symlink_path)
# Otherwise, we know the path file is _not_ directly inside run_path dir.
Expand All @@ -225,7 +226,9 @@ def save_file(self, path: Path) -> None:

# Create symlink
open_base_dname = basename_of_path(base_path)
symlink_path = self.dbgym_this_run_path / (open_base_dname + ".link")
symlink_path = self.dbgym_this_run_path / name_to_linkname(
open_base_dname
)
try_remove_file(symlink_path)
try_create_symlink(base_path, symlink_path)
# If the file wasn't generated by a run, we can't just symlink it because we don't know that it's immutable.
Expand Down Expand Up @@ -461,7 +464,7 @@ def try_create_symlink(src_path: Path, dst_path: Path) -> None:
Our functions that create symlinks might be called by multiple processes at once
during HPO. Thus, this is a thread-safe way to create a symlink.
"""
assert dst_path.suffix == ".link"
assert is_linkname(dst_path.name)
try:
os.symlink(src_path, dst_path)
except FileExistsError:
Expand Down

0 comments on commit e6ed566

Please sign in to comment.