Skip to content

Commit

Permalink
fordpath -> path
Browse files Browse the repository at this point in the history
  • Loading branch information
wangpatrick57 committed Dec 30, 2024
1 parent 85f40a0 commit bff93e5
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 71 deletions.
54 changes: 26 additions & 28 deletions manage/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,9 @@ def clean_workspace(
# 2. Go through symlinks, figuring out which "children of task runs" to keep
# Based on the rules of the framework, "children of task runs" should be run_*/ directories.
# However, the user's workspace might happen to break these rules by putting directories not
# named "run_*/" or files directly in task_runs/. Thus, I use the term "task_run_child_fordpaths"
# named "run_*/" or files directly in task_runs/. Thus, I use the term "task_run_child_paths"
# instead of "run_dpaths".
task_run_child_fordpaths_to_keep = set()
task_run_child_paths_to_keep = set()

if dbgym_workspace.dbgym_runs_path.exists():
while symlink_fpaths_to_process:
Expand All @@ -127,66 +127,64 @@ def clean_workspace(
# Path.resolve() resolves all layers of symlinks, while os.readlink() only resolves one layer.
# However, os.readlink() literally reads the string contents of the link, so we need to do some
# processing on the result of os.readlink() to convert it to an absolute path.
real_fordpath = symlink_fpath.resolve()
one_layer_resolved_fordpath = os.readlink(symlink_fpath)
assert str(real_fordpath) == str(
real_path = symlink_fpath.resolve()
one_layer_resolved_path = os.readlink(symlink_fpath)
assert str(real_path) == str(
os.readlink(symlink_fpath)
), f"symlink_fpath ({symlink_fpath}) seems to point to *another* symlink. This is difficult to handle, so it is currently disallowed. Please resolve this situation manually."

# If the file doesn't exist, we'll just ignore it.
if not real_fordpath.exists():
if not real_path.exists():
continue
# We're only trying to figure out which direct children of task_runs/ to save. If the file isn't
# even a descendant, we don't care about it.
if not is_child_path(real_fordpath, dbgym_workspace.dbgym_runs_path):
if not is_child_path(real_path, dbgym_workspace.dbgym_runs_path):
continue

assert not real_fordpath.samefile(dbgym_workspace.dbgym_runs_path)
assert not real_path.samefile(dbgym_workspace.dbgym_runs_path)

# Figure out the task_run_child_fordpath to put into task_run_child_fordpaths_to_keep
task_run_child_fordpath = None
if parent_dpath_of_path(real_fordpath).samefile(
# Figure out the task_run_child_path to put into task_run_child_paths_to_keep
task_run_child_path = None
if parent_dpath_of_path(real_path).samefile(
dbgym_workspace.dbgym_runs_path
):
# While it's true that it shouldn't be possible to symlink to a directory directly in task_runs/,
# we'll just not delete it if the user happens to have one like this. Even if the user messed up
# the structure somehow, it's just a good idea not to delete it.
task_run_child_fordpath = real_fordpath
task_run_child_path = real_path
else:
# Technically, it's not allowed to symlink to any files not in task_runs/run_*/[codebase]/[organization]/.
# However, as with above, we won't just nuke files if the workspace doesn't follow this rule for
# some reason.
task_run_child_fordpath = real_fordpath
while not parent_dpath_of_path(task_run_child_fordpath).samefile(
task_run_child_path = real_path
while not parent_dpath_of_path(task_run_child_path).samefile(
dbgym_workspace.dbgym_runs_path
):
task_run_child_fordpath = parent_dpath_of_path(
task_run_child_fordpath
)
assert task_run_child_fordpath != None
assert parent_dpath_of_path(task_run_child_fordpath).samefile(
task_run_child_path = parent_dpath_of_path(task_run_child_path)
assert task_run_child_path != None
assert parent_dpath_of_path(task_run_child_path).samefile(
dbgym_workspace.dbgym_runs_path
), f"task_run_child_fordpath ({task_run_child_fordpath}) is not a direct child of dbgym_workspace.dbgym_runs_path"
task_run_child_fordpaths_to_keep.add(task_run_child_fordpath)
), f"task_run_child_path ({task_run_child_path}) is not a direct child of dbgym_workspace.dbgym_runs_path"
task_run_child_paths_to_keep.add(task_run_child_path)

# If on safe mode, add symlinks inside the task_run_child_fordpath to be processed
# If on safe mode, add symlinks inside the task_run_child_path to be processed
if mode == "safe":
add_symlinks_in_dpath(
symlink_fpaths_to_process,
task_run_child_fordpath,
task_run_child_path,
processed_symlinks,
)

# 3. Go through all children of task_runs/*, deleting any that we weren't told to keep
# It's true that symlinks might link outside of task_runs/*. We simply don't care about those.
starting_num_files = _count_files_in_workspace(dbgym_workspace)
if dbgym_workspace.dbgym_runs_path.exists():
for child_fordpath in dbgym_workspace.dbgym_runs_path.iterdir():
if child_fordpath not in task_run_child_fordpaths_to_keep:
if child_fordpath.is_dir():
shutil.rmtree(child_fordpath)
for child_path in dbgym_workspace.dbgym_runs_path.iterdir():
if child_path not in task_run_child_paths_to_keep:
if child_path.is_dir():
shutil.rmtree(child_path)
else:
os.remove(child_fordpath)
os.remove(child_path)
ending_num_files = _count_files_in_workspace(dbgym_workspace)

if verbose:
Expand Down
2 changes: 1 addition & 1 deletion util/tests/unittest_workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ def test_link_result_cannot_link_symlink(self) -> None:
)
with self.assertRaisesRegex(
AssertionError,
"result_fordpath \(.*\) should be a fully resolved path",
"result_path \(.*\) should be a fully resolved path",
):
self.workspace.link_result(symlink_path)

Expand Down
84 changes: 42 additions & 42 deletions util/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,37 +169,37 @@ def __init__(self, dbgym_workspace_path: Path):
# TODO(phw2): refactor our manual symlinking in postgres/cli.py to use link_result() instead
def link_result(
self,
result_fordpath: Path,
result_path: Path,
custom_link_name: Optional[str] = None,
) -> Path:
"""
result_fordpath must be a "result", meaning it was generated inside dbgym_workspace.dbgym_this_run_path.
Further, result_fordpath must have been generated by this invocation to task.py. This also means that
result_fordpath itself can be a file or a dir but not a symlink.
result_path must be a "result", meaning it was generated inside dbgym_workspace.dbgym_this_run_path.
Further, result_path must have been generated by this invocation to task.py. This also means that
result_path itself can be a file or a dir but not a symlink.
Given a file or directory in task_runs/run_*/[codebase]/[org], this will create a symlink inside
symlinks/[codebase]/[org]/.
Will overwrite the old symlink if there is one, so that symlinks/ always contains the latest generated
version of a file.
This function will return the path to the symlink that was created.
"""
assert isinstance(result_fordpath, Path)
assert isinstance(result_path, Path)
assert is_fully_resolved(
result_fordpath
), f"result_fordpath ({result_fordpath}) should be a fully resolved path"
result_path
), f"result_path ({result_path}) should be a fully resolved path"
assert is_child_path(
result_fordpath, self.dbgym_this_run_path
result_path, self.dbgym_this_run_path
), "The result must have been generated in *this* run_*/ dir"
assert not os.path.islink(result_fordpath)
assert not os.path.islink(result_path)

if type(custom_link_name) is str:
link_name = custom_link_name
else:
if os.path.isfile(result_fordpath):
link_name = basename_of_path(result_fordpath) + ".link"
elif os.path.isdir(result_fordpath):
link_name = basename_of_path(result_fordpath) + ".link"
if os.path.isfile(result_path):
link_name = basename_of_path(result_path) + ".link"
elif os.path.isdir(result_path):
link_name = basename_of_path(result_path) + ".link"
else:
raise AssertionError("result_fordpath must be either a file or dir")
raise AssertionError("result_path must be either a file or dir")

symlink_parent_dpath = self.dbgym_symlinks_path / self.app_name
symlink_parent_dpath.mkdir(parents=True, exist_ok=True)
Expand All @@ -213,7 +213,7 @@ def link_result(
), f'link_name ({link_name}) should end with ".link"'
symlink_path = symlink_parent_dpath / link_name
try_remove_file(symlink_path)
try_create_symlink(result_fordpath, symlink_path)
try_create_symlink(result_path, symlink_path)

return symlink_path

Expand Down Expand Up @@ -560,27 +560,27 @@ def open_and_save(
return open(open_fpath, mode=mode)


def extract_from_task_run_fordpath(
dbgym_workspace: DBGymWorkspace, task_run_fordpath: Path
def extract_from_task_run_path(
dbgym_workspace: DBGymWorkspace, task_run_path: Path
) -> tuple[Path, str, Path, str]:
"""
The task_runs/ folder is organized like task_runs/run_*/[codebase]/[org]/any/path/you/want.
This function extracts the [codebase] and [org] components, returning them as the tuple
(codebase_dpath, codebase_dname, org_dpath, org_dname).
"""
assert isinstance(task_run_fordpath, Path)
assert not task_run_fordpath.is_symlink()
parent_dpath = task_run_fordpath.parent
assert isinstance(task_run_path, Path)
assert not task_run_path.is_symlink()
parent_dpath = task_run_path.parent
# TODO(phw2): make this a common function
assert not parent_dpath.samefile(
dbgym_workspace.dbgym_runs_path
), f"task_run_fordpath ({task_run_fordpath}) should be inside a run_*/ dir instead of directly in dbgym_workspace.dbgym_runs_path ({dbgym_workspace.dbgym_runs_path})"
), f"task_run_path ({task_run_path}) should be inside a run_*/ dir instead of directly in dbgym_workspace.dbgym_runs_path ({dbgym_workspace.dbgym_runs_path})"
assert not parent_dpath_of_path(parent_dpath).samefile(
dbgym_workspace.dbgym_runs_path
), f"task_run_fordpath ({task_run_fordpath}) should be inside a run_*/[codebase]/ dir instead of directly in run_*/ ({dbgym_workspace.dbgym_runs_path})"
), f"task_run_path ({task_run_path}) should be inside a run_*/[codebase]/ dir instead of directly in run_*/ ({dbgym_workspace.dbgym_runs_path})"
assert not parent_dpath_of_path(parent_dpath_of_path(parent_dpath)).samefile(
dbgym_workspace.dbgym_runs_path
), f"task_run_fordpath ({task_run_fordpath}) should be inside a run_*/[codebase]/[organization]/ dir instead of directly in run_*/ ({dbgym_workspace.dbgym_runs_path})"
# org_dpath is the run_*/[codebase]/[organization]/ dir that task_run_fordpath is in
), f"task_run_path ({task_run_path}) should be inside a run_*/[codebase]/[organization]/ dir instead of directly in run_*/ ({dbgym_workspace.dbgym_runs_path})"
# org_dpath is the run_*/[codebase]/[organization]/ dir that task_run_path is in
org_dpath = parent_dpath
while not parent_dpath_of_path(
parent_dpath_of_path(parent_dpath_of_path(org_dpath))
Expand Down Expand Up @@ -617,7 +617,7 @@ def save_file(dbgym_workspace: DBGymWorkspace, fpath: Path) -> None:
# 2. files or dirs generated by a run may be very large (up to 100s of GBs) so we don't want to copy them
if is_child_path(fpath, dbgym_workspace.dbgym_runs_path):
# get paths we'll need later.
_, codebase_dname, org_dpath, org_dname = extract_from_task_run_fordpath(
_, codebase_dname, org_dpath, org_dname = extract_from_task_run_path(
dbgym_workspace, fpath
)
this_run_save_dpath = (
Expand Down Expand Up @@ -658,39 +658,39 @@ def save_file(dbgym_workspace: DBGymWorkspace, fpath: Path) -> None:
# TODO(phw2): deprecate this once I'm done with unittest_workspace.py
def link_result(
dbgym_workspace: DBGymWorkspace,
result_fordpath: Path,
result_path: Path,
custom_result_name: Optional[str] = None,
) -> Path:
"""
result_fordpath must be a "result", meaning it was generated inside dbgym_workspace.dbgym_this_run_path.
Further, result_fordpath must have been generated by this invocation to task.py. This also means that
result_fordpath itself can be a file or a dir but not a symlink.
result_path must be a "result", meaning it was generated inside dbgym_workspace.dbgym_this_run_path.
Further, result_path must have been generated by this invocation to task.py. This also means that
result_path itself can be a file or a dir but not a symlink.
Given a file or directory in task_runs/run_*/[codebase]/[org], this will create a symlink inside
symlinks/[codebase]/[org]/.
Will overwrite the old symlink if there is one, so that symlinks/ always contains the latest generated
version of a file.
This function will return the path to the symlink that was created.
"""
assert isinstance(result_fordpath, Path)
assert isinstance(result_path, Path)
assert is_fully_resolved(
result_fordpath
), f"result_fordpath ({result_fordpath}) should be a fully resolved path"
assert is_child_path(result_fordpath, dbgym_workspace.dbgym_this_run_path)
assert not os.path.islink(result_fordpath)
result_path
), f"result_path ({result_path}) should be a fully resolved path"
assert is_child_path(result_path, dbgym_workspace.dbgym_this_run_path)
assert not os.path.islink(result_path)

if type(custom_result_name) is str:
result_name = custom_result_name
else:
if os.path.isfile(result_fordpath):
result_name = basename_of_path(result_fordpath) + ".link"
elif os.path.isdir(result_fordpath):
result_name = basename_of_path(result_fordpath) + ".link"
if os.path.isfile(result_path):
result_name = basename_of_path(result_path) + ".link"
elif os.path.isdir(result_path):
result_name = basename_of_path(result_path) + ".link"
else:
raise AssertionError("result_fordpath must be either a file or dir")
raise AssertionError("result_path must be either a file or dir")

# Figure out the parent directory path of the symlink
codebase_dpath, codebase_dname, _, org_dname = extract_from_task_run_fordpath(
dbgym_workspace, result_fordpath
codebase_dpath, codebase_dname, _, org_dname = extract_from_task_run_path(
dbgym_workspace, result_path
)
# We're only supposed to save files generated by us, which means they should be in cur_task_runs_path()
assert codebase_dpath.samefile(
Expand All @@ -710,7 +710,7 @@ def link_result(
), f'result_name ({result_name}) should end with ".link"'
symlink_path = symlink_parent_dpath / result_name
try_remove_file(symlink_path)
try_create_symlink(result_fordpath, symlink_path)
try_create_symlink(result_path, symlink_path)

return symlink_path

Expand Down

0 comments on commit bff93e5

Please sign in to comment.