From bff93e58c31cffa617698341d35fe405671fb9ee Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 29 Dec 2024 21:01:22 -0500 Subject: [PATCH] fordpath -> path --- manage/cli.py | 54 ++++++++++---------- util/tests/unittest_workspace.py | 2 +- util/workspace.py | 84 ++++++++++++++++---------------- 3 files changed, 69 insertions(+), 71 deletions(-) diff --git a/manage/cli.py b/manage/cli.py index 5d785d9f..79bd5ecb 100644 --- a/manage/cli.py +++ b/manage/cli.py @@ -116,9 +116,9 @@ def clean_workspace( # 2. Go through symlinks, figuring out which "children of task runs" to keep # Based on the rules of the framework, "children of task runs" should be run_*/ directories. # However, the user's workspace might happen to break these rules by putting directories not - # named "run_*/" or files directly in task_runs/. Thus, I use the term "task_run_child_fordpaths" + # named "run_*/" or files directly in task_runs/. Thus, I use the term "task_run_child_paths" # instead of "run_dpaths". - task_run_child_fordpaths_to_keep = set() + task_run_child_paths_to_keep = set() if dbgym_workspace.dbgym_runs_path.exists(): while symlink_fpaths_to_process: @@ -127,53 +127,51 @@ def clean_workspace( # Path.resolve() resolves all layers of symlinks while os.readlink() only resolves one layer. # However, os.readlink() literally reads the string contents of the link. We need to do some # processing on the result of os.readlink() to convert it to an absolute path - real_fordpath = symlink_fpath.resolve() - one_layer_resolved_fordpath = os.readlink(symlink_fpath) - assert str(real_fordpath) == str( + real_path = symlink_fpath.resolve() + one_layer_resolved_path = os.readlink(symlink_fpath) + assert str(real_path) == str( os.readlink(symlink_fpath) ), f"symlink_fpath ({symlink_fpath}) seems to point to *another* symlink. This is difficult to handle, so it is currently disallowed. Please resolve this situation manually." # If the file doesn't exist, we'll just ignore it. - if not real_fordpath.exists(): + if not real_path.exists(): continue # We're only trying to figure out which direct children of task_runs/ to save. If the file isn't # even a descendant, we don't care about it. - if not is_child_path(real_fordpath, dbgym_workspace.dbgym_runs_path): + if not is_child_path(real_path, dbgym_workspace.dbgym_runs_path): continue - assert not real_fordpath.samefile(dbgym_workspace.dbgym_runs_path) + assert not real_path.samefile(dbgym_workspace.dbgym_runs_path) - # Figure out the task_run_child_fordpath to put into task_run_child_fordpaths_to_keep - task_run_child_fordpath = None - if parent_dpath_of_path(real_fordpath).samefile( + # Figure out the task_run_child_path to put into task_run_child_paths_to_keep + task_run_child_path = None + if parent_dpath_of_path(real_path).samefile( dbgym_workspace.dbgym_runs_path ): # While it's true that it shouldn't be possible to symlink to a directory directly in task_runs/, # we'll just not delete it if the user happens to have one like this. Even if the user messed up # the structure somehow, it's just a good idea not to delete it. - task_run_child_fordpath = real_fordpath + task_run_child_path = real_path else: # Technically, it's not allowed to symlink to any files not in task_runs/run_*/[codebase]/[organization]/. # However, as with above, we won't just nuke files if the workspace doesn't follow this rule for # some reason. - task_run_child_fordpath = real_fordpath - while not parent_dpath_of_path(task_run_child_fordpath).samefile( + task_run_child_path = real_path + while not parent_dpath_of_path(task_run_child_path).samefile( dbgym_workspace.dbgym_runs_path ): - task_run_child_fordpath = parent_dpath_of_path( - task_run_child_fordpath - ) - assert task_run_child_fordpath != None - assert parent_dpath_of_path(task_run_child_fordpath).samefile( + task_run_child_path = parent_dpath_of_path(task_run_child_path) + assert task_run_child_path != None + assert parent_dpath_of_path(task_run_child_path).samefile( dbgym_workspace.dbgym_runs_path - ), f"task_run_child_fordpath ({task_run_child_fordpath}) is not a direct child of dbgym_workspace.dbgym_runs_path" - task_run_child_fordpaths_to_keep.add(task_run_child_fordpath) + ), f"task_run_child_path ({task_run_child_path}) is not a direct child of dbgym_workspace.dbgym_runs_path" + task_run_child_paths_to_keep.add(task_run_child_path) - # If on safe mode, add symlinks inside the task_run_child_fordpath to be processed + # If on safe mode, add symlinks inside the task_run_child_path to be processed if mode == "safe": add_symlinks_in_dpath( symlink_fpaths_to_process, - task_run_child_fordpath, + task_run_child_path, processed_symlinks, ) @@ -181,12 +179,12 @@ def clean_workspace( # It's true that symlinks might link outside of task_runs/*. We'll just not care about those starting_num_files = _count_files_in_workspace(dbgym_workspace) if dbgym_workspace.dbgym_runs_path.exists(): - for child_fordpath in dbgym_workspace.dbgym_runs_path.iterdir(): - if child_fordpath not in task_run_child_fordpaths_to_keep: - if child_fordpath.is_dir(): - shutil.rmtree(child_fordpath) + for child_path in dbgym_workspace.dbgym_runs_path.iterdir(): + if child_path not in task_run_child_paths_to_keep: + if child_path.is_dir(): + shutil.rmtree(child_path) else: - os.remove(child_fordpath) + os.remove(child_path) ending_num_files = _count_files_in_workspace(dbgym_workspace) if verbose: diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index 89a1c6c1..6cc9fa77 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -273,7 +273,7 @@ def test_link_result_cannot_link_symlink(self) -> None: ) with self.assertRaisesRegex( AssertionError, - "result_fordpath \(.*\) should be a fully resolved path", + "result_path \(.*\) should be a fully resolved path", ): self.workspace.link_result(symlink_path) diff --git a/util/workspace.py b/util/workspace.py index 29075847..35c70691 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -169,37 +169,37 @@ def __init__(self, dbgym_workspace_path: Path): # TODO(phw2): refactor our manual symlinking in postgres/cli.py to use link_result() instead def link_result( self, - result_fordpath: Path, + result_path: Path, custom_link_name: Optional[str] = None, ) -> Path: """ - result_fordpath must be a "result", meaning it was generated inside dbgym_workspace.dbgym_this_run_path. - Further, result_fordpath must have been generated by this invocation to task.py. This also means that - result_fordpath itself can be a file or a dir but not a symlink. + result_path must be a "result", meaning it was generated inside dbgym_workspace.dbgym_this_run_path. + Further, result_path must have been generated by this invocation to task.py. This also means that + result_path itself can be a file or a dir but not a symlink. Given a file or directory in task_runs/run_*/[codebase]/[org], this will create a symlink inside symlinks/[codebase]/[org]/. Will override the old symlink if there is one, so that symlinks/ always contains the latest generated version of a file. This function will return the path to the symlink that was created. """ - assert isinstance(result_fordpath, Path) + assert isinstance(result_path, Path) assert is_fully_resolved( - result_fordpath - ), f"result_fordpath ({result_fordpath}) should be a fully resolved path" + result_path + ), f"result_path ({result_path}) should be a fully resolved path" assert is_child_path( - result_fordpath, self.dbgym_this_run_path + result_path, self.dbgym_this_run_path ), "The result must have been generated in *this* run_*/ dir" - assert not os.path.islink(result_fordpath) + assert not os.path.islink(result_path) if type(custom_link_name) is str: link_name = custom_link_name else: - if os.path.isfile(result_fordpath): - link_name = basename_of_path(result_fordpath) + ".link" - elif os.path.isdir(result_fordpath): - link_name = basename_of_path(result_fordpath) + ".link" + if os.path.isfile(result_path): + link_name = basename_of_path(result_path) + ".link" + elif os.path.isdir(result_path): + link_name = basename_of_path(result_path) + ".link" else: - raise AssertionError("result_fordpath must be either a file or dir") + raise AssertionError("result_path must be either a file or dir") symlink_parent_dpath = self.dbgym_symlinks_path / self.app_name symlink_parent_dpath.mkdir(parents=True, exist_ok=True) @@ -213,7 +213,7 @@ def link_result( ), f'link_name ({link_name}) should end with ".link"' symlink_path = symlink_parent_dpath / link_name try_remove_file(symlink_path) - try_create_symlink(result_fordpath, symlink_path) + try_create_symlink(result_path, symlink_path) return symlink_path @@ -560,27 +560,27 @@ def open_and_save( return open(open_fpath, mode=mode) -def extract_from_task_run_fordpath( - dbgym_workspace: DBGymWorkspace, task_run_fordpath: Path +def extract_from_task_run_path( + dbgym_workspace: DBGymWorkspace, task_run_path: Path ) -> tuple[Path, str, Path, str]: """ The task_runs/ folder is organized like task_runs/run_*/[codebase]/[org]/any/path/you/want. This function extracts the [codebase] and [org] components """ - assert isinstance(task_run_fordpath, Path) - assert not task_run_fordpath.is_symlink() - parent_dpath = task_run_fordpath.parent + assert isinstance(task_run_path, Path) + assert not task_run_path.is_symlink() + parent_dpath = task_run_path.parent # TODO(phw2): make this a common function assert not parent_dpath.samefile( dbgym_workspace.dbgym_runs_path - ), f"task_run_fordpath ({task_run_fordpath}) should be inside a run_*/ dir instead of directly in dbgym_workspace.dbgym_runs_path ({dbgym_workspace.dbgym_runs_path})" + ), f"task_run_path ({task_run_path}) should be inside a run_*/ dir instead of directly in dbgym_workspace.dbgym_runs_path ({dbgym_workspace.dbgym_runs_path})" assert not parent_dpath_of_path(parent_dpath).samefile( dbgym_workspace.dbgym_runs_path - ), f"task_run_fordpath ({task_run_fordpath}) should be inside a run_*/[codebase]/ dir instead of directly in run_*/ ({dbgym_workspace.dbgym_runs_path})" + ), f"task_run_path ({task_run_path}) should be inside a run_*/[codebase]/ dir instead of directly in run_*/ ({dbgym_workspace.dbgym_runs_path})" assert not parent_dpath_of_path(parent_dpath_of_path(parent_dpath)).samefile( dbgym_workspace.dbgym_runs_path - ), f"task_run_fordpath ({task_run_fordpath}) should be inside a run_*/[codebase]/[organization]/ dir instead of directly in run_*/ ({dbgym_workspace.dbgym_runs_path})" - # org_dpath is the run_*/[codebase]/[organization]/ dir that task_run_fordpath is in + ), f"task_run_path ({task_run_path}) should be inside a run_*/[codebase]/[organization]/ dir instead of directly in run_*/ ({dbgym_workspace.dbgym_runs_path})" + # org_dpath is the run_*/[codebase]/[organization]/ dir that task_run_path is in org_dpath = parent_dpath while not parent_dpath_of_path( parent_dpath_of_path(parent_dpath_of_path(org_dpath)) @@ -617,7 +617,7 @@ def save_file(dbgym_workspace: DBGymWorkspace, fpath: Path) -> None: # 2. files or dirs generated by a run may be very large (up to 100s of GBs) so we don't want to copy them if is_child_path(fpath, dbgym_workspace.dbgym_runs_path): # get paths we'll need later. - _, codebase_dname, org_dpath, org_dname = extract_from_task_run_fordpath( + _, codebase_dname, org_dpath, org_dname = extract_from_task_run_path( dbgym_workspace, fpath ) this_run_save_dpath = ( @@ -658,39 +658,39 @@ def save_file(dbgym_workspace: DBGymWorkspace, fpath: Path) -> None: # TODO(phw2): deprecate this once I'm done with unittest_workspace.py def link_result( dbgym_workspace: DBGymWorkspace, - result_fordpath: Path, + result_path: Path, custom_result_name: Optional[str] = None, ) -> Path: """ - result_fordpath must be a "result", meaning it was generated inside dbgym_workspace.dbgym_this_run_path. - Further, result_fordpath must have been generated by this invocation to task.py. This also means that - result_fordpath itself can be a file or a dir but not a symlink. + result_path must be a "result", meaning it was generated inside dbgym_workspace.dbgym_this_run_path. + Further, result_path must have been generated by this invocation to task.py. This also means that + result_path itself can be a file or a dir but not a symlink. Given a file or directory in task_runs/run_*/[codebase]/[org], this will create a symlink inside symlinks/[codebase]/[org]/. Will override the old symlink if there is one, so that symlinks/ always contains the latest generated version of a file. This function will return the path to the symlink that was created. """ - assert isinstance(result_fordpath, Path) + assert isinstance(result_path, Path) assert is_fully_resolved( - result_fordpath - ), f"result_fordpath ({result_fordpath}) should be a fully resolved path" - assert is_child_path(result_fordpath, dbgym_workspace.dbgym_this_run_path) - assert not os.path.islink(result_fordpath) + result_path + ), f"result_path ({result_path}) should be a fully resolved path" + assert is_child_path(result_path, dbgym_workspace.dbgym_this_run_path) + assert not os.path.islink(result_path) if type(custom_result_name) is str: result_name = custom_result_name else: - if os.path.isfile(result_fordpath): - result_name = basename_of_path(result_fordpath) + ".link" - elif os.path.isdir(result_fordpath): - result_name = basename_of_path(result_fordpath) + ".link" + if os.path.isfile(result_path): + result_name = basename_of_path(result_path) + ".link" + elif os.path.isdir(result_path): + result_name = basename_of_path(result_path) + ".link" else: - raise AssertionError("result_fordpath must be either a file or dir") + raise AssertionError("result_path must be either a file or dir") # Figure out the parent directory path of the symlink - codebase_dpath, codebase_dname, _, org_dname = extract_from_task_run_fordpath( - dbgym_workspace, result_fordpath + codebase_dpath, codebase_dname, _, org_dname = extract_from_task_run_path( + dbgym_workspace, result_path ) # We're only supposed to save files generated by us, which means they should be in cur_task_runs_path() assert codebase_dpath.samefile( @@ -710,7 +710,7 @@ def link_result( ), f'result_name ({result_name}) should end with ".link"' symlink_path = symlink_parent_dpath / result_name try_remove_file(symlink_path) - try_create_symlink(result_fordpath, symlink_path) + try_create_symlink(result_path, symlink_path) return symlink_path