From 3696d13e28d8c1a0e1ff5bb660564138b313869c Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Tue, 26 Sep 2023 21:19:26 +0200 Subject: [PATCH 01/34] Add preliminary backup command --- disk_objectstore/backup_utils.py | 240 +++++++++++++++++++++++++++++++ disk_objectstore/cli.py | 39 ++++- 2 files changed, 278 insertions(+), 1 deletion(-) create mode 100644 disk_objectstore/backup_utils.py diff --git a/disk_objectstore/backup_utils.py b/disk_objectstore/backup_utils.py new file mode 100644 index 0000000..f33fe9f --- /dev/null +++ b/disk_objectstore/backup_utils.py @@ -0,0 +1,240 @@ +""" +Utilities to back up a container. +""" + +import shutil +import sqlite3 +import subprocess +import tempfile +from pathlib import Path +from typing import Optional + +from disk_objectstore.container import Container + + +def _log(msg, end="\n"): + print(msg, end=end) + + +def _is_exe_found(exe) -> bool: + return shutil.which(exe) is not None + + +def _run_cmd(args: list, remote: Optional[str] = None, check: bool = True) -> bool: + """ + Run a command locally or remotely. + """ + all_args = args[:] + if remote: + all_args = ["ssh", remote] + all_args + + try: + res = subprocess.run(all_args, capture_output=True, text=True, check=check) + except subprocess.CalledProcessError as exc: + _log("Error: " + str(exc)) + return False + + _log(f"stdout: {all_args}\n{res.stdout}") + _log(f"stderr: {all_args}\n{res.stderr}") + + success = not bool(res.returncode) + + return success + + +def _check_if_remote_accessible(remote: str) -> bool: + _log(f"Checking if '{remote}' is accessible...", end="") + success = _run_cmd(["exit"], remote=remote) + if not success: + _log(f"Error: Remote '{remote}' is not accessible!") + return False + _log(f"Success! '{remote}' is accessible!") + return True + + +def _check_path_exists(path: Path, remote: Optional[str] = None) -> bool: + cmd = ["[", "-e", str(path), "]"] + return _run_cmd(cmd, remote=remote, check=False) + + +def _call_rsync( # pylint: disable=too-many-arguments + args: list, + src: Path, + dest: Path, + link_dest: Optional[Path] = None, + remote: Optional[str] = None, + src_trailing_slash: bool = False, + dest_trailing_slash: bool = False, +) -> bool: + """Call rsync with specified arguments and handle possible errors & stdout/stderr + + :param link_dest: + Path to the hardlinked files location (previous backup). + + :param src_trailing_slash: + Add a trailing slash to the source path. This makes rsync copy the contents + of the folder instead of the folder itself. + + :param dest_trailing_slash: + Add a trailing slash to the destination path. This makes rsync interpret the + destination as a folder and create it if it doesn't exists. + + :return: + True if successful and False if unsuccessful. + """ + + all_args = args[:] + if link_dest: + if not remote: + # for local paths, use resolve() to get absolute path + link_dest_str = str(link_dest.resolve()) + else: + # for remote paths, we require absolute paths anyways + link_dest_str = str(link_dest) + all_args += [f"--link-dest={link_dest_str}"] + + if src_trailing_slash: + all_args += [str(src) + "/"] + else: + all_args += [str(src)] + + dest_str = str(dest) + if dest_trailing_slash: + dest_str += "/" + + if not remote: + all_args += [dest_str] + else: + all_args += [f"{remote}:{dest_str}"] + + try: + res = subprocess.run(all_args, capture_output=True, text=True, check=True) + except subprocess.CalledProcessError as exc: + _log(f"Error: {exc}") + return False + + _log(f"stdout: {all_args}\n{res.stdout}") + _log(f"stderr: {all_args}\n{res.stderr}") + + success = not bool(res.returncode) + + return success + + +def backup( # pylint: disable=too-many-return-statements, too-many-branches + container: Container, + path: Path, + remote: Optional[str] = None, + prev_backup: Optional[Path] = None, + rsync_exe: str = "rsync", +) -> bool: + """Create a backup of the disk-objectstore container + + It should be done in the following order: + 1) loose files; + 2) sqlite database; + 3) packed files. + + :return: + True is successful and False if unsuccessful. + """ + + # ------------------ + # input validation: + if remote: + if not _check_if_remote_accessible(remote): + return False + + if not _is_exe_found(rsync_exe): + _log(f"Error: {rsync_exe} not accessible.") + return False + + path_exists = _check_path_exists(path, remote) + + if not path_exists: + success = _run_cmd(["mkdir", str(path)], remote=remote) + if not success: + _log(f"Error: Couldn't access/create '{str(path)}'!") + return False + + if prev_backup: + if not _check_path_exists(prev_backup, remote): + _log(f"Error: {str(prev_backup)} not found.") + return False + # ------------------ + + # subprocess arguments shared by all rsync calls: + rsync_args = [rsync_exe, "-azh", "-vv", "--no-whole-file"] + + container_root_path = container.get_folder() + loose_path = container._get_loose_folder() # pylint: disable=protected-access + packs_path = container._get_pack_folder() # pylint: disable=protected-access + sqlite_path = container._get_pack_index_path() # pylint: disable=protected-access + + # step 1: back up loose files + loose_path_rel = loose_path.relative_to(container_root_path) + prev_backup_loose = prev_backup / loose_path_rel if prev_backup else None + success = _call_rsync( + rsync_args, loose_path, path, remote=remote, link_dest=prev_backup_loose + ) + if not success: + return False + + # step 2: back up sqlite db + + # make a temporary directory to dump sqlite db locally + with tempfile.TemporaryDirectory() as temp_dir_name: + sqlite_temp_loc = Path(temp_dir_name) / "packs.idx" + + # Safe way to make a backup of the sqlite db, while it might potentially be accessed + # https://docs.python.org/3/library/sqlite3.html#sqlite3.Connection.backup + src = sqlite3.connect(str(sqlite_path)) + dst = sqlite3.connect(str(sqlite_temp_loc)) + with dst: + src.backup(dst) + dst.close() + src.close() + + if sqlite_temp_loc.is_file(): + _log(f"Dumped the SQLite database to {str(sqlite_temp_loc)}") + else: + _log(f"Error: '{str(sqlite_temp_loc)}' was not created.") + return False + + # step 3: transfer the SQLITE database file + success = _call_rsync( + rsync_args, sqlite_temp_loc, path, remote=remote, link_dest=prev_backup + ) + if not success: + return False + + # step 4: transfer the packed files + packs_path_rel = packs_path.relative_to(container_root_path) + prev_backup_packs = prev_backup / packs_path_rel if prev_backup else None + success = _call_rsync( + rsync_args, packs_path, path, remote=remote, link_dest=prev_backup_packs + ) + if not success: + return False + + # step 5: transfer anything else in the container folder + success = _call_rsync( + rsync_args + + [ + "--exclude", + str(loose_path_rel), + "--exclude", + "packs.idx", + "--exclude", + str(packs_path_rel), + ], + container_root_path, + path, + link_dest=prev_backup, + remote=remote, + src_trailing_slash=True, + ) + if not success: + return False + + return True diff --git a/disk_objectstore/cli.py b/disk_objectstore/cli.py index acd3332..6ce1c56 100644 --- a/disk_objectstore/cli.py +++ b/disk_objectstore/cli.py @@ -8,7 +8,7 @@ import click -from disk_objectstore import __version__ +from disk_objectstore import __version__, backup_utils from disk_objectstore.container import Container @@ -183,3 +183,40 @@ def optimize( container.clean_storage(vacuum=vacuum) size = sum(f.stat().st_size for f in dostore.path.glob("**/*") if f.is_file()) click.echo(f"Final container size: {round(size/1000, 2)} Mb") + + +@main.command("backup") +@click.argument("path", nargs=1, type=click.Path()) +@click.option( + "--remote", + default=None, + help="ssh remote of the backup location.", +) +@click.option( + "--prev_backup", + default=None, + help="Previous backup location for rsync link-dest.", +) +@click.option( + "--rsync_exe", + default="rsync", + help="Specify the 'rsync' executable, if not in PATH.", +) +@pass_dostore +def backup( + dostore: ContainerContext, + path: str, + remote: Optional[str], + prev_backup: Optional[str], + rsync_exe: str, +): + """Create a backup of the container""" + + with dostore.container as container: + backup_utils.backup( + container, + Path(path), + remote=remote, + prev_backup=Path(prev_backup) if prev_backup else None, + rsync_exe=rsync_exe, + ) From a7013740a8220d41e7fc7a29e98bca716a5311f1 Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Wed, 25 Oct 2023 18:37:10 +0200 Subject: [PATCH 02/34] interface overhaul; still wip --- disk_objectstore/backup_utils.py | 157 ++++++++++++++++++++++++------- disk_objectstore/cli.py | 53 +++++++---- 2 files changed, 156 insertions(+), 54 deletions(-) diff --git a/disk_objectstore/backup_utils.py b/disk_objectstore/backup_utils.py index f33fe9f..a4091e0 100644 --- a/disk_objectstore/backup_utils.py +++ b/disk_objectstore/backup_utils.py @@ -2,6 +2,7 @@ Utilities to back up a container. """ +import logging import shutil import sqlite3 import subprocess @@ -11,16 +12,29 @@ from disk_objectstore.container import Container +logger = logging.getLogger(__name__) + def _log(msg, end="\n"): print(msg, end=end) -def _is_exe_found(exe) -> bool: +def split_remote_and_path(dest: str): + """extract remote and path from :""" + split_dest = dest.split(":") + if len(split_dest) == 1: + return None, Path(dest) + if len(split_dest) == 2: + return split_dest[0], Path(split_dest[1]) + # more than 1 colon: + raise ValueError + + +def is_exe_found(exe: str) -> bool: return shutil.which(exe) is not None -def _run_cmd(args: list, remote: Optional[str] = None, check: bool = True) -> bool: +def run_cmd(args: list, remote: Optional[str] = None, check: bool = True) -> bool: """ Run a command locally or remotely. """ @@ -42,9 +56,10 @@ def _run_cmd(args: list, remote: Optional[str] = None, check: bool = True) -> bo return success -def _check_if_remote_accessible(remote: str) -> bool: +def check_if_remote_accessible(remote: str) -> bool: + """Check if remote host is accessible via ssh""" _log(f"Checking if '{remote}' is accessible...", end="") - success = _run_cmd(["exit"], remote=remote) + success = run_cmd(["exit"], remote=remote) if not success: _log(f"Error: Remote '{remote}' is not accessible!") return False @@ -52,12 +67,12 @@ def _check_if_remote_accessible(remote: str) -> bool: return True -def _check_path_exists(path: Path, remote: Optional[str] = None) -> bool: +def check_path_exists(path: Path, remote: Optional[str] = None) -> bool: cmd = ["[", "-e", str(path), "]"] - return _run_cmd(cmd, remote=remote, check=False) + return run_cmd(cmd, remote=remote, check=False) -def _call_rsync( # pylint: disable=too-many-arguments +def call_rsync( # pylint: disable=too-many-arguments args: list, src: Path, dest: Path, @@ -121,47 +136,54 @@ def _call_rsync( # pylint: disable=too-many-arguments return success -def backup( # pylint: disable=too-many-return-statements, too-many-branches - container: Container, +def validate_inputs( path: Path, remote: Optional[str] = None, - prev_backup: Optional[Path] = None, rsync_exe: str = "rsync", ) -> bool: - """Create a backup of the disk-objectstore container - - It should be done in the following order: - 1) loose files; - 2) sqlite database; - 3) packed files. + """Validate inputs to the backup cli command :return: - True is successful and False if unsuccessful. + True if validation passes, False otherwise. """ - - # ------------------ - # input validation: if remote: - if not _check_if_remote_accessible(remote): + if not check_if_remote_accessible(remote): return False - if not _is_exe_found(rsync_exe): + if not is_exe_found(rsync_exe): _log(f"Error: {rsync_exe} not accessible.") return False - path_exists = _check_path_exists(path, remote) + path_exists = check_path_exists(path, remote) if not path_exists: - success = _run_cmd(["mkdir", str(path)], remote=remote) + success = run_cmd(["mkdir", str(path)], remote=remote) if not success: _log(f"Error: Couldn't access/create '{str(path)}'!") return False - if prev_backup: - if not _check_path_exists(prev_backup, remote): - _log(f"Error: {str(prev_backup)} not found.") - return False - # ------------------ + return True + + +def backup_container( # pylint: disable=too-many-return-statements, too-many-branches + container: Container, + path: Path, + remote: Optional[str] = None, + prev_backup: Optional[Path] = None, + rsync_exe: str = "rsync", +) -> bool: + """Create a backup of the disk-objectstore container + + This is safe to perform when the container is being used. + + It should be done in the following order: + 1) loose files; + 2) sqlite database; + 3) packed files. + + :return: + True if successful and False if unsuccessful. + """ # subprocess arguments shared by all rsync calls: rsync_args = [rsync_exe, "-azh", "-vv", "--no-whole-file"] @@ -174,7 +196,7 @@ def backup( # pylint: disable=too-many-return-statements, too-many-branches # step 1: back up loose files loose_path_rel = loose_path.relative_to(container_root_path) prev_backup_loose = prev_backup / loose_path_rel if prev_backup else None - success = _call_rsync( + success = call_rsync( rsync_args, loose_path, path, remote=remote, link_dest=prev_backup_loose ) if not success: @@ -202,7 +224,7 @@ def backup( # pylint: disable=too-many-return-statements, too-many-branches return False # step 3: transfer the SQLITE database file - success = _call_rsync( + success = call_rsync( rsync_args, sqlite_temp_loc, path, remote=remote, link_dest=prev_backup ) if not success: @@ -210,15 +232,14 @@ def backup( # pylint: disable=too-many-return-statements, too-many-branches # step 4: transfer the packed files packs_path_rel = packs_path.relative_to(container_root_path) - prev_backup_packs = prev_backup / packs_path_rel if prev_backup else None - success = _call_rsync( - rsync_args, packs_path, path, remote=remote, link_dest=prev_backup_packs + success = call_rsync( + rsync_args, packs_path, path, remote=remote, link_dest=prev_backup ) if not success: return False # step 5: transfer anything else in the container folder - success = _call_rsync( + success = call_rsync( rsync_args + [ "--exclude", @@ -238,3 +259,69 @@ def backup( # pylint: disable=too-many-return-statements, too-many-branches return False return True + + +def backup_auto_folders( + container: Container, + path: Path, + remote: Optional[str] = None, + rsync_exe: str = "rsync", +): + """Create a backup, managing live and previous backup folders automatically + + The running backup is done to `/live-backup`. When it completes, it is moved to + the final path: `/last-backup`. This done so that the last backup wouldn't be + corrupted, in case the live one crashes or gets interrupted. Rsync `link-dest` is used between + the two folders to keep the backups incremental and performant. + + :param path: + Path to where the backup will be created. If 'remote' is specified, must be an absolute path, + otherwise can be relative. + + :param remote: + Remote host of the backup location. 'ssh' executable is called via subprocess and therefore remote + hosts configured for it are supported (e.g. via .ssh/config file). + + :param kwargs: + * Executable paths if not default, e.g. 'rsync' + + :return: + True is successful and False if unsuccessful. + """ + + live_folder = path / "live-backup" + last_folder = path / "last-backup" + + prev_exists = check_path_exists(last_folder, remote) + + success = backup_container( + container, + live_folder, + remote=remote, + prev_backup=last_folder if prev_exists else None, + rsync_exe=rsync_exe, + ) + if not success: + return False + + # move live-backup -> last-backup in a safe manner + # (such that if the process stops at any point, that we wouldn't lose data) + # step 1: last-backup -> last-backup-old + if prev_exists: + success = run_cmd( + ["mv", str(last_folder), str(last_folder) + "-old"], remote=remote + ) + if not success: + return False + # step 2: live-backup -> last-backup + success = run_cmd(["mv", str(live_folder), str(last_folder)], remote=remote) + if not success: + return False + # step 3: remote last-backup-old + if prev_exists: + success = run_cmd(["rm", "-rf", str(last_folder) + "-old"], remote=remote) + if not success: + return False + + _log(f"Backup moved from '{str(live_folder)}' to '{str(last_folder)}'.") + return True diff --git a/disk_objectstore/cli.py b/disk_objectstore/cli.py index 6ce1c56..978cc8e 100644 --- a/disk_objectstore/cli.py +++ b/disk_objectstore/cli.py @@ -186,37 +186,52 @@ def optimize( @main.command("backup") -@click.argument("path", nargs=1, type=click.Path()) -@click.option( - "--remote", - default=None, - help="ssh remote of the backup location.", -) -@click.option( - "--prev_backup", - default=None, - help="Previous backup location for rsync link-dest.", -) +@click.argument("dest", nargs=1, type=click.Path()) +# @click.option( +# "--keep", +# default=1, +# help="Number of previous backups to keep in the destination.", +# ) @click.option( "--rsync_exe", default="rsync", - help="Specify the 'rsync' executable, if not in PATH.", + help="Specify the 'rsync' executable, if not in PATH. Used for both local and remote destinations.", ) @pass_dostore def backup( dostore: ContainerContext, - path: str, - remote: Optional[str], - prev_backup: Optional[str], + dest: str, rsync_exe: str, ): - """Create a backup of the container""" + """Create a backup of the container to a subfolder `last-backup` of the destination location DEST. + + NOTE: This is safe to run while the container is being used. + + Destination (DEST) can either be a local path, or a remote destination (reachable via ssh). + In the latter case, remote destination needs to have the following syntax: + [@]: + i.e., contain the remote host name and the remote path, separated by a colon (and optionally the + remote user separated by an @ symbol). You can tune SSH parameters using the standard options given + by OpenSSH, such as adding configuration options to ~/.ssh/config (e.g. to allow for passwordless + login - recommended, since this script might ask multiple times for the password). + + NOTE: 'rsync' and other UNIX-specific commands are called, thus the command will not work on + non-UNIX environments. + + """ + + try: + remote, path = backup_utils.split_remote_and_path(dest) + except ValueError: + click.echo("Unsupported destination.") + return False + + backup_utils.validate_inputs(path, remote=remote, rsync_exe=rsync_exe) with dostore.container as container: - backup_utils.backup( + return backup_utils.backup_auto_folders( container, - Path(path), + path, remote=remote, - prev_backup=Path(prev_backup) if prev_backup else None, rsync_exe=rsync_exe, ) From 15e9982484ffd312bae38509a1097d3b924f263f Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Wed, 1 Nov 2023 19:32:57 +0100 Subject: [PATCH 03/34] implement keep; delete old backups --- disk_objectstore/backup_utils.py | 130 ++++++++++++++++++++----------- disk_objectstore/cli.py | 23 +++--- 2 files changed, 99 insertions(+), 54 deletions(-) diff --git a/disk_objectstore/backup_utils.py b/disk_objectstore/backup_utils.py index a4091e0..4eb17a9 100644 --- a/disk_objectstore/backup_utils.py +++ b/disk_objectstore/backup_utils.py @@ -2,9 +2,12 @@ Utilities to back up a container. """ +import datetime import logging +import random import shutil import sqlite3 +import string import subprocess import tempfile from pathlib import Path @@ -34,7 +37,7 @@ def is_exe_found(exe: str) -> bool: return shutil.which(exe) is not None -def run_cmd(args: list, remote: Optional[str] = None, check: bool = True) -> bool: +def run_cmd(args: list, remote: Optional[str] = None): """ Run a command locally or remotely. """ @@ -42,24 +45,24 @@ def run_cmd(args: list, remote: Optional[str] = None, check: bool = True) -> boo if remote: all_args = ["ssh", remote] + all_args - try: - res = subprocess.run(all_args, capture_output=True, text=True, check=check) - except subprocess.CalledProcessError as exc: - _log("Error: " + str(exc)) - return False + res = subprocess.run(all_args, capture_output=True, text=True, check=False) + exit_code = res.returncode - _log(f"stdout: {all_args}\n{res.stdout}") - _log(f"stderr: {all_args}\n{res.stderr}") + _log( + f"Command: {all_args}\n" + f" Exit Code: {exit_code}\n" + f" stdout/stderr: {res.stdout}\n{res.stderr}" + ) - success = not bool(res.returncode) + success = exit_code == 0 - return success + return success, res.stdout def check_if_remote_accessible(remote: str) -> bool: """Check if remote host is accessible via ssh""" - _log(f"Checking if '{remote}' is accessible...", end="") - success = run_cmd(["exit"], remote=remote) + _log(f"Checking if '{remote}' is accessible...") + success = run_cmd(["exit"], remote=remote)[0] if not success: _log(f"Error: Remote '{remote}' is not accessible!") return False @@ -69,7 +72,7 @@ def check_if_remote_accessible(remote: str) -> bool: def check_path_exists(path: Path, remote: Optional[str] = None) -> bool: cmd = ["[", "-e", str(path), "]"] - return run_cmd(cmd, remote=remote, check=False) + return run_cmd(cmd, remote=remote)[0] def call_rsync( # pylint: disable=too-many-arguments @@ -127,18 +130,22 @@ def call_rsync( # pylint: disable=too-many-arguments except subprocess.CalledProcessError as exc: _log(f"Error: {exc}") return False + exit_code = res.returncode + _log( + f"Command: {all_args}\n" + f" Exit Code: {exit_code}\n" + f" stdout/stderr: {res.stdout}\n{res.stderr}" + ) - _log(f"stdout: {all_args}\n{res.stdout}") - _log(f"stderr: {all_args}\n{res.stderr}") - - success = not bool(res.returncode) + success = exit_code == 0 return success def validate_inputs( path: Path, - remote: Optional[str] = None, + remote: Optional[str], + keep: int, rsync_exe: str = "rsync", ) -> bool: """Validate inputs to the backup cli command @@ -146,6 +153,10 @@ def validate_inputs( :return: True if validation passes, False otherwise. """ + if keep < 0: + _log("Error: keep variable can't be negative!") + return False + if remote: if not check_if_remote_accessible(remote): return False @@ -157,7 +168,7 @@ def validate_inputs( path_exists = check_path_exists(path, remote) if not path_exists: - success = run_cmd(["mkdir", str(path)], remote=remote) + success = run_cmd(["mkdir", str(path)], remote=remote)[0] if not success: _log(f"Error: Couldn't access/create '{str(path)}'!") return False @@ -261,18 +272,49 @@ def backup_container( # pylint: disable=too-many-return-statements, too-many-br return True +def delete_old_backups(path: Path, remote: Optional[str] = None, keep: int = 1) -> bool: + """Get all folders matching the backup pattern, and delete oldest ones.""" + success, stdout = run_cmd( + [ + "find", + str(path), + "-maxdepth", + "1", + "-type", + "d", + "-name", + "backup_*_*", + "-print", + ], + remote=remote, + ) + if not success: + return False + + sorted_folders = sorted(stdout.splitlines()) + to_delete = sorted_folders[: -(keep + 1)] + for folder in to_delete: + success = run_cmd(["rm", "-rf", folder], remote=remote)[0] + if success: + _log(f"Deleted old backup: {folder}") + else: + _log(f"Warning: couldn't delete old backup: {folder}") + return True + + def backup_auto_folders( container: Container, path: Path, remote: Optional[str] = None, + keep: int = 1, rsync_exe: str = "rsync", ): """Create a backup, managing live and previous backup folders automatically The running backup is done to `/live-backup`. When it completes, it is moved to - the final path: `/last-backup`. This done so that the last backup wouldn't be - corrupted, in case the live one crashes or gets interrupted. Rsync `link-dest` is used between - the two folders to keep the backups incremental and performant. + the final path: `/backup__` and the symlink `/last-backup will + be set to point to it. Rsync `link-dest` is used between live-backup and last-backup + to keep the backups incremental and performant. :param path: Path to where the backup will be created. If 'remote' is specified, must be an absolute path, @@ -282,46 +324,44 @@ def backup_auto_folders( Remote host of the backup location. 'ssh' executable is called via subprocess and therefore remote hosts configured for it are supported (e.g. via .ssh/config file). - :param kwargs: - * Executable paths if not default, e.g. 'rsync' - :return: True is successful and False if unsuccessful. """ live_folder = path / "live-backup" - last_folder = path / "last-backup" + last_symlink = path / "last-backup" - prev_exists = check_path_exists(last_folder, remote) + prev_exists = check_path_exists(last_symlink, remote) success = backup_container( container, live_folder, remote=remote, - prev_backup=last_folder if prev_exists else None, + prev_backup=last_symlink if prev_exists else None, rsync_exe=rsync_exe, ) if not success: return False - # move live-backup -> last-backup in a safe manner - # (such that if the process stops at any point, that we wouldn't lose data) - # step 1: last-backup -> last-backup-old - if prev_exists: - success = run_cmd( - ["mv", str(last_folder), str(last_folder) + "-old"], remote=remote - ) - if not success: - return False - # step 2: live-backup -> last-backup - success = run_cmd(["mv", str(live_folder), str(last_folder)], remote=remote) + # move live-backup -> backup__ + timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + randstr = "".join(random.choices(string.ascii_lowercase + string.digits, k=4)) + folder_name = f"backup_{timestamp}_{randstr}" + + success = run_cmd(["mv", str(live_folder), str(path / folder_name)], remote=remote)[ + 0 + ] if not success: return False - # step 3: remote last-backup-old - if prev_exists: - success = run_cmd(["rm", "-rf", str(last_folder) + "-old"], remote=remote) - if not success: - return False - _log(f"Backup moved from '{str(live_folder)}' to '{str(last_folder)}'.") + # update last-backup symlink + success = run_cmd( + ["ln", "-sfn", str(folder_name), str(last_symlink)], remote=remote + )[0] + if not success: + return False + _log(f"Backup moved from '{str(live_folder)}' to '{str(path / folder_name)}'.") + + delete_old_backups(path, remote=remote, keep=keep) + return True diff --git a/disk_objectstore/cli.py b/disk_objectstore/cli.py index 978cc8e..6465ef0 100644 --- a/disk_objectstore/cli.py +++ b/disk_objectstore/cli.py @@ -187,11 +187,11 @@ def optimize( @main.command("backup") @click.argument("dest", nargs=1, type=click.Path()) -# @click.option( -# "--keep", -# default=1, -# help="Number of previous backups to keep in the destination.", -# ) +@click.option( + "--keep", + default=1, + help="Number of previous backups to keep in the destination. (default: 1)", +) @click.option( "--rsync_exe", default="rsync", @@ -201,9 +201,11 @@ def optimize( def backup( dostore: ContainerContext, dest: str, + keep: int, rsync_exe: str, ): - """Create a backup of the container to a subfolder `last-backup` of the destination location DEST. + """Create a backup of the container to destination location DEST, in a subfolder + backup__ and point a symlink called `last-backup` to it. NOTE: This is safe to run while the container is being used. @@ -215,9 +217,8 @@ def backup( by OpenSSH, such as adding configuration options to ~/.ssh/config (e.g. to allow for passwordless login - recommended, since this script might ask multiple times for the password). - NOTE: 'rsync' and other UNIX-specific commands are called, thus the command will not work on + NOTE: 'rsync' and other UNIX-specific commands are called, thus the command will not work on non-UNIX environments. - """ try: @@ -226,12 +227,16 @@ def backup( click.echo("Unsupported destination.") return False - backup_utils.validate_inputs(path, remote=remote, rsync_exe=rsync_exe) + success = backup_utils.validate_inputs(path, remote, keep, rsync_exe=rsync_exe) + if not success: + click.echo("Input validation failed.") + return False with dostore.container as container: return backup_utils.backup_auto_folders( container, path, remote=remote, + keep=keep, rsync_exe=rsync_exe, ) From b5a30f2ee342f5d67711a69315357806b84b9e07 Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Wed, 1 Nov 2023 23:35:59 +0100 Subject: [PATCH 04/34] organized into a class & improved logging --- disk_objectstore/backup_utils.py | 613 ++++++++++++++++--------------- disk_objectstore/cli.py | 46 ++- 2 files changed, 336 insertions(+), 323 deletions(-) diff --git a/disk_objectstore/backup_utils.py b/disk_objectstore/backup_utils.py index 4eb17a9..1444619 100644 --- a/disk_objectstore/backup_utils.py +++ b/disk_objectstore/backup_utils.py @@ -15,13 +15,10 @@ from disk_objectstore.container import Container +logging.basicConfig() logger = logging.getLogger(__name__) -def _log(msg, end="\n"): - print(msg, end=end) - - def split_remote_and_path(dest: str): """extract remote and path from :""" split_dest = dest.split(":") @@ -30,338 +27,342 @@ def split_remote_and_path(dest: str): if len(split_dest) == 2: return split_dest[0], Path(split_dest[1]) # more than 1 colon: - raise ValueError + raise ValueError("Invalid destination format: :") def is_exe_found(exe: str) -> bool: return shutil.which(exe) is not None -def run_cmd(args: list, remote: Optional[str] = None): - """ - Run a command locally or remotely. - """ - all_args = args[:] - if remote: - all_args = ["ssh", remote] + all_args +class BackupUtilities: + """Utilities to make a backup of the disk-objectstore container""" + + def __init__( + self, dest: str, keep: int, rsync_exe: str, logger_: logging.Logger + ) -> None: + self.dest = dest + self.keep = keep + self.rsync_exe = rsync_exe + self.logger = logger_ + self.remote, self.path = split_remote_and_path(dest) + + def run_cmd(self, args: list): + """ + Run a command locally or remotely. + """ + all_args = args[:] + if self.remote: + all_args = ["ssh", self.remote] + all_args + + res = subprocess.run(all_args, capture_output=True, text=True, check=False) + exit_code = res.returncode + + self.logger.debug( + f"Command: {all_args}\n" + f" Exit Code: {exit_code}\n" + f" stdout/stderr: {res.stdout}\n{res.stderr}" + ) - res = subprocess.run(all_args, capture_output=True, text=True, check=False) - exit_code = res.returncode + success = exit_code == 0 - _log( - f"Command: {all_args}\n" - f" Exit Code: {exit_code}\n" - f" stdout/stderr: {res.stdout}\n{res.stderr}" - ) + return success, res.stdout - success = exit_code == 0 + def check_if_remote_accessible(self) -> bool: + """Check if remote host is accessible via ssh""" + self.logger.info(f"Checking if '{self.remote}' is accessible...") + success = self.run_cmd(["exit"])[0] + if not success: + self.logger.error(f"Remote '{self.remote}' is not accessible!") + return False + self.logger.info("Success! '%s' is accessible!", self.remote) + return True + + def check_path_exists(self, path: Path) -> bool: + cmd = ["[", "-e", str(path), "]"] + return self.run_cmd(cmd)[0] + + def validate_inputs(self) -> bool: + """Validate inputs to the backup cli command + + :return: + True if validation passes, False otherwise. + """ + if self.keep < 0: + self.logger.error("keep variable can't be negative!") + return False - return success, res.stdout + if self.remote: + if not self.check_if_remote_accessible(): + return False + if not is_exe_found(self.rsync_exe): + self.logger.error(f"{self.rsync_exe} not accessible.") + return False -def check_if_remote_accessible(remote: str) -> bool: - """Check if remote host is accessible via ssh""" - _log(f"Checking if '{remote}' is accessible...") - success = run_cmd(["exit"], remote=remote)[0] - if not success: - _log(f"Error: Remote '{remote}' is not accessible!") - return False - _log(f"Success! '{remote}' is accessible!") - return True + path_exists = self.check_path_exists(self.path) + + if not path_exists: + success = self.run_cmd(["mkdir", str(self.path)])[0] + if not success: + self.logger.error(f"Couldn't access/create '{str(self.path)}'!") + return False + + return True + + def call_rsync( # pylint: disable=too-many-arguments + self, + args: list, + src: Path, + dest: Path, + link_dest: Optional[Path] = None, + src_trailing_slash: bool = False, + dest_trailing_slash: bool = False, + ) -> bool: + """Call rsync with specified arguments and handle possible errors & stdout/stderr + + :param link_dest: + Path to the hardlinked files location (previous backup). + + :param src_trailing_slash: + Add a trailing slash to the source path. This makes rsync copy the contents + of the folder instead of the folder itself. + + :param dest_trailing_slash: + Add a trailing slash to the destination path. This makes rsync interpret the + destination as a folder and create it if it doesn't exists. + + :return: + True if successful and False if unsuccessful. + """ + + all_args = args[:] + if link_dest: + if not self.remote: + # for local paths, use resolve() to get absolute path + link_dest_str = str(link_dest.resolve()) + else: + # for remote paths, we require absolute paths anyways + link_dest_str = str(link_dest) + all_args += [f"--link-dest={link_dest_str}"] + + if src_trailing_slash: + all_args += [str(src) + "/"] + else: + all_args += [str(src)] + dest_str = str(dest) + if dest_trailing_slash: + dest_str += "/" -def check_path_exists(path: Path, remote: Optional[str] = None) -> bool: - cmd = ["[", "-e", str(path), "]"] - return run_cmd(cmd, remote=remote)[0] + if not self.remote: + all_args += [dest_str] + else: + all_args += [f"{self.remote}:{dest_str}"] + try: + res = subprocess.run(all_args, capture_output=True, text=True, check=True) + except subprocess.CalledProcessError as exc: + self.logger.error(f"{exc}") + return False + exit_code = res.returncode + + self.logger.debug( + "Command: %s\n Exit Code: %s\n stdout/stderr: %s\n%s", + str(all_args), + exit_code, + res.stdout, + res.stderr, + ) -def call_rsync( # pylint: disable=too-many-arguments - args: list, - src: Path, - dest: Path, - link_dest: Optional[Path] = None, - remote: Optional[str] = None, - src_trailing_slash: bool = False, - dest_trailing_slash: bool = False, -) -> bool: - """Call rsync with specified arguments and handle possible errors & stdout/stderr + success = exit_code == 0 - :param link_dest: - Path to the hardlinked files location (previous backup). + return success - :param src_trailing_slash: - Add a trailing slash to the source path. This makes rsync copy the contents - of the folder instead of the folder itself. + def backup_container( # pylint: disable=too-many-return-statements, too-many-branches + self, + container: Container, + path: Path, + prev_backup: Optional[Path] = None, + ) -> bool: + """Create a backup of the disk-objectstore container - :param dest_trailing_slash: - Add a trailing slash to the destination path. This makes rsync interpret the - destination as a folder and create it if it doesn't exists. + This is safe to perform when the container is being used. - :return: - True if successful and False if unsuccessful. - """ + It should be done in the following order: + 1) loose files; + 2) sqlite database; + 3) packed files. - all_args = args[:] - if link_dest: - if not remote: - # for local paths, use resolve() to get absolute path - link_dest_str = str(link_dest.resolve()) - else: - # for remote paths, we require absolute paths anyways - link_dest_str = str(link_dest) - all_args += [f"--link-dest={link_dest_str}"] - - if src_trailing_slash: - all_args += [str(src) + "/"] - else: - all_args += [str(src)] - - dest_str = str(dest) - if dest_trailing_slash: - dest_str += "/" - - if not remote: - all_args += [dest_str] - else: - all_args += [f"{remote}:{dest_str}"] - - try: - res = subprocess.run(all_args, capture_output=True, text=True, check=True) - except subprocess.CalledProcessError as exc: - _log(f"Error: {exc}") - return False - exit_code = res.returncode - _log( - f"Command: {all_args}\n" - f" Exit Code: {exit_code}\n" - f" stdout/stderr: {res.stdout}\n{res.stderr}" - ) - - success = exit_code == 0 - - return success - - -def validate_inputs( - path: Path, - remote: Optional[str], - keep: int, - rsync_exe: str = "rsync", -) -> bool: - """Validate inputs to the backup cli command - - :return: - True if validation passes, False otherwise. - """ - if keep < 0: - _log("Error: keep variable can't be negative!") - return False - - if remote: - if not check_if_remote_accessible(remote): - return False + :return: + True if successful and False if unsuccessful. + """ - if not is_exe_found(rsync_exe): - _log(f"Error: {rsync_exe} not accessible.") - return False + # subprocess arguments shared by all rsync calls: + rsync_args = [self.rsync_exe, "-azh", "-vv", "--no-whole-file"] - path_exists = check_path_exists(path, remote) + container_root_path = container.get_folder() + loose_path = container._get_loose_folder() # pylint: disable=protected-access + packs_path = container._get_pack_folder() # pylint: disable=protected-access + sqlite_path = ( + container._get_pack_index_path() # pylint: disable=protected-access + ) - if not path_exists: - success = run_cmd(["mkdir", str(path)], remote=remote)[0] + # step 1: back up loose files + loose_path_rel = loose_path.relative_to(container_root_path) + prev_backup_loose = prev_backup / loose_path_rel if prev_backup else None + success = self.call_rsync( + rsync_args, loose_path, path, link_dest=prev_backup_loose + ) + if not success: + return False + self.logger.info(f"Transferred {str(loose_path)} to {str(path)}") + + # step 2: back up sqlite db + + # make a temporary directory to dump sqlite db locally + with tempfile.TemporaryDirectory() as temp_dir_name: + sqlite_temp_loc = Path(temp_dir_name) / "packs.idx" + + # Safe way to make a backup of the sqlite db, while it might potentially be accessed + # https://docs.python.org/3/library/sqlite3.html#sqlite3.Connection.backup + src = sqlite3.connect(str(sqlite_path)) + dst = sqlite3.connect(str(sqlite_temp_loc)) + with dst: + src.backup(dst) + dst.close() + src.close() + + if sqlite_temp_loc.is_file(): + self.logger.info( + f"Dumped the SQLite database to {str(sqlite_temp_loc)}" + ) + else: + self.logger.error("'%s' was not created.", str(sqlite_temp_loc)) + return False + + # step 3: transfer the SQLITE database file + success = self.call_rsync( + rsync_args, sqlite_temp_loc, path, link_dest=prev_backup + ) + if not success: + return False + self.logger.info(f"Transferred SQLite database to {str(path)}") + + # step 4: transfer the packed files + packs_path_rel = packs_path.relative_to(container_root_path) + success = self.call_rsync(rsync_args, packs_path, path, link_dest=prev_backup) + if not success: + return False + self.logger.info(f"Transferred {str(packs_path)} to {str(path)}") + + # step 5: transfer anything else in the container folder + success = self.call_rsync( + rsync_args + + [ + "--exclude", + str(loose_path_rel), + "--exclude", + "packs.idx", + "--exclude", + str(packs_path_rel), + ], + container_root_path, + path, + link_dest=prev_backup, + src_trailing_slash=True, + ) if not success: - _log(f"Error: Couldn't access/create '{str(path)}'!") return False - return True - - -def backup_container( # pylint: disable=too-many-return-statements, too-many-branches - container: Container, - path: Path, - remote: Optional[str] = None, - prev_backup: Optional[Path] = None, - rsync_exe: str = "rsync", -) -> bool: - """Create a backup of the disk-objectstore container - - This is safe to perform when the container is being used. - - It should be done in the following order: - 1) loose files; - 2) sqlite database; - 3) packed files. - - :return: - True if successful and False if unsuccessful. - """ - - # subprocess arguments shared by all rsync calls: - rsync_args = [rsync_exe, "-azh", "-vv", "--no-whole-file"] - - container_root_path = container.get_folder() - loose_path = container._get_loose_folder() # pylint: disable=protected-access - packs_path = container._get_pack_folder() # pylint: disable=protected-access - sqlite_path = container._get_pack_index_path() # pylint: disable=protected-access - - # step 1: back up loose files - loose_path_rel = loose_path.relative_to(container_root_path) - prev_backup_loose = prev_backup / loose_path_rel if prev_backup else None - success = call_rsync( - rsync_args, loose_path, path, remote=remote, link_dest=prev_backup_loose - ) - if not success: - return False - - # step 2: back up sqlite db - - # make a temporary directory to dump sqlite db locally - with tempfile.TemporaryDirectory() as temp_dir_name: - sqlite_temp_loc = Path(temp_dir_name) / "packs.idx" - - # Safe way to make a backup of the sqlite db, while it might potentially be accessed - # https://docs.python.org/3/library/sqlite3.html#sqlite3.Connection.backup - src = sqlite3.connect(str(sqlite_path)) - dst = sqlite3.connect(str(sqlite_temp_loc)) - with dst: - src.backup(dst) - dst.close() - src.close() - - if sqlite_temp_loc.is_file(): - _log(f"Dumped the SQLite database to {str(sqlite_temp_loc)}") - else: - _log(f"Error: '{str(sqlite_temp_loc)}' was not created.") + return True + + def delete_old_backups(self) -> bool: + """Get all folders matching the backup pattern, and delete oldest ones.""" + success, stdout = self.run_cmd( + [ + "find", + str(self.path), + "-maxdepth", + "1", + "-type", + "d", + "-name", + "backup_*_*", + "-print", + ], + ) + if not success: return False - # step 3: transfer the SQLITE database file - success = call_rsync( - rsync_args, sqlite_temp_loc, path, remote=remote, link_dest=prev_backup + sorted_folders = sorted(stdout.splitlines()) + to_delete = sorted_folders[: -(self.keep + 1)] + for folder in to_delete: + success = self.run_cmd(["rm", "-rf", folder])[0] + if success: + self.logger.info(f"Deleted old backup: {folder}") + else: + self.logger.warning("Warning: couldn't delete old backup: %s", folder) + return True + + def backup_auto_folders( + self, + container: Container, + ): + """Create a backup, managing live and previous backup folders automatically + + The running backup is done to `/live-backup`. When it completes, it is moved to + the final path: `/backup__` and the symlink `/last-backup will + be set to point to it. Rsync `link-dest` is used between live-backup and last-backup + to keep the backups incremental and performant. + + :param path: + Path to where the backup will be created. If 'remote' is specified, must be an absolute path, + otherwise can be relative. + + :param remote: + Remote host of the backup location. 'ssh' executable is called via subprocess and therefore remote + hosts configured for it are supported (e.g. via .ssh/config file). + + :return: + True is successful and False if unsuccessful. + """ + + live_folder = self.path / "live-backup" + last_symlink = self.path / "last-backup" + + prev_exists = self.check_path_exists(last_symlink) + if prev_exists: + self.logger.info( + f"'{str(last_symlink)}' exists, using it for rsync --link-dest." + ) + + success = self.backup_container( + container, + live_folder, + prev_backup=last_symlink if prev_exists else None, ) if not success: return False - # step 4: transfer the packed files - packs_path_rel = packs_path.relative_to(container_root_path) - success = call_rsync( - rsync_args, packs_path, path, remote=remote, link_dest=prev_backup - ) - if not success: - return False - - # step 5: transfer anything else in the container folder - success = call_rsync( - rsync_args - + [ - "--exclude", - str(loose_path_rel), - "--exclude", - "packs.idx", - "--exclude", - str(packs_path_rel), - ], - container_root_path, - path, - link_dest=prev_backup, - remote=remote, - src_trailing_slash=True, - ) - if not success: - return False - - return True - - -def delete_old_backups(path: Path, remote: Optional[str] = None, keep: int = 1) -> bool: - """Get all folders matching the backup pattern, and delete oldest ones.""" - success, stdout = run_cmd( - [ - "find", - str(path), - "-maxdepth", - "1", - "-type", - "d", - "-name", - "backup_*_*", - "-print", - ], - remote=remote, - ) - if not success: - return False - - sorted_folders = sorted(stdout.splitlines()) - to_delete = sorted_folders[: -(keep + 1)] - for folder in to_delete: - success = run_cmd(["rm", "-rf", folder], remote=remote)[0] - if success: - _log(f"Deleted old backup: {folder}") - else: - _log(f"Warning: couldn't delete old backup: {folder}") - return True - - -def backup_auto_folders( - container: Container, - path: Path, - remote: Optional[str] = None, - keep: int = 1, - rsync_exe: str = "rsync", -): - """Create a backup, managing live and previous backup folders automatically - - The running backup is done to `/live-backup`. When it completes, it is moved to - the final path: `/backup__` and the symlink `/last-backup will - be set to point to it. Rsync `link-dest` is used between live-backup and last-backup - to keep the backups incremental and performant. - - :param path: - Path to where the backup will be created. If 'remote' is specified, must be an absolute path, - otherwise can be relative. - - :param remote: - Remote host of the backup location. 'ssh' executable is called via subprocess and therefore remote - hosts configured for it are supported (e.g. via .ssh/config file). - - :return: - True is successful and False if unsuccessful. - """ - - live_folder = path / "live-backup" - last_symlink = path / "last-backup" - - prev_exists = check_path_exists(last_symlink, remote) - - success = backup_container( - container, - live_folder, - remote=remote, - prev_backup=last_symlink if prev_exists else None, - rsync_exe=rsync_exe, - ) - if not success: - return False - - # move live-backup -> backup__ - timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S") - randstr = "".join(random.choices(string.ascii_lowercase + string.digits, k=4)) - folder_name = f"backup_{timestamp}_{randstr}" - - success = run_cmd(["mv", str(live_folder), str(path / folder_name)], remote=remote)[ - 0 - ] - if not success: - return False - - # update last-backup symlink - success = run_cmd( - ["ln", "-sfn", str(folder_name), str(last_symlink)], remote=remote - )[0] - if not success: - return False - _log(f"Backup moved from '{str(live_folder)}' to '{str(path / folder_name)}'.") - - delete_old_backups(path, remote=remote, keep=keep) - - return True + # move live-backup -> backup__ + timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + randstr = "".join(random.choices(string.ascii_lowercase + string.digits, k=4)) + folder_name = f"backup_{timestamp}_{randstr}" + + success = self.run_cmd(["mv", str(live_folder), str(self.path / folder_name)])[ + 0 + ] + if not success: + return False + + # update last-backup symlink + success = self.run_cmd(["ln", "-sfn", str(folder_name), str(last_symlink)])[0] + if not success: + return False + self.logger.info( + f"Backup moved from '{str(live_folder)}' to '{str(self.path / folder_name)}'." + ) + + self.delete_old_backups() + + return True diff --git a/disk_objectstore/cli.py b/disk_objectstore/cli.py index 6465ef0..b41e95b 100644 --- a/disk_objectstore/cli.py +++ b/disk_objectstore/cli.py @@ -1,6 +1,7 @@ """A small CLI tool for managing stores.""" import dataclasses import json +import logging import os import sys from pathlib import Path @@ -197,12 +198,14 @@ def optimize( default="rsync", help="Specify the 'rsync' executable, if not in PATH. Used for both local and remote destinations.", ) +@click.option( + "--verbosity", + default="info", + help="Set verbosity [silent|info|debug], default is 'info'.", +) @pass_dostore def backup( - dostore: ContainerContext, - dest: str, - keep: int, - rsync_exe: str, + dostore: ContainerContext, dest: str, keep: int, rsync_exe: str, verbosity: str ): """Create a backup of the container to destination location DEST, in a subfolder backup__ and point a symlink called `last-backup` to it. @@ -221,22 +224,31 @@ def backup( non-UNIX environments. """ + if verbosity == "silent": + backup_utils.logger.setLevel(logging.ERROR) + elif verbosity == "info": + backup_utils.logger.setLevel(logging.INFO) + elif verbosity == "debug": + backup_utils.logger.setLevel(logging.DEBUG) + else: + click.echo("Unsupported verbosity.") + return + try: - remote, path = backup_utils.split_remote_and_path(dest) - except ValueError: - click.echo("Unsupported destination.") - return False + backup_utils_instance = backup_utils.BackupUtilities( + dest, keep, rsync_exe, backup_utils.logger + ) + except ValueError as e: + click.echo(f"Error: {e}") + return - success = backup_utils.validate_inputs(path, remote, keep, rsync_exe=rsync_exe) + success = backup_utils_instance.validate_inputs() if not success: click.echo("Input validation failed.") - return False + return with dostore.container as container: - return backup_utils.backup_auto_folders( - container, - path, - remote=remote, - keep=keep, - rsync_exe=rsync_exe, - ) + success = backup_utils_instance.backup_auto_folders(container) + if not success: + click.echo("Error: backup failed.") + return From 15bdb18285f42f6d172df7d86ad1ff6da33690d3 Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Thu, 2 Nov 2023 11:30:31 +0100 Subject: [PATCH 05/34] plug-in func to backup_auto_folders (for aiida-core) --- disk_objectstore/backup_utils.py | 63 +++++++++++++++++++------------- disk_objectstore/cli.py | 6 ++- 2 files changed, 43 insertions(+), 26 deletions(-) diff --git a/disk_objectstore/backup_utils.py b/disk_objectstore/backup_utils.py index 1444619..6a784e7 100644 --- a/disk_objectstore/backup_utils.py +++ b/disk_objectstore/backup_utils.py @@ -11,7 +11,7 @@ import subprocess import tempfile from pathlib import Path -from typing import Optional +from typing import Callable, Optional from disk_objectstore.container import Container @@ -35,7 +35,9 @@ def is_exe_found(exe: str) -> bool: class BackupUtilities: - """Utilities to make a backup of the disk-objectstore container""" + """Utilities to make a backup of the disk-objectstore container + and to manage backups in general (designed to also be used by aiida-core) + """ def __init__( self, dest: str, keep: int, rsync_exe: str, logger_: logging.Logger @@ -46,6 +48,9 @@ def __init__( self.logger = logger_ self.remote, self.path = split_remote_and_path(dest) + # subprocess arguments shared by all rsync calls: + self.rsync_args = [self.rsync_exe, "-azh", "-vv", "--no-whole-file"] + def run_cmd(self, args: list): """ Run a command locally or remotely. @@ -81,7 +86,7 @@ def check_path_exists(self, path: Path) -> bool: cmd = ["[", "-e", str(path), "]"] return self.run_cmd(cmd)[0] - def validate_inputs(self) -> bool: + def validate_inputs(self, additional_exes: Optional[list] = None) -> bool: """Validate inputs to the backup cli command :return: @@ -99,6 +104,12 @@ def validate_inputs(self) -> bool: self.logger.error(f"{self.rsync_exe} not accessible.") return False + if additional_exes: + for add_exe in additional_exes: + if not is_exe_found(add_exe): + self.logger.error(f"{add_exe} not accessible.") + return False + path_exists = self.check_path_exists(self.path) if not path_exists: @@ -111,12 +122,12 @@ def validate_inputs(self) -> bool: def call_rsync( # pylint: disable=too-many-arguments self, - args: list, src: Path, dest: Path, link_dest: Optional[Path] = None, src_trailing_slash: bool = False, dest_trailing_slash: bool = False, + extra_args: Optional[list] = None, ) -> bool: """Call rsync with specified arguments and handle possible errors & stdout/stderr @@ -135,7 +146,9 @@ def call_rsync( # pylint: disable=too-many-arguments True if successful and False if unsuccessful. """ - all_args = args[:] + all_args = self.rsync_args[:] + if extra_args: + all_args += extra_args if link_dest: if not self.remote: # for local paths, use resolve() to get absolute path @@ -197,9 +210,6 @@ def backup_container( # pylint: disable=too-many-return-statements, too-many-br True if successful and False if unsuccessful. """ - # subprocess arguments shared by all rsync calls: - rsync_args = [self.rsync_exe, "-azh", "-vv", "--no-whole-file"] - container_root_path = container.get_folder() loose_path = container._get_loose_folder() # pylint: disable=protected-access packs_path = container._get_pack_folder() # pylint: disable=protected-access @@ -210,9 +220,7 @@ def backup_container( # pylint: disable=too-many-return-statements, too-many-br # step 1: back up loose files loose_path_rel = loose_path.relative_to(container_root_path) prev_backup_loose = prev_backup / loose_path_rel if prev_backup else None - success = self.call_rsync( - rsync_args, loose_path, path, link_dest=prev_backup_loose - ) + success = self.call_rsync(loose_path, path, link_dest=prev_backup_loose) if not success: return False self.logger.info(f"Transferred {str(loose_path)} to {str(path)}") @@ -241,24 +249,25 @@ def backup_container( # pylint: disable=too-many-return-statements, too-many-br return False # step 3: transfer the SQLITE database file - success = self.call_rsync( - rsync_args, sqlite_temp_loc, path, link_dest=prev_backup - ) + success = self.call_rsync(sqlite_temp_loc, path, link_dest=prev_backup) if not success: return False self.logger.info(f"Transferred SQLite database to {str(path)}") # step 4: transfer the packed files packs_path_rel = packs_path.relative_to(container_root_path) - success = self.call_rsync(rsync_args, packs_path, path, link_dest=prev_backup) + success = self.call_rsync(packs_path, path, link_dest=prev_backup) if not success: return False self.logger.info(f"Transferred {str(packs_path)} to {str(path)}") # step 5: transfer anything else in the container folder success = self.call_rsync( - rsync_args - + [ + container_root_path, + path, + link_dest=prev_backup, + src_trailing_slash=True, + extra_args=[ "--exclude", str(loose_path_rel), "--exclude", @@ -266,10 +275,6 @@ def backup_container( # pylint: disable=too-many-return-statements, too-many-br "--exclude", str(packs_path_rel), ], - container_root_path, - path, - link_dest=prev_backup, - src_trailing_slash=True, ) if not success: return False @@ -306,7 +311,7 @@ def delete_old_backups(self) -> bool: def backup_auto_folders( self, - container: Container, + backup_function: Callable, ): """Create a backup, managing live and previous backup folders automatically @@ -315,6 +320,10 @@ def backup_auto_folders( be set to point to it. Rsync `link-dest` is used between live-backup and last-backup to keep the backups incremental and performant. + :param backup_function: + Function that is used to make a single backup. Needs to have two arguments: path and + previous_backup location (which can be None). + :param path: Path to where the backup will be created. If 'remote' is specified, must be an absolute path, otherwise can be relative. @@ -336,10 +345,14 @@ def backup_auto_folders( f"'{str(last_symlink)}' exists, using it for rsync --link-dest." ) - success = self.backup_container( - container, + # success = self.backup_container( + # container, + # live_folder, + # prev_backup=last_symlink if prev_exists else None, + # ) + success = backup_function( live_folder, - prev_backup=last_symlink if prev_exists else None, + last_symlink if prev_exists else None, ) if not success: return False diff --git a/disk_objectstore/cli.py b/disk_objectstore/cli.py index b41e95b..bb4c88d 100644 --- a/disk_objectstore/cli.py +++ b/disk_objectstore/cli.py @@ -248,7 +248,11 @@ def backup( return with dostore.container as container: - success = backup_utils_instance.backup_auto_folders(container) + success = backup_utils_instance.backup_auto_folders( + lambda path, prev: backup_utils_instance.backup_container( + container, path, prev_backup=prev + ) + ) if not success: click.echo("Error: backup failed.") return From ccf56c6a5a3ece23c355d55e23c21e631f4c972b Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Tue, 7 Nov 2023 16:46:51 +0100 Subject: [PATCH 06/34] omit symlink without error, if fs doesn't support it --- disk_objectstore/backup_utils.py | 81 ++++++++++++++++++-------------- disk_objectstore/cli.py | 4 ++ tests/test_cli.py | 17 +++++++ 3 files changed, 67 insertions(+), 35 deletions(-) diff --git a/disk_objectstore/backup_utils.py b/disk_objectstore/backup_utils.py index 6a784e7..197c447 100644 --- a/disk_objectstore/backup_utils.py +++ b/disk_objectstore/backup_utils.py @@ -51,7 +51,7 @@ def __init__( # subprocess arguments shared by all rsync calls: self.rsync_args = [self.rsync_exe, "-azh", "-vv", "--no-whole-file"] - def run_cmd(self, args: list): + def run_cmd(self, args: list, check: bool = False): """ Run a command locally or remotely. """ @@ -59,7 +59,7 @@ def run_cmd(self, args: list): if self.remote: all_args = ["ssh", self.remote] + all_args - res = subprocess.run(all_args, capture_output=True, text=True, check=False) + res = subprocess.run(all_args, capture_output=True, text=True, check=check) exit_code = res.returncode self.logger.debug( @@ -281,9 +281,9 @@ def backup_container( # pylint: disable=too-many-return-statements, too-many-br return True - def delete_old_backups(self) -> bool: - """Get all folders matching the backup pattern, and delete oldest ones.""" - success, stdout = self.run_cmd( + def get_existing_backup_folders(self): + """Get all folders matching the backup folder name pattern.""" + _, stdout = self.run_cmd( [ "find", str(self.path), @@ -295,11 +295,19 @@ def delete_old_backups(self) -> bool: "backup_*_*", "-print", ], + check=True, ) - if not success: - return False - sorted_folders = sorted(stdout.splitlines()) + return stdout.splitlines() + + def get_last_backup_folder(self): + """Get the latest backup folder, if it exists.""" + existing_backups = self.get_existing_backup_folders() + return Path(sorted(existing_backups)[-1]) if existing_backups else None + + def delete_old_backups(self): + """Get all folders matching the backup pattern, and delete oldest ones.""" + sorted_folders = sorted(self.get_existing_backup_folders()) to_delete = sorted_folders[: -(self.keep + 1)] for folder in to_delete: success = self.run_cmd(["rm", "-rf", folder])[0] @@ -307,7 +315,6 @@ def delete_old_backups(self) -> bool: self.logger.info(f"Deleted old backup: {folder}") else: self.logger.warning("Warning: couldn't delete old backup: %s", folder) - return True def backup_auto_folders( self, @@ -316,43 +323,36 @@ def backup_auto_folders( """Create a backup, managing live and previous backup folders automatically The running backup is done to `/live-backup`. When it completes, it is moved to - the final path: `/backup__` and the symlink `/last-backup will - be set to point to it. Rsync `link-dest` is used between live-backup and last-backup - to keep the backups incremental and performant. + the final path: `/backup__`. If the filesystem supports it, + the symlink `/last-backup` is added to point to the last backup. + Rsync `link-dest` is used to keep the backups incremental and performant. :param backup_function: Function that is used to make a single backup. Needs to have two arguments: path and previous_backup location (which can be None). - :param path: - Path to where the backup will be created. If 'remote' is specified, must be an absolute path, - otherwise can be relative. - - :param remote: - Remote host of the backup location. 'ssh' executable is called via subprocess and therefore remote - hosts configured for it are supported (e.g. via .ssh/config file). - :return: True is successful and False if unsuccessful. """ live_folder = self.path / "live-backup" - last_symlink = self.path / "last-backup" - prev_exists = self.check_path_exists(last_symlink) - if prev_exists: + try: + last_folder = self.get_last_backup_folder() + except subprocess.CalledProcessError: + self.logger.error("Couldn't determine last backup.") + return False + + if last_folder: self.logger.info( - f"'{str(last_symlink)}' exists, using it for rsync --link-dest." + f"Last backup is '{str(last_folder)}', using it for rsync --link-dest." ) + else: + self.logger.info("Couldn't find a previous backup to increment from.") - # success = self.backup_container( - # container, - # live_folder, - # prev_backup=last_symlink if prev_exists else None, - # ) success = backup_function( live_folder, - last_symlink if prev_exists else None, + last_folder, ) if not success: return False @@ -368,14 +368,25 @@ def backup_auto_folders( if not success: return False - # update last-backup symlink - success = self.run_cmd(["ln", "-sfn", str(folder_name), str(last_symlink)])[0] - if not success: - return False self.logger.info( f"Backup moved from '{str(live_folder)}' to '{str(self.path / folder_name)}'." ) - self.delete_old_backups() + symlink_name = "last-backup" + success = self.run_cmd( + ["ln", "-sfn", str(folder_name), str(self.path / symlink_name)] + )[0] + if not success: + self.logger.warning( + f"Couldn't create symlink '{symlink_name}'. Perhaps the filesystem doesn't support it." + ) + else: + self.logger.info(f"Added symlink '{symlink_name}' to '{folder_name}'.") + + try: + self.delete_old_backups() + except subprocess.CalledProcessError: + self.logger.error("Failed to delete old backups.") + return False return True diff --git a/disk_objectstore/cli.py b/disk_objectstore/cli.py index bb4c88d..b58652d 100644 --- a/disk_objectstore/cli.py +++ b/disk_objectstore/cli.py @@ -212,9 +212,13 @@ def backup( NOTE: This is safe to run while the container is being used. + NOTE: the symlink `last-backup` is omitted if the filesystem doesn't support it. + Destination (DEST) can either be a local path, or a remote destination (reachable via ssh). In the latter case, remote destination needs to have the following syntax: + [@]: + i.e., contain the remote host name and the remote path, separated by a colon (and optionally the remote user separated by an @ symbol). You can tune SSH parameters using the standard options given by OpenSSH, such as adding configuration options to ~/.ssh/config (e.g. to allow for passwordless diff --git a/tests/test_cli.py b/tests/test_cli.py index 9280471..34b1b4d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -185,3 +185,20 @@ def myimport( assert result.exit_code == 0 assert "INFO: no `tqdm` package found" in result.stdout assert "No errors found" in result.stdout + + +def test_backup(temp_container, temp_dir): + """Test the backup command""" + + temp_container.init_container(clear=True) + # Add a few objects + for idx in range(100): + temp_container.add_object(f"test-{idx}".encode()) + + obj = cli.ContainerContext(temp_container.get_folder()) + + path = Path(temp_dir) / "backup" + result = CliRunner().invoke(cli.backup, [str(path)], obj=obj) + + assert result.exit_code == 0 + assert path.exists() From f5441cc035cc769e354dcc8cfc5dd97778a2fe02 Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Wed, 8 Nov 2023 14:20:44 +0100 Subject: [PATCH 07/34] add cli tests --- tests/test_cli.py | 75 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 73 insertions(+), 2 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 34b1b4d..bf124f8 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -187,7 +187,17 @@ def myimport( assert "No errors found" in result.stdout -def test_backup(temp_container, temp_dir): +@pytest.mark.parametrize( + "remote, verbosity", + [ + (False, None), + (False, "silent"), + (False, "info"), + (False, "debug"), + (True, None), + ], +) +def test_backup(temp_container, temp_dir, remote, verbosity): """Test the backup command""" temp_container.init_container(clear=True) @@ -198,7 +208,68 @@ def test_backup(temp_container, temp_dir): obj = cli.ContainerContext(temp_container.get_folder()) path = Path(temp_dir) / "backup" - result = CliRunner().invoke(cli.backup, [str(path)], obj=obj) + + if remote: + destination = f"localhost:{str(path)}" + else: + destination = str(path) + + args = [destination] + + if verbosity: + args += [f"--verbosity={verbosity}"] + + result = CliRunner().invoke(cli.backup, args, obj=obj) assert result.exit_code == 0 assert path.exists() + + path_contents = [entry.name for entry in path.iterdir()] + backup_dirs = [ + entry for entry in path.iterdir() if entry.name.startswith("backup_") + ] + + assert "last-backup" in path_contents + assert len(backup_dirs) == 1 + + backup_dir_contents = [entry.name for entry in backup_dirs[0].iterdir()] + + for item in ["config.json", "duplicates", "loose", "packs", "packs.idx", "sandbox"]: + assert item in backup_dir_contents + + # validate the backup + + obj = cli.ContainerContext(backup_dirs[0]) + result = CliRunner().invoke(cli.validate, obj=obj) + + assert result.exit_code == 0 + assert "No errors found" in result.stdout + + +def test_backup_repeated(temp_container, temp_dir): + """Test the backup command repeated 3 times. + + Considering --keep 1 is default, the last one should get deleted. + """ + + temp_container.init_container(clear=True) + # Add a few objects + for idx in range(100): + temp_container.add_object(f"test-{idx}".encode()) + + obj = cli.ContainerContext(temp_container.get_folder()) + + path = Path(temp_dir) / "backup" + + for _ in range(3): + result = CliRunner().invoke(cli.backup, [str(path)], obj=obj) + assert result.exit_code == 0 + + assert path.exists() + path_contents = [entry.name for entry in path.iterdir()] + backup_dirs = [ + entry for entry in path.iterdir() if entry.name.startswith("backup_") + ] + + assert "last-backup" in path_contents + assert len(backup_dirs) == 2 From 9de905ada2e4315def055a2f1037e158525286d3 Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Wed, 8 Nov 2023 14:36:32 +0100 Subject: [PATCH 08/34] ci: make 'ssh localhost' work --- .github/workflows/ci.yml | 73 ++++++++++++------------ .github/workflows/setup-ssh-localhost.sh | 6 ++ 2 files changed, 43 insertions(+), 36 deletions(-) create mode 100755 .github/workflows/setup-ssh-localhost.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 08249f3..972fd6f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,22 +6,19 @@ name: Continuous integration on: [push, pull_request] jobs: - pre-commit: - runs-on: ubuntu-latest timeout-minutes: 5 steps: - - uses: actions/checkout@v2 - - name: Set up Python 3.8 - uses: actions/setup-python@v2 - with: - python-version: "3.8" - - uses: pre-commit/action@v2.0.0 + - uses: actions/checkout@v2 + - name: Set up Python 3.8 + uses: actions/setup-python@v2 + with: + python-version: "3.8" + - uses: pre-commit/action@v2.0.0 tests: - runs-on: ${{ matrix.os }} strategy: fail-fast: false @@ -30,30 +27,34 @@ jobs: python-version: ["3.8", "3.9", "3.10", "3.11"] steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v1 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies (including dev dependencies at frozen version) - # I'm using pip install -e to make sure that the coverage properly traces the runs - # also of the concurrent tests (maybe we can achieve this differently) - run: | - python -m pip install --upgrade pip - pip install -e .[progressbar,optionaltests] - pip install -r requirements.lock - - name: Test with pytest - # No need to run the benchmarks, they will run in a different workflow - # Also, run in very verbose mode so if there is an error we get a complete diff - run: pytest -vv --cov=disk_objectstore --benchmark-skip - env: - SQLALCHEMY_WARN_20: 1 - - name: Create xml coverage - run: coverage xml - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v3 - with: - files: ./coverage.xml - name: disk-objectstore - ## Commenting the following lines - if often fails, and if at least one manages to push, it should be enough - # fail_ci_if_error: true + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - name: set up 'ssh localhost' + run: | + .github/workflows/setup-ssh-localhost.sh + ssh localhost + - name: Install dependencies (including dev dependencies at frozen version) + # I'm using pip install -e to make sure that the coverage properly traces the runs + # also of the concurrent tests (maybe we can achieve this differently) + run: | + python -m pip install --upgrade pip + pip install -e .[progressbar,optionaltests] + pip install -r requirements.lock + - name: Test with pytest + # No need to run the benchmarks, they will run in a different workflow + # Also, run in very verbose mode so if there is an error we get a complete diff + run: pytest -vv --cov=disk_objectstore --benchmark-skip + env: + SQLALCHEMY_WARN_20: 1 + - name: Create xml coverage + run: coverage xml + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v3 + with: + files: ./coverage.xml + name: disk-objectstore + ## Commenting the following lines - if often fails, and if at least one manages to push, it should be enough + # fail_ci_if_error: true diff --git a/.github/workflows/setup-ssh-localhost.sh b/.github/workflows/setup-ssh-localhost.sh new file mode 100755 index 0000000..df70959 --- /dev/null +++ b/.github/workflows/setup-ssh-localhost.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash +set -ev + +ssh-keygen -q -t rsa -b 4096 -N "" -f "${HOME}/.ssh/id_rsa" +ssh-keygen -y -f "${HOME}/.ssh/id_rsa" >> "${HOME}/.ssh/authorized_keys" +ssh-keyscan -H localhost >> "${HOME}/.ssh/known_hosts" From a2031d91e53468d9ef465f3112bd188bb73c7135 Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Wed, 8 Nov 2023 15:14:07 +0100 Subject: [PATCH 09/34] cli: return 1 if failure --- disk_objectstore/cli.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/disk_objectstore/cli.py b/disk_objectstore/cli.py index b58652d..57532f5 100644 --- a/disk_objectstore/cli.py +++ b/disk_objectstore/cli.py @@ -236,7 +236,7 @@ def backup( backup_utils.logger.setLevel(logging.DEBUG) else: click.echo("Unsupported verbosity.") - return + return 1 try: backup_utils_instance = backup_utils.BackupUtilities( @@ -244,12 +244,12 @@ def backup( ) except ValueError as e: click.echo(f"Error: {e}") - return + return 1 success = backup_utils_instance.validate_inputs() if not success: click.echo("Input validation failed.") - return + return 1 with dostore.container as container: success = backup_utils_instance.backup_auto_folders( @@ -259,4 +259,5 @@ def backup( ) if not success: click.echo("Error: backup failed.") - return + return 1 + return 0 From c030168847c506c1c4c8f8a14218c82416467ec0 Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Tue, 28 Nov 2023 21:12:13 +0100 Subject: [PATCH 10/34] fix CI macos ssh localhost --- .github/workflows/ci.yml | 2 +- .github/workflows/setup-ssh-localhost.sh | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 972fd6f..73f6177 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,7 +35,7 @@ jobs: - name: set up 'ssh localhost' run: | .github/workflows/setup-ssh-localhost.sh - ssh localhost + ssh -v localhost - name: Install dependencies (including dev dependencies at frozen version) # I'm using pip install -e to make sure that the coverage properly traces the runs # also of the concurrent tests (maybe we can achieve this differently) diff --git a/.github/workflows/setup-ssh-localhost.sh b/.github/workflows/setup-ssh-localhost.sh index df70959..ab3347a 100755 --- a/.github/workflows/setup-ssh-localhost.sh +++ b/.github/workflows/setup-ssh-localhost.sh @@ -4,3 +4,6 @@ set -ev ssh-keygen -q -t rsa -b 4096 -N "" -f "${HOME}/.ssh/id_rsa" ssh-keygen -y -f "${HOME}/.ssh/id_rsa" >> "${HOME}/.ssh/authorized_keys" ssh-keyscan -H localhost >> "${HOME}/.ssh/known_hosts" + +chmod 700 "${HOME}/.ssh" +chmod 600 "${HOME}/.ssh"/* From bd116510cd24de740f468037172dceb4b9d4dfe3 Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Tue, 28 Nov 2023 21:27:25 +0100 Subject: [PATCH 11/34] skip backup tests on windows --- .github/workflows/ci.yml | 3 +++ tests/test_cli.py | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 73f6177..177bbb6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -32,7 +32,10 @@ jobs: uses: actions/setup-python@v1 with: python-version: ${{ matrix.python-version }} + # Set up 'ssh localhost' that is used in testing the backup command + # skipped for windows, as it doesn't support this setup or the backup command - name: set up 'ssh localhost' + if: matrix.os != 'windows-latest' run: | .github/workflows/setup-ssh-localhost.sh ssh -v localhost diff --git a/tests/test_cli.py b/tests/test_cli.py index bf124f8..0629484 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,4 +1,5 @@ """Test the CLI commands""" +import platform from pathlib import Path import pytest @@ -187,6 +188,9 @@ def myimport( assert "No errors found" in result.stdout +@pytest.mark.skipif( + platform.system() == "Windows", reason="Backup not supported on Windows" +) @pytest.mark.parametrize( "remote, verbosity", [ @@ -246,6 +250,9 @@ def test_backup(temp_container, temp_dir, remote, verbosity): assert "No errors found" in result.stdout +@pytest.mark.skipif( + platform.system() == "Windows", reason="Backup not supported on Windows" +) def test_backup_repeated(temp_container, temp_dir): """Test the backup command repeated 3 times. From c5ecf2fbbe1e1de28bf8ca1d8eb9cf8854fd25f4 Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Wed, 6 Dec 2023 01:42:16 +0100 Subject: [PATCH 12/34] reorganize --- disk_objectstore/backup_utils.py | 334 ++++++++++++++++--------------- disk_objectstore/cli.py | 35 ++-- 2 files changed, 185 insertions(+), 184 deletions(-) diff --git a/disk_objectstore/backup_utils.py b/disk_objectstore/backup_utils.py index 197c447..750a15b 100644 --- a/disk_objectstore/backup_utils.py +++ b/disk_objectstore/backup_utils.py @@ -16,7 +16,11 @@ from disk_objectstore.container import Container logging.basicConfig() -logger = logging.getLogger(__name__) +backup_logger = logging.getLogger(__name__) + + +class BackupError(Exception): + "Raised when backup fails." def split_remote_and_path(dest: str): @@ -27,50 +31,40 @@ def split_remote_and_path(dest: str): if len(split_dest) == 2: return split_dest[0], Path(split_dest[1]) # more than 1 colon: - raise ValueError("Invalid destination format: :") + raise BackupError("Invalid destination format: :") def is_exe_found(exe: str) -> bool: return shutil.which(exe) is not None -class BackupUtilities: - """Utilities to make a backup of the disk-objectstore container - and to manage backups in general (designed to also be used by aiida-core) +class BackupManager: + """ + Class that contains all configuration and utility functions to create + backups except for the backup function itself, which is passed in according + to what is backed up (e.g. the disk-objectstore container in for this repo, + or the whole aiida storage in aiida-core) """ def __init__( - self, dest: str, keep: int, rsync_exe: str, logger_: logging.Logger + self, + dest: str, + keep: int, + logger: logging.Logger, + exes: dict, ) -> None: self.dest = dest self.keep = keep - self.rsync_exe = rsync_exe - self.logger = logger_ + self.logger = logger self.remote, self.path = split_remote_and_path(dest) - # subprocess arguments shared by all rsync calls: - self.rsync_args = [self.rsync_exe, "-azh", "-vv", "--no-whole-file"] - - def run_cmd(self, args: list, check: bool = False): - """ - Run a command locally or remotely. - """ - all_args = args[:] - if self.remote: - all_args = ["ssh", self.remote] + all_args - - res = subprocess.run(all_args, capture_output=True, text=True, check=check) - exit_code = res.returncode + self.exes = exes - self.logger.debug( - f"Command: {all_args}\n" - f" Exit Code: {exit_code}\n" - f" stdout/stderr: {res.stdout}\n{res.stderr}" - ) - - success = exit_code == 0 + # make sure rsync gets added so it gets validated + if "rsync" not in self.exes: + self.exes["rsync"] = "rsync" - return success, res.stdout + self.validate() def check_if_remote_accessible(self) -> bool: """Check if remote host is accessible via ssh""" @@ -86,39 +80,61 @@ def check_path_exists(self, path: Path) -> bool: cmd = ["[", "-e", str(path), "]"] return self.run_cmd(cmd)[0] - def validate_inputs(self, additional_exes: Optional[list] = None) -> bool: - """Validate inputs to the backup cli command + def validate(self): + """Validate the backup config inputs :return: True if validation passes, False otherwise. """ if self.keep < 0: - self.logger.error("keep variable can't be negative!") - return False + raise BackupError( + "Input validation failed: keep variable can't be negative!" + ) if self.remote: if not self.check_if_remote_accessible(): - return False - - if not is_exe_found(self.rsync_exe): - self.logger.error(f"{self.rsync_exe} not accessible.") - return False + raise BackupError( + "Input validation failed: keep variable can't be negative!" + ) - if additional_exes: - for add_exe in additional_exes: - if not is_exe_found(add_exe): - self.logger.error(f"{add_exe} not accessible.") - return False + if self.exes: + for _, path in self.exes.items(): + if not is_exe_found(path): + raise BackupError( + f"Input validation failed: {path} not accessible." + ) path_exists = self.check_path_exists(self.path) if not path_exists: success = self.run_cmd(["mkdir", str(self.path)])[0] if not success: - self.logger.error(f"Couldn't access/create '{str(self.path)}'!") - return False + raise BackupError( + f"Input validation failed: Couldn't access/create '{str(self.path)}'!" + ) - return True + def run_cmd( + self, + args: list, + ): + """ + Run a command locally or remotely. + """ + all_args = args[:] + if self.remote: + all_args = ["ssh", self.remote] + all_args + + res = subprocess.run(all_args, capture_output=True, text=True, check=False) + + self.logger.debug( + f"Command: {all_args}\n" + f" Exit Code: {res.returncode}\n" + f" stdout/stderr: {res.stdout}\n{res.stderr}" + ) + + success = res.returncode == 0 + + return success, res.stdout def call_rsync( # pylint: disable=too-many-arguments self, @@ -146,7 +162,9 @@ def call_rsync( # pylint: disable=too-many-arguments True if successful and False if unsuccessful. """ - all_args = self.rsync_args[:] + assert "rsync" in self.exes + + all_args = [self.exes["rsync"], "-azh", "-vv", "--no-whole-file"] if extra_args: all_args += extra_args if link_dest: @@ -172,114 +190,24 @@ def call_rsync( # pylint: disable=too-many-arguments else: all_args += [f"{self.remote}:{dest_str}"] - try: - res = subprocess.run(all_args, capture_output=True, text=True, check=True) - except subprocess.CalledProcessError as exc: - self.logger.error(f"{exc}") - return False - exit_code = res.returncode + res = subprocess.run(all_args, capture_output=True, text=True, check=False) self.logger.debug( "Command: %s\n Exit Code: %s\n stdout/stderr: %s\n%s", str(all_args), - exit_code, + res.returncode, res.stdout, res.stderr, ) - success = exit_code == 0 + success = res.returncode == 0 return success - def backup_container( # pylint: disable=too-many-return-statements, too-many-branches - self, - container: Container, - path: Path, - prev_backup: Optional[Path] = None, - ) -> bool: - """Create a backup of the disk-objectstore container - - This is safe to perform when the container is being used. - - It should be done in the following order: - 1) loose files; - 2) sqlite database; - 3) packed files. - - :return: - True if successful and False if unsuccessful. - """ - - container_root_path = container.get_folder() - loose_path = container._get_loose_folder() # pylint: disable=protected-access - packs_path = container._get_pack_folder() # pylint: disable=protected-access - sqlite_path = ( - container._get_pack_index_path() # pylint: disable=protected-access - ) - - # step 1: back up loose files - loose_path_rel = loose_path.relative_to(container_root_path) - prev_backup_loose = prev_backup / loose_path_rel if prev_backup else None - success = self.call_rsync(loose_path, path, link_dest=prev_backup_loose) - if not success: - return False - self.logger.info(f"Transferred {str(loose_path)} to {str(path)}") - - # step 2: back up sqlite db - - # make a temporary directory to dump sqlite db locally - with tempfile.TemporaryDirectory() as temp_dir_name: - sqlite_temp_loc = Path(temp_dir_name) / "packs.idx" - - # Safe way to make a backup of the sqlite db, while it might potentially be accessed - # https://docs.python.org/3/library/sqlite3.html#sqlite3.Connection.backup - src = sqlite3.connect(str(sqlite_path)) - dst = sqlite3.connect(str(sqlite_temp_loc)) - with dst: - src.backup(dst) - dst.close() - src.close() - - if sqlite_temp_loc.is_file(): - self.logger.info( - f"Dumped the SQLite database to {str(sqlite_temp_loc)}" - ) - else: - self.logger.error("'%s' was not created.", str(sqlite_temp_loc)) - return False - - # step 3: transfer the SQLITE database file - success = self.call_rsync(sqlite_temp_loc, path, link_dest=prev_backup) - if not success: - return False - self.logger.info(f"Transferred SQLite database to {str(path)}") - - # step 4: transfer the packed files - packs_path_rel = packs_path.relative_to(container_root_path) - success = self.call_rsync(packs_path, path, link_dest=prev_backup) - if not success: - return False - self.logger.info(f"Transferred {str(packs_path)} to {str(path)}") - - # step 5: transfer anything else in the container folder - success = self.call_rsync( - container_root_path, - path, - link_dest=prev_backup, - src_trailing_slash=True, - extra_args=[ - "--exclude", - str(loose_path_rel), - "--exclude", - "packs.idx", - "--exclude", - str(packs_path_rel), - ], - ) - if not success: - return False - - return True + # ---- + # Utilities to manage multiple folders of backups, e.g. hard-linking to previous backup; + # deleting old backups. + # ---- def get_existing_backup_folders(self): """Get all folders matching the backup folder name pattern.""" @@ -294,8 +222,7 @@ def get_existing_backup_folders(self): "-name", "backup_*_*", "-print", - ], - check=True, + ] ) return stdout.splitlines() @@ -316,10 +243,7 @@ def delete_old_backups(self): else: self.logger.warning("Warning: couldn't delete old backup: %s", folder) - def backup_auto_folders( - self, - backup_function: Callable, - ): + def backup_auto_folders(self, backup_func: Callable): """Create a backup, managing live and previous backup folders automatically The running backup is done to `/live-backup`. When it completes, it is moved to @@ -327,7 +251,7 @@ def backup_auto_folders( the symlink `/last-backup` is added to point to the last backup. Rsync `link-dest` is used to keep the backups incremental and performant. - :param backup_function: + :param backup_func: Function that is used to make a single backup. Needs to have two arguments: path and previous_backup location (which can be None). @@ -339,9 +263,8 @@ def backup_auto_folders( try: last_folder = self.get_last_backup_folder() - except subprocess.CalledProcessError: - self.logger.error("Couldn't determine last backup.") - return False + except subprocess.CalledProcessError as exc: + raise BackupError("Couldn't determine last backup.") from exc if last_folder: self.logger.info( @@ -350,15 +273,15 @@ def backup_auto_folders( else: self.logger.info("Couldn't find a previous backup to increment from.") - success = backup_function( + backup_func( live_folder, last_folder, ) - if not success: - return False # move live-backup -> backup__ - timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + timestamp = datetime.datetime.now(datetime.timezone.utc).strftime( + "%Y%m%d%H%M%S" + ) randstr = "".join(random.choices(string.ascii_lowercase + string.digits, k=4)) folder_name = f"backup_{timestamp}_{randstr}" @@ -366,7 +289,9 @@ def backup_auto_folders( 0 ] if not success: - return False + raise BackupError( + f"Failed to move '{str(live_folder)}' to '{str(self.path / folder_name)}'" + ) self.logger.info( f"Backup moved from '{str(live_folder)}' to '{str(self.path / folder_name)}'." @@ -386,7 +311,92 @@ def backup_auto_folders( try: self.delete_old_backups() except subprocess.CalledProcessError: - self.logger.error("Failed to delete old backups.") - return False + self.logger.warning("Failed to delete old backups.") - return True + +def backup_container( + manager: BackupManager, + container: Container, + path: Path, + prev_backup: Optional[Path] = None, +) -> None: + """Create a backup of the disk-objectstore container + + This is safe to perform when the container is being used. + + It should be done in the following order: + 1) loose files; + 2) sqlite database; + 3) packed files. + + :return: + True if successful and False if unsuccessful. + """ + + container_root_path = container.get_folder() + loose_path = container._get_loose_folder() # pylint: disable=protected-access + packs_path = container._get_pack_folder() # pylint: disable=protected-access + sqlite_path = container._get_pack_index_path() # pylint: disable=protected-access + + # step 1: back up loose files + loose_path_rel = loose_path.relative_to(container_root_path) + prev_backup_loose = prev_backup / loose_path_rel if prev_backup else None + success = manager.call_rsync(loose_path, path, link_dest=prev_backup_loose) + if not success: + raise BackupError(f"rsync failed for: {str(loose_path)} to {str(path)}") + manager.logger.info(f"Transferred {str(loose_path)} to {str(path)}") + + # step 2: back up sqlite db + + # make a temporary directory to dump sqlite db locally + with tempfile.TemporaryDirectory() as temp_dir_name: + sqlite_temp_loc = Path(temp_dir_name) / "packs.idx" + + # Safe way to make a backup of the sqlite db, while it might potentially be accessed + # https://docs.python.org/3/library/sqlite3.html#sqlite3.Connection.backup + src = sqlite3.connect(str(sqlite_path)) + dst = sqlite3.connect(str(sqlite_temp_loc)) + with dst: + src.backup(dst) + dst.close() + src.close() + + if sqlite_temp_loc.is_file(): + manager.logger.info(f"Dumped the SQLite database to {str(sqlite_temp_loc)}") + else: + raise BackupError(f"'{str(sqlite_temp_loc)}' failed to be created.") + + # step 3: transfer the SQLITE database file + success = manager.call_rsync(sqlite_temp_loc, path, link_dest=prev_backup) + if not success: + raise BackupError( + f"rsync failed for: {str(sqlite_temp_loc)} to {str(path)}" + ) + manager.logger.info(f"Transferred SQLite database to {str(path)}") + + # step 4: transfer the packed files + packs_path_rel = packs_path.relative_to(container_root_path) + success = manager.call_rsync(packs_path, path, link_dest=prev_backup) + if not success: + raise BackupError(f"rsync failed for: {str(packs_path)} to {str(path)}") + manager.logger.info(f"Transferred {str(packs_path)} to {str(path)}") + + # step 5: transfer anything else in the container folder + success = manager.call_rsync( + container_root_path, + path, + link_dest=prev_backup, + src_trailing_slash=True, + extra_args=[ + "--exclude", + str(loose_path_rel), + "--exclude", + "packs.idx", + "--exclude", + str(packs_path_rel), + ], + ) + if not success: + raise BackupError( + f"rsync failed for: {str(container_root_path)} to {str(path)}" + ) diff --git a/disk_objectstore/cli.py b/disk_objectstore/cli.py index 57532f5..af4a3ae 100644 --- a/disk_objectstore/cli.py +++ b/disk_objectstore/cli.py @@ -229,35 +229,26 @@ def backup( """ if verbosity == "silent": - backup_utils.logger.setLevel(logging.ERROR) + backup_utils.backup_logger.setLevel(logging.ERROR) elif verbosity == "info": - backup_utils.logger.setLevel(logging.INFO) + backup_utils.backup_logger.setLevel(logging.INFO) elif verbosity == "debug": - backup_utils.logger.setLevel(logging.DEBUG) + backup_utils.backup_logger.setLevel(logging.DEBUG) else: click.echo("Unsupported verbosity.") return 1 - try: - backup_utils_instance = backup_utils.BackupUtilities( - dest, keep, rsync_exe, backup_utils.logger - ) - except ValueError as e: - click.echo(f"Error: {e}") - return 1 - - success = backup_utils_instance.validate_inputs() - if not success: - click.echo("Input validation failed.") - return 1 - with dostore.container as container: - success = backup_utils_instance.backup_auto_folders( - lambda path, prev: backup_utils_instance.backup_container( - container, path, prev_backup=prev + try: + backup_manager = backup_utils.BackupManager( + dest, keep, backup_utils.backup_logger, exes={"rsync": rsync_exe} ) - ) - if not success: - click.echo("Error: backup failed.") + backup_manager.backup_auto_folders( + lambda path, prev: backup_utils.backup_container( + backup_manager, container, path, prev + ) + ) + except backup_utils.BackupError as e: + click.echo(f"Error: {e}") return 1 return 0 From 5a20caaf412e2b971759282635af35d92b7d6d19 Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Thu, 7 Dec 2023 08:49:53 +0100 Subject: [PATCH 13/34] add tests --- disk_objectstore/backup_utils.py | 91 +++++++++------------ disk_objectstore/cli.py | 5 +- tests/test_backup.py | 134 +++++++++++++++++++++++++++++++ tests/test_cli.py | 10 ++- 4 files changed, 183 insertions(+), 57 deletions(-) create mode 100644 tests/test_backup.py diff --git a/disk_objectstore/backup_utils.py b/disk_objectstore/backup_utils.py index 750a15b..f35edfe 100644 --- a/disk_objectstore/backup_utils.py +++ b/disk_objectstore/backup_utils.py @@ -49,16 +49,19 @@ class BackupManager: def __init__( self, dest: str, - keep: int, logger: logging.Logger, - exes: dict, + keep: int = 1, + exes: Optional[dict] = None, ) -> None: self.dest = dest self.keep = keep self.logger = logger self.remote, self.path = split_remote_and_path(dest) - self.exes = exes + if exes is None: + self.exes = {} + else: + self.exes = exes # make sure rsync gets added so it gets validated if "rsync" not in self.exes: @@ -66,15 +69,13 @@ def __init__( self.validate() - def check_if_remote_accessible(self) -> bool: + def check_if_remote_accessible(self): """Check if remote host is accessible via ssh""" self.logger.info(f"Checking if '{self.remote}' is accessible...") success = self.run_cmd(["exit"])[0] if not success: - self.logger.error(f"Remote '{self.remote}' is not accessible!") - return False + raise BackupError(f"Remote '{self.remote}' is not accessible!") self.logger.info("Success! '%s' is accessible!", self.remote) - return True def check_path_exists(self, path: Path) -> bool: cmd = ["[", "-e", str(path), "]"] @@ -92,10 +93,7 @@ def validate(self): ) if self.remote: - if not self.check_if_remote_accessible(): - raise BackupError( - "Input validation failed: keep variable can't be negative!" - ) + self.check_if_remote_accessible() if self.exes: for _, path in self.exes.items(): @@ -142,9 +140,8 @@ def call_rsync( # pylint: disable=too-many-arguments dest: Path, link_dest: Optional[Path] = None, src_trailing_slash: bool = False, - dest_trailing_slash: bool = False, extra_args: Optional[list] = None, - ) -> bool: + ): """Call rsync with specified arguments and handle possible errors & stdout/stderr :param link_dest: @@ -154,10 +151,6 @@ def call_rsync( # pylint: disable=too-many-arguments Add a trailing slash to the source path. This makes rsync copy the contents of the folder instead of the folder itself. - :param dest_trailing_slash: - Add a trailing slash to the destination path. This makes rsync interpret the - destination as a folder and create it if it doesn't exists. - :return: True if successful and False if unsuccessful. """ @@ -182,8 +175,6 @@ def call_rsync( # pylint: disable=too-many-arguments all_args += [str(src)] dest_str = str(dest) - if dest_trailing_slash: - dest_str += "/" if not self.remote: all_args += [dest_str] @@ -202,7 +193,8 @@ def call_rsync( # pylint: disable=too-many-arguments success = res.returncode == 0 - return success + if not success: + raise BackupError(f"rsync failed for: {str(src)} to {str(dest)}") # ---- # Utilities to manage multiple folders of backups, e.g. hard-linking to previous backup; @@ -211,7 +203,7 @@ def call_rsync( # pylint: disable=too-many-arguments def get_existing_backup_folders(self): """Get all folders matching the backup folder name pattern.""" - _, stdout = self.run_cmd( + success, stdout = self.run_cmd( [ "find", str(self.path), @@ -225,6 +217,9 @@ def get_existing_backup_folders(self): ] ) + if not success: + raise BackupError("Existing backups determination failed.") + return stdout.splitlines() def get_last_backup_folder(self): @@ -261,10 +256,7 @@ def backup_auto_folders(self, backup_func: Callable): live_folder = self.path / "live-backup" - try: - last_folder = self.get_last_backup_folder() - except subprocess.CalledProcessError as exc: - raise BackupError("Couldn't determine last backup.") from exc + last_folder = self.get_last_backup_folder() if last_folder: self.logger.info( @@ -308,10 +300,20 @@ def backup_auto_folders(self, backup_func: Callable): else: self.logger.info(f"Added symlink '{symlink_name}' to '{folder_name}'.") - try: - self.delete_old_backups() - except subprocess.CalledProcessError: - self.logger.warning("Failed to delete old backups.") + self.delete_old_backups() + + +def _sqlite_backup(src: Path, dst: Path): + """ + Safe way to make a backup of the sqlite db, while it might potentially be accessed + https://docs.python.org/3/library/sqlite3.html#sqlite3.Connection.backup + """ + src_connect = sqlite3.connect(str(src)) + dst_connect = sqlite3.connect(str(dst)) + with dst_connect: + src_connect.backup(dst_connect) + dst_connect.close() + src_connect.close() def backup_container( @@ -341,9 +343,8 @@ def backup_container( # step 1: back up loose files loose_path_rel = loose_path.relative_to(container_root_path) prev_backup_loose = prev_backup / loose_path_rel if prev_backup else None - success = manager.call_rsync(loose_path, path, link_dest=prev_backup_loose) - if not success: - raise BackupError(f"rsync failed for: {str(loose_path)} to {str(path)}") + + manager.call_rsync(loose_path, path, link_dest=prev_backup_loose) manager.logger.info(f"Transferred {str(loose_path)} to {str(path)}") # step 2: back up sqlite db @@ -351,15 +352,7 @@ def backup_container( # make a temporary directory to dump sqlite db locally with tempfile.TemporaryDirectory() as temp_dir_name: sqlite_temp_loc = Path(temp_dir_name) / "packs.idx" - - # Safe way to make a backup of the sqlite db, while it might potentially be accessed - # https://docs.python.org/3/library/sqlite3.html#sqlite3.Connection.backup - src = sqlite3.connect(str(sqlite_path)) - dst = sqlite3.connect(str(sqlite_temp_loc)) - with dst: - src.backup(dst) - dst.close() - src.close() + _sqlite_backup(sqlite_path, sqlite_temp_loc) if sqlite_temp_loc.is_file(): manager.logger.info(f"Dumped the SQLite database to {str(sqlite_temp_loc)}") @@ -367,22 +360,16 @@ def backup_container( raise BackupError(f"'{str(sqlite_temp_loc)}' failed to be created.") # step 3: transfer the SQLITE database file - success = manager.call_rsync(sqlite_temp_loc, path, link_dest=prev_backup) - if not success: - raise BackupError( - f"rsync failed for: {str(sqlite_temp_loc)} to {str(path)}" - ) + manager.call_rsync(sqlite_temp_loc, path, link_dest=prev_backup) manager.logger.info(f"Transferred SQLite database to {str(path)}") # step 4: transfer the packed files packs_path_rel = packs_path.relative_to(container_root_path) - success = manager.call_rsync(packs_path, path, link_dest=prev_backup) - if not success: - raise BackupError(f"rsync failed for: {str(packs_path)} to {str(path)}") + manager.call_rsync(packs_path, path, link_dest=prev_backup) manager.logger.info(f"Transferred {str(packs_path)} to {str(path)}") # step 5: transfer anything else in the container folder - success = manager.call_rsync( + manager.call_rsync( container_root_path, path, link_dest=prev_backup, @@ -396,7 +383,3 @@ def backup_container( str(packs_path_rel), ], ) - if not success: - raise BackupError( - f"rsync failed for: {str(container_root_path)} to {str(path)}" - ) diff --git a/disk_objectstore/cli.py b/disk_objectstore/cli.py index af4a3ae..18acd3e 100644 --- a/disk_objectstore/cli.py +++ b/disk_objectstore/cli.py @@ -241,7 +241,10 @@ def backup( with dostore.container as container: try: backup_manager = backup_utils.BackupManager( - dest, keep, backup_utils.backup_logger, exes={"rsync": rsync_exe} + dest, + backup_utils.backup_logger, + exes={"rsync": rsync_exe}, + keep=keep, ) backup_manager.backup_auto_folders( lambda path, prev: backup_utils.backup_container( diff --git a/tests/test_backup.py b/tests/test_backup.py new file mode 100644 index 0000000..b4e6bb5 --- /dev/null +++ b/tests/test_backup.py @@ -0,0 +1,134 @@ +"""Test the backup functionality. + +""" + +import random +import string +from pathlib import Path + +import pytest + +from disk_objectstore import backup_utils +from disk_objectstore.backup_utils import BackupError, BackupManager + + +def _random_string(n=10): + return "".join(random.choices(string.ascii_lowercase + string.digits, k=n)) + + +def test_invalid_destination(): + """Test invalid destination with two colons.""" + dest = "localhost:/tmp/test:" + with pytest.raises(BackupError, match="Invalid destination format"): + BackupManager(dest, backup_utils.backup_logger) + + +def test_inaccessible_remote(): + """Test a remote destination of random characters that will not be accessible.""" + dest = f"_{_random_string()}:/tmp/test" + with pytest.raises(BackupError, match="is not accessible"): + BackupManager(dest, backup_utils.backup_logger) + + +def test_negative_keep(): + """Test a negative keep value.""" + dest = "/tmp/test" + with pytest.raises(BackupError, match="keep variable can't be negative"): + BackupManager(dest, backup_utils.backup_logger, keep=-1) + + +def test_inaccessible_exe(): + """Test case where rsync is not accessible.""" + dest = "/tmp/test" + rsync_exe = f"_{_random_string()}" + with pytest.raises(BackupError, match=f"{rsync_exe} not accessible."): + BackupManager(dest, backup_utils.backup_logger, exes={"rsync": rsync_exe}) + + +def test_inaccessible_path(): + """Test case where path is not accessible.""" + dest = f"/_{_random_string()}" # I assume there will be a permission error for this path + with pytest.raises(BackupError, match=f"Couldn't access/create '{dest}'"): + BackupManager(dest, backup_utils.backup_logger) + + +def test_rsync_failure(): + """Test case where rsync fails.""" + dest = "/tmp/test" + with pytest.raises(BackupError, match="rsync failed"): + manager = BackupManager(dest, backup_utils.backup_logger) + # pick a src that doesn't exists + manager.call_rsync(Path(f"/_{_random_string()}"), Path(dest)) + + +def test_existing_backups_failure(): + """Test case where existing backups fail to be determined.""" + dest = "/tmp/test" + with pytest.raises(BackupError, match="Existing backups determination failed"): + manager = BackupManager(dest, backup_utils.backup_logger) + # override the path to something that will fail + manager.path = f"/_{_random_string()}" + manager.get_existing_backup_folders() + + +def test_mv_failure(monkeypatch, temp_container, temp_dir): + """Test case where mv command fails by monkeypatching.""" + + # save a reference to the original run_cmd command + original_run_cmd = backup_utils.BackupManager.run_cmd + + # monkeypatch the run_cmd command to fail when "mv" is used + def mock_run_cmd(self, args): + if "mv" in args: + return False, "" + return original_run_cmd(self, args) + + monkeypatch.setattr( + backup_utils.BackupManager, + "run_cmd", + mock_run_cmd, + ) + + # make a container and back it up + temp_container.init_container(clear=True) + # Add a few objects + for idx in range(100): + temp_container.add_object(f"test-{idx}".encode()) + + dest = Path(temp_dir) / "backup" + with pytest.raises(BackupError, match="Failed to move"): + manager = BackupManager(str(dest), backup_utils.backup_logger) + manager.backup_auto_folders( + lambda path, prev: backup_utils.backup_container( + manager, temp_container, path, prev + ) + ) + + +def test_sqlite_failure(monkeypatch, temp_container, temp_dir): + """Test case where sqlite fails to make a backup file.""" + + # monkeypatch sqlite backup to do nothing + def mock_sqlite_backup(src, dst): # pylint: disable=unused-argument + pass + + monkeypatch.setattr( + backup_utils, + "_sqlite_backup", + mock_sqlite_backup, + ) + + # make a container + temp_container.init_container(clear=True) + # Add a few objects + for idx in range(100): + temp_container.add_object(f"test-{idx}".encode()) + + dest = Path(temp_dir) / "backup" + with pytest.raises(BackupError, match="'.*' failed to be created."): + manager = BackupManager(str(dest), backup_utils.backup_logger) + manager.backup_auto_folders( + lambda path, prev: backup_utils.backup_container( + manager, temp_container, path, prev + ) + ) diff --git a/tests/test_cli.py b/tests/test_cli.py index 0629484..4bef7eb 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -253,7 +253,8 @@ def test_backup(temp_container, temp_dir, remote, verbosity): @pytest.mark.skipif( platform.system() == "Windows", reason="Backup not supported on Windows" ) -def test_backup_repeated(temp_container, temp_dir): +@pytest.mark.parametrize("remote", [False, True]) +def test_backup_repeated(temp_container, temp_dir, remote): """Test the backup command repeated 3 times. Considering --keep 1 is default, the last one should get deleted. @@ -268,8 +269,13 @@ def test_backup_repeated(temp_container, temp_dir): path = Path(temp_dir) / "backup" + if remote: + destination = f"localhost:{str(path)}" + else: + destination = str(path) + for _ in range(3): - result = CliRunner().invoke(cli.backup, [str(path)], obj=obj) + result = CliRunner().invoke(cli.backup, [destination], obj=obj) assert result.exit_code == 0 assert path.exists() From 8a2c33f37f21cc8919765ebf9e36a9047935eb8d Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Thu, 7 Dec 2023 13:54:59 +0100 Subject: [PATCH 14/34] testing to 100% --- disk_objectstore/cli.py | 5 +- tests/test_backup.py | 108 +++++++++++++++++++++++++++++++++++----- tests/test_cli.py | 44 ++++++++++++++++ 3 files changed, 142 insertions(+), 15 deletions(-) diff --git a/disk_objectstore/cli.py b/disk_objectstore/cli.py index 18acd3e..ed34c0a 100644 --- a/disk_objectstore/cli.py +++ b/disk_objectstore/cli.py @@ -236,7 +236,7 @@ def backup( backup_utils.backup_logger.setLevel(logging.DEBUG) else: click.echo("Unsupported verbosity.") - return 1 + sys.exit(1) with dostore.container as container: try: @@ -253,5 +253,4 @@ def backup( ) except backup_utils.BackupError as e: click.echo(f"Error: {e}") - return 1 - return 0 + sys.exit(1) diff --git a/tests/test_backup.py b/tests/test_backup.py index b4e6bb5..7a453ec 100644 --- a/tests/test_backup.py +++ b/tests/test_backup.py @@ -2,6 +2,7 @@ """ +import platform import random import string from pathlib import Path @@ -11,6 +12,10 @@ from disk_objectstore import backup_utils from disk_objectstore.backup_utils import BackupError, BackupManager +pytestmark = pytest.mark.skipif( + platform.system() == "Windows", reason="Backup not supported on Windows" +) + def _random_string(n=10): return "".join(random.choices(string.ascii_lowercase + string.digits, k=n)) @@ -71,8 +76,40 @@ def test_existing_backups_failure(): manager.get_existing_backup_folders() +def test_sqlite_failure(monkeypatch, temp_container, temp_dir): + """Test case where sqlite fails to make a backup file.""" + + # monkeypatch sqlite backup to do nothing + def mock_sqlite_backup(src, dst): # pylint: disable=unused-argument + pass + + monkeypatch.setattr( + backup_utils, + "_sqlite_backup", + mock_sqlite_backup, + ) + + # make a container + temp_container.init_container(clear=True) + # Add a few objects + for idx in range(100): + temp_container.add_object(f"test-{idx}".encode()) + + dest = Path(temp_dir) / "backup" + with pytest.raises(BackupError, match="'.*' failed to be created."): + manager = BackupManager(str(dest), backup_utils.backup_logger) + manager.backup_auto_folders( + lambda path, prev: backup_utils.backup_container( + manager, temp_container, path, prev + ) + ) + + def test_mv_failure(monkeypatch, temp_container, temp_dir): - """Test case where mv command fails by monkeypatching.""" + """ + Test case where mv command fails by monkeypatching. + Make sure correct BackupError is raised. + """ # save a reference to the original run_cmd command original_run_cmd = backup_utils.BackupManager.run_cmd @@ -105,30 +142,77 @@ def mock_run_cmd(self, args): ) -def test_sqlite_failure(monkeypatch, temp_container, temp_dir): - """Test case where sqlite fails to make a backup file.""" +def test_ln_failure(monkeypatch, temp_container, temp_dir, caplog): + """ + Test case where ln command fails by monkeypatching. + Make sure correct warning is logged. + """ - # monkeypatch sqlite backup to do nothing - def mock_sqlite_backup(src, dst): # pylint: disable=unused-argument - pass + # save a reference to the original run_cmd command + original_run_cmd = backup_utils.BackupManager.run_cmd + + # monkeypatch the run_cmd command to fail when "mv" is used + def mock_run_cmd(self, args): + if "ln" in args: + return False, "" + return original_run_cmd(self, args) monkeypatch.setattr( - backup_utils, - "_sqlite_backup", - mock_sqlite_backup, + backup_utils.BackupManager, + "run_cmd", + mock_run_cmd, ) - # make a container + # make a container and back it up temp_container.init_container(clear=True) # Add a few objects for idx in range(100): temp_container.add_object(f"test-{idx}".encode()) dest = Path(temp_dir) / "backup" - with pytest.raises(BackupError, match="'.*' failed to be created."): - manager = BackupManager(str(dest), backup_utils.backup_logger) + manager = BackupManager(str(dest), backup_utils.backup_logger) + manager.backup_auto_folders( + lambda path, prev: backup_utils.backup_container( + manager, temp_container, path, prev + ) + ) + assert "Couldn't create symlink" in caplog.text + + +def test_rm_failure(monkeypatch, temp_container, temp_dir, caplog): + """ + Test case where rm command fails by monkeypatching. + Make sure correct warning is logged. + Note, this is used for deleting old backups, so create two with keep=0. + """ + + # save a reference to the original run_cmd command + original_run_cmd = backup_utils.BackupManager.run_cmd + + # monkeypatch the run_cmd command to fail when "mv" is used + def mock_run_cmd(self, args): + if "rm" in args: + return False, "" + return original_run_cmd(self, args) + + monkeypatch.setattr( + backup_utils.BackupManager, + "run_cmd", + mock_run_cmd, + ) + + # make a container and back it up + temp_container.init_container(clear=True) + # Add a few objects + for idx in range(100): + temp_container.add_object(f"test-{idx}".encode()) + + dest = Path(temp_dir) / "backup" + manager = BackupManager(str(dest), backup_utils.backup_logger, keep=0) + for _ in range(2): manager.backup_auto_folders( lambda path, prev: backup_utils.backup_container( manager, temp_container, path, prev ) ) + assert "Warning: couldn't delete old backup" in caplog.text diff --git a/tests/test_cli.py b/tests/test_cli.py index 4bef7eb..0162652 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -286,3 +286,47 @@ def test_backup_repeated(temp_container, temp_dir, remote): assert "last-backup" in path_contents assert len(backup_dirs) == 2 + + +@pytest.mark.skipif( + platform.system() == "Windows", reason="Backup not supported on Windows" +) +def test_backup_unsupported_verbosity(temp_container, temp_dir): + """Test failure when providing unsupported verbosity value""" + + temp_container.init_container(clear=True) + # Add a few objects + for idx in range(100): + temp_container.add_object(f"test-{idx}".encode()) + + obj = cli.ContainerContext(temp_container.get_folder()) + + path = Path(temp_dir) / "backup" + + args = [str(path), "--verbosity=unsupported"] + + result = CliRunner().invoke(cli.backup, args, obj=obj) + + assert result.exit_code == 1 + assert "Unsupported verbosity" in result.stdout + + +@pytest.mark.skipif( + platform.system() == "Windows", reason="Backup not supported on Windows" +) +def test_backup_failure(temp_container): + """Test failure when providing invalid destination""" + + temp_container.init_container(clear=True) + # Add a few objects + for idx in range(100): + temp_container.add_object(f"test-{idx}".encode()) + + obj = cli.ContainerContext(temp_container.get_folder()) + + dest = "abc:abc:" + + result = CliRunner().invoke(cli.backup, [dest], obj=obj) + + assert result.exit_code == 1 + assert "Error:" in result.stdout From 5da58dce623e38308770d3733c114451f4c032b6 Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Thu, 7 Dec 2023 16:27:06 +0200 Subject: [PATCH 15/34] remove module name in from logging output --- disk_objectstore/backup_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/disk_objectstore/backup_utils.py b/disk_objectstore/backup_utils.py index f35edfe..67fe9c0 100644 --- a/disk_objectstore/backup_utils.py +++ b/disk_objectstore/backup_utils.py @@ -15,7 +15,7 @@ from disk_objectstore.container import Container -logging.basicConfig() +logging.basicConfig(format="%(levelname)s:%(message)s") backup_logger = logging.getLogger(__name__) From b88d0eeb742ff6f0272b88f8c7a3bc80b8aa97c8 Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Thu, 7 Dec 2023 17:00:28 +0200 Subject: [PATCH 16/34] update docs --- docs/index.rst | 14 ++++++ docs/pages/backup.md | 105 ++++++++++++++++++++++++++++++++++++++++ docs/pages/cli_usage.md | 2 + 3 files changed, 121 insertions(+) create mode 100644 docs/pages/backup.md diff --git a/docs/index.rst b/docs/index.rst index 41dac50..abee521 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -64,6 +64,19 @@ and does not require a server running. ---------------------------------------------- +:fa:`bookmark,mr-1` **Making a backup** + + Information on how to back up a container. + + +++++++++++++++++++++++++++++++++++++++++++++ + + .. link-button:: pages/backup + :type: ref + :text: To the backup explanation + :classes: btn-outline-primary btn-block stretched-link + + ---------------------------------------------- + :fa:`cogs,mr-1` **Design** Background information on the design of the package. @@ -84,4 +97,5 @@ and does not require a server running. pages/advanced_usage pages/cli_usage pages/packing + pages/backup pages/design diff --git a/docs/pages/backup.md b/docs/pages/backup.md new file mode 100644 index 0000000..7577ff3 --- /dev/null +++ b/docs/pages/backup.md @@ -0,0 +1,105 @@ +# Backing up the container + +## User instructions + +A disk-objectstore container is fully contained in its root folder. If the container is not being modified, a backup can be made by just making a copy of this folder. The recommended way is to use the `rsync` tool, as the library was designed to be performant with it and make use of the incremental copying capabilities. + +However, the preferred way to make a backup, that is also safe while the container is being used (except for when repacking or deleting files), is to use the built-in CLI command: + +```console +$ dostore backup --help +Usage: dostore backup [OPTIONS] DEST + + Create a backup of the container to destination location DEST, in a + subfolder backup__ and point a symlink called `last- + backup` to it. + + NOTE: This is safe to run while the container is being used. + + NOTE: the symlink `last-backup` is omitted if the filesystem doesn't support + it. + + Destination (DEST) can either be a local path, or a remote destination + (reachable via ssh). In the latter case, remote destination needs to have + the following syntax: + + [@]: + + i.e., contain the remote host name and the remote path, separated by a colon + (and optionally the remote user separated by an @ symbol). You can tune SSH + parameters using the standard options given by OpenSSH, such as adding + configuration options to ~/.ssh/config (e.g. to allow for passwordless login + - recommended, since this script might ask multiple times for the password). + + NOTE: 'rsync' and other UNIX-specific commands are called, thus the command + will not work on non-UNIX environments. + +Options: + --keep INTEGER Number of previous backups to keep in the destination. + (default: 1) + --rsync_exe TEXT Specify the 'rsync' executable, if not in PATH. Used for + both local and remote destinations. + --verbosity TEXT Set verbosity [silent|info|debug], default is 'info'. + --help Show this message and exit. + +``` + +For more detailed information about how the backup is made, see the next section. + +## Detailed info/design + +The primary purpose of the backup functionality is to copy the content of the container in a specific order that prevents data corruption due to the container being updated. This order is the following + +1. loose files; +2. sqlite database that contains the packed file indexes; +3. packed files. + +To understand why, let's consider ways the backup could become corrupted: + +- In the case of packing files (`optimize`) or adding directly packed files, the library first adds data to the pack file and then writes the metadata to the sqlite database. The backup becomes corrupted if the following happens + + 1. data is being added to a pack file; + 2. backup copies the pack file, containing the incomplete section; + 3. packfile is completed & the sqlite database is updated; + 4. backup copies the sqlite database. + + This results in the backup containing an index that references an incomplete section in a pack file. To prevent this, is to always copy first the sqlite db and then the pack files. This can still result in an incomplete section in the pack files but it effectively doesn't exist for the backup. + +- If loose files are packed up at the end, the following might happen: + + 1. backup copies pack files & sqlite db; + 2. user runs optimize & clean_storage, which adds loose files to a pack & deletes the original files; + 3. backup copies loose files. + + This results in files missing in the backup. Therefore, loose files should be copied first. + +Note: one should not run the backup while repacking and deleting files. + +Implementation details: + +- The backup command runs operating-system-level commands on the destination machine by using the python subprocess library. These currently include + + 1. running rsync. + 2. For remote destinations, checking if it is accessible (`ssh exit`); + 3. checking if destination path exists (`[ -e ]`); + 4. checking if destination directory can be made, if it doesn't exist (`mkdir `); + 5. moving and removing folders. + + For 3-5, remote cases just append `ssh ` in front of the command, while rsync is used via its native interface to access a remote destination. For both of these cases of remote access, the standard configuration options of OpenSSH are used (such as configuration in `~/.ssh/config`) + +- Steps in order: + - Input validation: + - is remote accessible? + - is `DEST` accessible? + - is `rsync` executable found? + - Check if a backup already exists in `DEST` + - if yes, use the most recent one (based on timestamp in name) for `rsync --link-dest` argument in all `rsync` calls + - Create `DEST/live-backup` folder + - rsync loose folder to `DEST/live-backup` + - dump sqlite database in a safe manner to a `tempfile.TemporaryDirectory()` + - rsync the sqlite database to `DEST/live-backup` + - rsync the packed files to `DEST/live-backup` + - rsync everything else to `DEST/live-backup` + - rename `DEST/live-backup` to `DEST/backup__` + - update `DEST/last-backup` symlink to point to `DEST/backup__` + - delete number of previous backups down to `--keep` argument diff --git a/docs/pages/cli_usage.md b/docs/pages/cli_usage.md index 781fcd0..722eff2 100644 --- a/docs/pages/cli_usage.md +++ b/docs/pages/cli_usage.md @@ -16,9 +16,11 @@ Options: Commands: add-files Add file(s) to the container + backup Create a backup of the container to destination location... create Create a container optimize Optimize the container's memory use status Print details about the container + validate Validate the container ``` Create a container: From f5476a583d6d12cdc7a0737b104ce1586fec9748 Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Thu, 7 Dec 2023 17:19:10 +0200 Subject: [PATCH 17/34] fix docs --- docs/index.rst | 2 +- docs/pages/backup.md | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index abee521..c1b825d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -64,7 +64,7 @@ and does not require a server running. ---------------------------------------------- -:fa:`bookmark,mr-1` **Making a backup** + :fa:`bookmark,mr-1` **Making backups** Information on how to back up a container. diff --git a/docs/pages/backup.md b/docs/pages/backup.md index 7577ff3..fd2cfed 100644 --- a/docs/pages/backup.md +++ b/docs/pages/backup.md @@ -1,4 +1,4 @@ -# Backing up the container +# Making backups ## User instructions @@ -44,6 +44,20 @@ Options: ``` +Example usage: + +```console +$ dostore backup /path/to/backup +INFO:Last backup is '/path/to/backup/backup_20231207142602_ymqf', using it for rsync --link-dest. +INFO:Transferred /path/to/dostore/loose to /path/to/backup/live-backup +INFO:Dumped the SQLite database to /tmp/tmpgewwse3f/packs.idx +INFO:Transferred SQLite database to /path/to/backup/live-backup +INFO:Transferred /path/to/dostore/packs to /path/to/backup/live-backup +INFO:Backup moved from '/path/to/backup/live-backup' to '/path/to/backup/backup_20231207142913_pz7m'. +INFO:Added symlink 'last-backup' to 'backup_20231207142913_pz7m'. +INFO:Deleted old backup: /path/to/backup/backup_20231207131741_zar7 +``` + For more detailed information about how the backup is made, see the next section. ## Detailed info/design From fd2d03855b6ee2c6ecee3e42b05680bfb9fad034 Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Thu, 7 Dec 2023 17:30:13 +0200 Subject: [PATCH 18/34] tiny docstring update --- disk_objectstore/backup_utils.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/disk_objectstore/backup_utils.py b/disk_objectstore/backup_utils.py index 67fe9c0..06c7fd4 100644 --- a/disk_objectstore/backup_utils.py +++ b/disk_objectstore/backup_utils.py @@ -82,11 +82,7 @@ def check_path_exists(self, path: Path) -> bool: return self.run_cmd(cmd)[0] def validate(self): - """Validate the backup config inputs - - :return: - True if validation passes, False otherwise. - """ + """Validate the backup config inputs""" if self.keep < 0: raise BackupError( "Input validation failed: keep variable can't be negative!" @@ -151,8 +147,6 @@ def call_rsync( # pylint: disable=too-many-arguments Add a trailing slash to the source path. This makes rsync copy the contents of the folder instead of the folder itself. - :return: - True if successful and False if unsuccessful. """ assert "rsync" in self.exes @@ -238,7 +232,7 @@ def delete_old_backups(self): else: self.logger.warning("Warning: couldn't delete old backup: %s", folder) - def backup_auto_folders(self, backup_func: Callable): + def backup_auto_folders(self, backup_func: Callable) -> None: """Create a backup, managing live and previous backup folders automatically The running backup is done to `/live-backup`. When it completes, it is moved to @@ -250,8 +244,6 @@ def backup_auto_folders(self, backup_func: Callable): Function that is used to make a single backup. Needs to have two arguments: path and previous_backup location (which can be None). - :return: - True is successful and False if unsuccessful. """ live_folder = self.path / "live-backup" @@ -331,8 +323,6 @@ def backup_container( 2) sqlite database; 3) packed files. - :return: - True if successful and False if unsuccessful. """ container_root_path = container.get_folder() From a195ca5f9a012b55fa8ca692b0405aad42fa36a9 Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Thu, 7 Dec 2023 20:07:07 +0200 Subject: [PATCH 19/34] re-add dest_trailing_slash to call_rsync, as it's used in aiida-core --- disk_objectstore/backup_utils.py | 7 +++++++ tests/test_backup.py | 11 +++++++++++ 2 files changed, 18 insertions(+) diff --git a/disk_objectstore/backup_utils.py b/disk_objectstore/backup_utils.py index 06c7fd4..5b228b1 100644 --- a/disk_objectstore/backup_utils.py +++ b/disk_objectstore/backup_utils.py @@ -136,6 +136,7 @@ def call_rsync( # pylint: disable=too-many-arguments dest: Path, link_dest: Optional[Path] = None, src_trailing_slash: bool = False, + dest_trailing_slash: bool = False, extra_args: Optional[list] = None, ): """Call rsync with specified arguments and handle possible errors & stdout/stderr @@ -147,6 +148,10 @@ def call_rsync( # pylint: disable=too-many-arguments Add a trailing slash to the source path. This makes rsync copy the contents of the folder instead of the folder itself. + :param dest_trailing_slash: + Add a trailing slash to the destination path. This makes rsync interpret the + destination as a folder and create it if it doesn't exists. + """ assert "rsync" in self.exes @@ -169,6 +174,8 @@ def call_rsync( # pylint: disable=too-many-arguments all_args += [str(src)] dest_str = str(dest) + if dest_trailing_slash: + dest_str += "/" if not self.remote: all_args += [dest_str] diff --git a/tests/test_backup.py b/tests/test_backup.py index 7a453ec..600136f 100644 --- a/tests/test_backup.py +++ b/tests/test_backup.py @@ -66,6 +66,17 @@ def test_rsync_failure(): manager.call_rsync(Path(f"/_{_random_string()}"), Path(dest)) +def test_rsync_dest_trailing_slash(temp_dir): + """Test case for dest_trailing_slash.""" + dest1 = Path(temp_dir) / "dest1" + dest2 = Path(temp_dir) / "dest2" + # manager will create dest1 folder + manager = BackupManager(str(dest1), backup_utils.backup_logger) + # dest_trailing_slash=True will create dest2 + manager.call_rsync(dest1, dest2, dest_trailing_slash=True) + assert dest2.exists() + + def test_existing_backups_failure(): """Test case where existing backups fail to be determined.""" dest = "/tmp/test" From 3eb7d17b2f3f63a3a40ddff81d99943addd4c271 Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Thu, 11 Jan 2024 11:37:02 +0200 Subject: [PATCH 20/34] Update disk_objectstore/cli.py Co-authored-by: Sebastiaan Huber --- disk_objectstore/cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/disk_objectstore/cli.py b/disk_objectstore/cli.py index ed34c0a..7ef883f 100644 --- a/disk_objectstore/cli.py +++ b/disk_objectstore/cli.py @@ -191,7 +191,8 @@ def optimize( @click.option( "--keep", default=1, - help="Number of previous backups to keep in the destination. (default: 1)", + show_default=True, + help="Number of previous backups to keep in the destination.", ) @click.option( "--rsync_exe", From 8fa8d6e7a6cdfaae419522c3303da631e7fd35cd Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Thu, 11 Jan 2024 11:37:19 +0200 Subject: [PATCH 21/34] Update disk_objectstore/cli.py Co-authored-by: Sebastiaan Huber --- disk_objectstore/cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/disk_objectstore/cli.py b/disk_objectstore/cli.py index 7ef883f..b9d97ed 100644 --- a/disk_objectstore/cli.py +++ b/disk_objectstore/cli.py @@ -202,7 +202,8 @@ def optimize( @click.option( "--verbosity", default="info", - help="Set verbosity [silent|info|debug], default is 'info'.", + type=click.Choice(('silent', 'info', 'debug')), + help="Set verbosity of the logger.", ) @pass_dostore def backup( From 37af8c0ef5150d3cfd0b56645cb68d39a97e8071 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 11 Jan 2024 09:38:39 +0000 Subject: [PATCH 22/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- disk_objectstore/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/disk_objectstore/cli.py b/disk_objectstore/cli.py index b9d97ed..fd8e8d7 100644 --- a/disk_objectstore/cli.py +++ b/disk_objectstore/cli.py @@ -202,7 +202,7 @@ def optimize( @click.option( "--verbosity", default="info", - type=click.Choice(('silent', 'info', 'debug')), + type=click.Choice(("silent", "info", "debug")), help="Set verbosity of the logger.", ) @pass_dostore From d2b7f74f867bd3ed181d11cc6a3fd641ae4c5a1e Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Thu, 11 Jan 2024 11:39:50 +0200 Subject: [PATCH 23/34] Update disk_objectstore/backup_utils.py Co-authored-by: Sebastiaan Huber --- disk_objectstore/backup_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/disk_objectstore/backup_utils.py b/disk_objectstore/backup_utils.py index 5b228b1..deb0943 100644 --- a/disk_objectstore/backup_utils.py +++ b/disk_objectstore/backup_utils.py @@ -98,9 +98,7 @@ def validate(self): f"Input validation failed: {path} not accessible." ) - path_exists = self.check_path_exists(self.path) - - if not path_exists: + if not self.check_path_exists(self.path): success = self.run_cmd(["mkdir", str(self.path)])[0] if not success: raise BackupError( From ecc32b41bb51198cae2ab4bc57b81f7621f34de1 Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Thu, 11 Jan 2024 11:44:39 +0200 Subject: [PATCH 24/34] Update disk_objectstore/backup_utils.py Co-authored-by: Sebastiaan Huber --- disk_objectstore/backup_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/disk_objectstore/backup_utils.py b/disk_objectstore/backup_utils.py index deb0943..88734d4 100644 --- a/disk_objectstore/backup_utils.py +++ b/disk_objectstore/backup_utils.py @@ -31,7 +31,7 @@ def split_remote_and_path(dest: str): if len(split_dest) == 2: return split_dest[0], Path(split_dest[1]) # more than 1 colon: - raise BackupError("Invalid destination format: :") + raise ValueError("Invalid destination format: :") def is_exe_found(exe: str) -> bool: From c80a499cec98aa33b29f9bfe383ab2e405b6b285 Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Thu, 11 Jan 2024 11:45:56 +0200 Subject: [PATCH 25/34] Update disk_objectstore/cli.py Co-authored-by: Sebastiaan Huber --- disk_objectstore/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/disk_objectstore/cli.py b/disk_objectstore/cli.py index fd8e8d7..57e929f 100644 --- a/disk_objectstore/cli.py +++ b/disk_objectstore/cli.py @@ -253,6 +253,6 @@ def backup( backup_manager, container, path, prev ) ) - except backup_utils.BackupError as e: + except (ValueError, backup_utils.BackupError) as e: click.echo(f"Error: {e}") sys.exit(1) From 620693c1356e41b308ede3915133a3ba0cbe79f1 Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Thu, 11 Jan 2024 11:46:22 +0200 Subject: [PATCH 26/34] Update disk_objectstore/backup_utils.py Co-authored-by: Sebastiaan Huber --- disk_objectstore/backup_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/disk_objectstore/backup_utils.py b/disk_objectstore/backup_utils.py index 88734d4..ef51898 100644 --- a/disk_objectstore/backup_utils.py +++ b/disk_objectstore/backup_utils.py @@ -190,9 +190,7 @@ def call_rsync( # pylint: disable=too-many-arguments res.stderr, ) - success = res.returncode == 0 - - if not success: + if res.returncode != 0: raise BackupError(f"rsync failed for: {str(src)} to {str(dest)}") # ---- From 60cd4297c3c83017bf430c4b620b72d040cd9b8b Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Thu, 11 Jan 2024 11:47:35 +0200 Subject: [PATCH 27/34] Update disk_objectstore/cli.py Co-authored-by: Sebastiaan Huber --- disk_objectstore/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/disk_objectstore/cli.py b/disk_objectstore/cli.py index 57e929f..5c373d7 100644 --- a/disk_objectstore/cli.py +++ b/disk_objectstore/cli.py @@ -195,7 +195,7 @@ def optimize( help="Number of previous backups to keep in the destination.", ) @click.option( - "--rsync_exe", + "--rsync-exe", default="rsync", help="Specify the 'rsync' executable, if not in PATH. Used for both local and remote destinations.", ) From a699a79d3dd9275171a1287ae977498159cabe02 Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Thu, 11 Jan 2024 11:47:53 +0200 Subject: [PATCH 28/34] Update docs/pages/backup.md Co-authored-by: Sebastiaan Huber --- docs/pages/backup.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/pages/backup.md b/docs/pages/backup.md index fd2cfed..8f33d45 100644 --- a/docs/pages/backup.md +++ b/docs/pages/backup.md @@ -37,7 +37,7 @@ Usage: dostore backup [OPTIONS] DEST Options: --keep INTEGER Number of previous backups to keep in the destination. (default: 1) - --rsync_exe TEXT Specify the 'rsync' executable, if not in PATH. Used for + --rsync-exe TEXT Specify the 'rsync' executable, if not in PATH. Used for both local and remote destinations. --verbosity TEXT Set verbosity [silent|info|debug], default is 'info'. --help Show this message and exit. From ae367a889875ffda19054221f5e7de51778b0b59 Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Thu, 11 Jan 2024 11:53:48 +0200 Subject: [PATCH 29/34] update dbackup docs --- docs/pages/backup.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/pages/backup.md b/docs/pages/backup.md index fd2cfed..1e4f4c1 100644 --- a/docs/pages/backup.md +++ b/docs/pages/backup.md @@ -47,12 +47,12 @@ Options: Example usage: ```console -$ dostore backup /path/to/backup +$ dostore --path /path/to/container backup /path/to/backup INFO:Last backup is '/path/to/backup/backup_20231207142602_ymqf', using it for rsync --link-dest. -INFO:Transferred /path/to/dostore/loose to /path/to/backup/live-backup +INFO:Transferred /path/to/container/loose to /path/to/backup/live-backup INFO:Dumped the SQLite database to /tmp/tmpgewwse3f/packs.idx INFO:Transferred SQLite database to /path/to/backup/live-backup -INFO:Transferred /path/to/dostore/packs to /path/to/backup/live-backup +INFO:Transferred /path/to/container/packs to /path/to/backup/live-backup INFO:Backup moved from '/path/to/backup/live-backup' to '/path/to/backup/backup_20231207142913_pz7m'. INFO:Added symlink 'last-backup' to 'backup_20231207142913_pz7m'. INFO:Deleted old backup: /path/to/backup/backup_20231207131741_zar7 From a33cc98079ef2db25ca186564ce9e780013b48e5 Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Thu, 11 Jan 2024 11:59:26 +0200 Subject: [PATCH 30/34] change cli docstring --- disk_objectstore/cli.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/disk_objectstore/cli.py b/disk_objectstore/cli.py index 5c373d7..9fdfabe 100644 --- a/disk_objectstore/cli.py +++ b/disk_objectstore/cli.py @@ -209,8 +209,10 @@ def optimize( def backup( dostore: ContainerContext, dest: str, keep: int, rsync_exe: str, verbosity: str ): - """Create a backup of the container to destination location DEST, in a subfolder - backup__ and point a symlink called `last-backup` to it. + """Create a backup of the container. + + The backup is created at the `DEST` destination, in a subfolder + backup__ and a symlink `last-backup` is created to it in the same folder. NOTE: This is safe to run while the container is being used. From 94a37a5af072d528dff8a93f6d86691b90102773 Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Thu, 11 Jan 2024 12:13:08 +0200 Subject: [PATCH 31/34] remove verbosity check --- disk_objectstore/cli.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/disk_objectstore/cli.py b/disk_objectstore/cli.py index 9fdfabe..fabb957 100644 --- a/disk_objectstore/cli.py +++ b/disk_objectstore/cli.py @@ -238,9 +238,6 @@ def backup( backup_utils.backup_logger.setLevel(logging.INFO) elif verbosity == "debug": backup_utils.backup_logger.setLevel(logging.DEBUG) - else: - click.echo("Unsupported verbosity.") - sys.exit(1) with dostore.container as container: try: From ddd915079517b89454400c802e0d2f32b620a24a Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Thu, 11 Jan 2024 12:15:46 +0200 Subject: [PATCH 32/34] Update disk_objectstore/backup_utils.py Co-authored-by: Sebastiaan Huber --- disk_objectstore/backup_utils.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/disk_objectstore/backup_utils.py b/disk_objectstore/backup_utils.py index ef51898..70eb51a 100644 --- a/disk_objectstore/backup_utils.py +++ b/disk_objectstore/backup_utils.py @@ -160,11 +160,8 @@ def call_rsync( # pylint: disable=too-many-arguments if link_dest: if not self.remote: # for local paths, use resolve() to get absolute path - link_dest_str = str(link_dest.resolve()) - else: - # for remote paths, we require absolute paths anyways - link_dest_str = str(link_dest) - all_args += [f"--link-dest={link_dest_str}"] + link_dest = link_dest.resolve() + all_args += [f"--link-dest={link_dest}"] if src_trailing_slash: all_args += [str(src) + "/"] From 54c1e23fd6ccd9861565980d0a35b4dc1720e685 Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Thu, 11 Jan 2024 12:25:41 +0200 Subject: [PATCH 33/34] move validate to constructor; use valueerror instead of backuperror --- disk_objectstore/backup_utils.py | 36 ++++++++++++++------------------ 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/disk_objectstore/backup_utils.py b/disk_objectstore/backup_utils.py index ef51898..e5aa0a2 100644 --- a/disk_objectstore/backup_utils.py +++ b/disk_objectstore/backup_utils.py @@ -67,24 +67,10 @@ def __init__( if "rsync" not in self.exes: self.exes["rsync"] = "rsync" - self.validate() + # Validate the backup config inputs - def check_if_remote_accessible(self): - """Check if remote host is accessible via ssh""" - self.logger.info(f"Checking if '{self.remote}' is accessible...") - success = self.run_cmd(["exit"])[0] - if not success: - raise BackupError(f"Remote '{self.remote}' is not accessible!") - self.logger.info("Success! '%s' is accessible!", self.remote) - - def check_path_exists(self, path: Path) -> bool: - cmd = ["[", "-e", str(path), "]"] - return self.run_cmd(cmd)[0] - - def validate(self): - """Validate the backup config inputs""" if self.keep < 0: - raise BackupError( + raise ValueError( "Input validation failed: keep variable can't be negative!" ) @@ -94,17 +80,27 @@ def validate(self): if self.exes: for _, path in self.exes.items(): if not is_exe_found(path): - raise BackupError( - f"Input validation failed: {path} not accessible." - ) + raise ValueError(f"Input validation failed: {path} not accessible.") if not self.check_path_exists(self.path): success = self.run_cmd(["mkdir", str(self.path)])[0] if not success: - raise BackupError( + raise ValueError( f"Input validation failed: Couldn't access/create '{str(self.path)}'!" ) + def check_if_remote_accessible(self): + """Check if remote host is accessible via ssh""" + self.logger.info(f"Checking if '{self.remote}' is accessible...") + success = self.run_cmd(["exit"])[0] + if not success: + raise BackupError(f"Remote '{self.remote}' is not accessible!") + self.logger.info("Success! '%s' is accessible!", self.remote) + + def check_path_exists(self, path: Path) -> bool: + cmd = ["[", "-e", str(path), "]"] + return self.run_cmd(cmd)[0] + def run_cmd( self, args: list, From ac10458a7084dd209c91fb2a18cfed1a0e8f3b23 Mon Sep 17 00:00:00 2001 From: Kristjan Eimre Date: Thu, 11 Jan 2024 12:42:40 +0200 Subject: [PATCH 34/34] adapt tests --- tests/test_backup.py | 8 ++++---- tests/test_cli.py | 23 ----------------------- 2 files changed, 4 insertions(+), 27 deletions(-) diff --git a/tests/test_backup.py b/tests/test_backup.py index 600136f..a8b199f 100644 --- a/tests/test_backup.py +++ b/tests/test_backup.py @@ -24,7 +24,7 @@ def _random_string(n=10): def test_invalid_destination(): """Test invalid destination with two colons.""" dest = "localhost:/tmp/test:" - with pytest.raises(BackupError, match="Invalid destination format"): + with pytest.raises(ValueError, match="Invalid destination format"): BackupManager(dest, backup_utils.backup_logger) @@ -38,7 +38,7 @@ def test_inaccessible_remote(): def test_negative_keep(): """Test a negative keep value.""" dest = "/tmp/test" - with pytest.raises(BackupError, match="keep variable can't be negative"): + with pytest.raises(ValueError, match="keep variable can't be negative"): BackupManager(dest, backup_utils.backup_logger, keep=-1) @@ -46,14 +46,14 @@ def test_inaccessible_exe(): """Test case where rsync is not accessible.""" dest = "/tmp/test" rsync_exe = f"_{_random_string()}" - with pytest.raises(BackupError, match=f"{rsync_exe} not accessible."): + with pytest.raises(ValueError, match=f"{rsync_exe} not accessible."): BackupManager(dest, backup_utils.backup_logger, exes={"rsync": rsync_exe}) def test_inaccessible_path(): """Test case where path is not accessible.""" dest = f"/_{_random_string()}" # I assume there will be a permission error for this path - with pytest.raises(BackupError, match=f"Couldn't access/create '{dest}'"): + with pytest.raises(ValueError, match=f"Couldn't access/create '{dest}'"): BackupManager(dest, backup_utils.backup_logger) diff --git a/tests/test_cli.py b/tests/test_cli.py index 0162652..5b7e2b9 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -288,29 +288,6 @@ def test_backup_repeated(temp_container, temp_dir, remote): assert len(backup_dirs) == 2 -@pytest.mark.skipif( - platform.system() == "Windows", reason="Backup not supported on Windows" -) -def test_backup_unsupported_verbosity(temp_container, temp_dir): - """Test failure when providing unsupported verbosity value""" - - temp_container.init_container(clear=True) - # Add a few objects - for idx in range(100): - temp_container.add_object(f"test-{idx}".encode()) - - obj = cli.ContainerContext(temp_container.get_folder()) - - path = Path(temp_dir) / "backup" - - args = [str(path), "--verbosity=unsupported"] - - result = CliRunner().invoke(cli.backup, args, obj=obj) - - assert result.exit_code == 1 - assert "Unsupported verbosity" in result.stdout - - @pytest.mark.skipif( platform.system() == "Windows", reason="Backup not supported on Windows" )