From faa29096c6b14729c39eef6dd8ffad5cb21581b5 Mon Sep 17 00:00:00 2001 From: Noam Ross Date: Mon, 27 Nov 2023 18:06:42 -0500 Subject: [PATCH 1/8] Major refactor --- .github/CONTRIBUTING.md | 3 - .github/workflows/R-CMD-check.yaml | 2 +- .gitignore | 1 + .lintr | 7 +- DESCRIPTION | 13 +- NAMESPACE | 28 ++-- R/cache.R | 165 ++++++++++++++++++++++++ R/commits_between.R | 87 ------------- R/dir_ls_version.R | 65 ---------- R/example_repo.R | 23 ++-- R/example_utils_txt.R | 3 +- R/extract.R | 71 ---------- R/extract_relic.R | 33 +++++ R/file_copy_version.R | 91 ------------- R/file_read_version.R | 41 ------ R/get_file_version.R | 39 ++++++ R/git_to_disk.R | 48 +++++++ R/github_to_disk.R | 56 ++++++++ R/make_relic.R | 130 +++++++++++++++++++ R/path_in_cache.R | 37 ++++++ R/relic-package.R | 7 +- R/relic.R | 27 ---- R/s3_to_disk.R | 6 + R/tar_exists_version.R | 24 ---- R/tar_meta_version.R | 32 +++-- R/tar_read_version.R | 132 +++++++++++-------- R/utils.R | 61 +++++---- inst/WORDLIST | 3 + inst/cache_timestamp.rds | Bin 0 -> 95 bytes man/commits_between.Rd | 38 ------ man/create_example_repo.Rd | 8 +- man/dir_ls_version.Rd | 65 ---------- man/file_copy_version.Rd | 78 ----------- man/file_read_version.Rd | 37 ------ man/get_file_version.Rd | 38 ++++++ man/is_relic.Rd | 14 -- man/relic_cache.Rd | 56 ++++++-- man/tar_exists_version.Rd | 38 ------ man/tar_meta_version.Rd | 32 +++-- man/tar_read_version.Rd | 64 ++------- tests/testthat/helper.R | 24 ++++ tests/testthat/setup.R | 21 +-- tests/testthat/test-cache.R | 5 +- tests/testthat/test-file_read_version.R | 14 +- tests/testthat/tests-strict.R | 53 ++++++++ 45 files changed, 916 insertions(+), 904 deletions(-) create mode 100644 R/cache.R delete mode 100644 R/commits_between.R delete mode 100644 R/dir_ls_version.R delete mode 100644 R/extract.R create mode 100644 R/extract_relic.R delete mode 100644 R/file_copy_version.R delete mode 100644 R/file_read_version.R create mode 100644 R/get_file_version.R create mode 100644 R/git_to_disk.R create mode 100644 R/github_to_disk.R create mode 100644 R/make_relic.R create mode 100644 R/path_in_cache.R delete mode 100644 R/relic.R create mode 100644 R/s3_to_disk.R delete mode 100644 R/tar_exists_version.R create mode 100644 inst/cache_timestamp.rds delete mode 100644 man/commits_between.Rd delete mode 100644 man/dir_ls_version.Rd delete mode 100644 man/file_copy_version.Rd delete mode 100644 man/file_read_version.Rd create mode 100644 man/get_file_version.Rd delete mode 100644 man/is_relic.Rd delete mode 100644 man/tar_exists_version.Rd create mode 100644 tests/testthat/helper.R create mode 100644 tests/testthat/tests-strict.R diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index fd81a0a..1229a20 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -26,9 +26,6 @@ This outlines how to propose a change to `relic`. should not call `libgit2` directly nor call `git` via the command line. If `git2r` does not expose needed functionality in `libgit2`, consider making a contribution to `git2r`. -- In general `relic` only deals with the local git repository. It does not - interface with remote repositories, nor interact with the APIs of services like - GitLab, GitHub, or gitea. ## Testing diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 3160cc9..f26309a 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -36,7 +36,7 @@ jobs: use-public-rspm: true - uses: r-lib/actions/setup-r-dependencies@v2 with: - extra-packages: any::sessioninfo, any::rcmdcheck, any::pkgdepends, any::covr, ropensci-review-tools/pkgcheck + extra-packages: any::sessioninfo, any::rcmdcheck, any::pkgdepends, any::covr, any::ropensci-review-tools/pkgcheck, any::riskmetric needs: check - name: Run just package tests and record test coverage run: | diff --git a/.gitignore b/.gitignore index 3b4034a..5b7d215 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ docs tests/testthat/*.log check artifacts +relic-test ### R ### # History files *Rhistory diff --git a/.lintr b/.lintr index 3d4eb22..9897437 100644 --- a/.lintr +++ b/.lintr @@ -1,2 +1,7 @@ -linters: linters_with_defaults() # see vignette("lintr") +linters: linters_with_defaults(line_length_linter = NULL, indentation_linter = NULL) +exclusions: list( + "inst/scratch", + "inst/example-repo" = list(undesirable_function_linter = Inf) + ) encoding: "UTF-8" + diff --git a/DESCRIPTION b/DESCRIPTION index 2ef6df7..988e125 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -7,7 +7,8 @@ Authors@R: c( person("EcoHealth Alliance", role = c("cph", "fnd")) ) Description: The 'relic' package provides tools for extracting files and - objects from the history of a git repository. It is a high-level + objects from the revision history, including local and remote git + repositories and S3 buckets. It is a high-level interface designed to enable comparison of objects in reproducible research workflows, especially pipelines that use the 'targets' package. @@ -18,17 +19,20 @@ BugReports: https://github.com/ecohealthalliance/relic/issues Imports: fs, git2r, + gh, rlang Suggests: callr, - devtag, glue, knitr, lintr, minioclient, - paws, + paws.storage, + openssl, rmarkdown, + rprojroot, spelling, + styler, targets, testthat (>= 3.0.0), withr @@ -36,8 +40,7 @@ VignetteBuilder: knitr Remotes: cboettig/minioclient#14, - ropensci/git2r, - moodymudskipper/devtag + ropensci/git2r Config/testthat/edition: 3 Encoding: UTF-8 Language: en-US diff --git a/NAMESPACE b/NAMESPACE index 2f0ba55..d633dbe 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,26 +1,28 @@ # Generated by roxygen2: do not edit by hand -export(commits_between) +S3method(extract_relic,default) +S3method(extract_relic,relic_git_blob) +S3method(extract_relic,relic_git_tree) +S3method(extract_relic,relic_github_blob) +S3method(extract_relic,relic_github_tree) +S3method(extract_relic,relic_s3_obj) export(create_example_repo) -export(dir_ls_version) -export(dir_ls_versions) -export(file_copy_version) -export(file_copy_versions) -export(file_read_version) -export(file_read_versions) -export(relic) +export(get_file_version) export(relic_cache) -export(relic_cache_clear) -export(tar_exists_version) -export(tar_exists_version_raw) +export(relic_cache_cleanup) +export(relic_cache_cleanup_time) +export(relic_cache_delete) +export(relic_cache_max_age) +export(relic_cache_max_size) +export(relic_cache_regular_cleanup) export(tar_meta_version) export(tar_read_raw_version) -export(tar_read_raw_versions) export(tar_read_version) -export(tar_read_versions) import(fs) import(git2r) +importFrom(gh,gh) importFrom(rlang,abort) importFrom(rlang,check_installed) importFrom(rlang,inform) +importFrom(rlang,is_scalar_character) importFrom(rlang,warn) diff --git a/R/cache.R b/R/cache.R new file mode 100644 index 0000000..dc4c9d7 --- /dev/null +++ b/R/cache.R @@ -0,0 +1,165 @@ +#' The relic cache +#' +#' @description The relic cache directory stores files that have been retrieved +#' from both local and remote repositories to avoid repeated extractions or +#' downloads. Its location can be set with the environment variable +#' `RELIC_CACHE_DIR` or `options("relic.cache.dir")`, and it defaults to the +#' user cache directory. The cache is cleaned up regularly at package startup, +#' but can also be cleaned up manually with `relic_cache_cleanup()` or cleared +#' entirely with `relic_cache_delete()`. +#' @export +#' @return The path to the relic cache directory +#' @examples +#' relic_cache() +relic_cache <- function() { + path_norm(dir_create(Sys.getenv( + "RELIC_CACHE_DIR", + getOption("relic.cache.dir", + default = tools::R_user_dir("relic", "cache") + ) + ))) +} + +#' @export +#' @rdname relic_cache +relic_cache_delete <- function() { + dir_delete(relic_cache()) +} + +#' @param max_age The maximum age of files to keep in the cache, as a [difftime][base::difftime()] +#' object. Files older than this will be deleted. Defaults to `Inf`. Can be +#' set with the environment variable `RELIC_CACHE_MAX_AGE` or +#' `options("relic.cache.max.age")`, which take numeric time in days or a +#' string with units, e.g., "1 day" or "2 weeks". +#' @param max_size The maximum size of the cache, as a string that can be parsed +#' by [fs::fs_bytes()]. Defaults to "20 MB". Can be set with the environment +#' variable `RELIC_CACHE_MAX_SIZE` or `options("relic.cache.max.size")`. +#' Cached files will be deleted from oldest to youngest until the cache size +#' is under this limit. +#' @export +#' @rdname relic_cache +relic_cache_cleanup <- function(max_age = relic_cache_max_age(), max_size = relic_cache_max_size()) { + cache_all <- dir_info(relic_cache(), recurse = TRUE, include_dirs = TRUE, all = TRUE) + min_age <- Sys.time() - max_age + file_delete(cache_all[cache_all$modification_time < min_age, ]$path) + cache_all <- cache_all[cache_all$modification_time >= min_age, ] + + # Delete oldest files up until size is under max size + cache_files <- cache_all[cache_all$type %in% c("file"), ] + file_delete(cache_files[cumsum(cache_files$size) > max_size, ]$path) + + # Delete any symlinks that point to non-existent files + file_delete(cache_all[cache_all$type == "symlink" & !file_exists(cache_all$path), ]$path) + + # Delete empty directories recursively, by checking if their path is found in the path of any files + cache_dirs <- cache_all[cache_all$type %in% c("directory"), ] + for (dir in cache_dirs$path) { + if (dir_exists(dir) && !length(dir_ls(dir, all = TRUE))) { + dir_delete(dir) + } + } + # Delete any symlinks that point to non-existent files again to get rid of directories + file_delete(cache_all[cache_all$type == "symlink" & !file_exists(cache_all$path), ]$path) +} + +#' @param cleanup_time The time between cache cleanups, as a [difftime][base::difftime()] object. +#' Defaults to 1 day. Can be set with the environment variable +#' `RELIC_CACHE_CLEANUP_TIME` or `options("relic.cache.cleanup.time")`, which +#' take numeric time in days or a string with units, e.g., "1 day" or "2 +#' weeks". If set to "Inf", no cleanup will be performed at startup. +#' @export +#' @rdname relic_cache +relic_cache_regular_cleanup <- function(cleanup_time = relic_cache_cleanup_time()) { + cache_timestamp_file <- path(path_package("relic"), "cache_timestamp.rds") + if (!file_exists(cache_timestamp_file) || + readRDS(cache_timestamp_file) < (Sys.time() - cleanup_time)) { + relic_cache_cleanup() + } + saveRDS(Sys.time(), cache_timestamp_file) +} + +#' @export +#' @rdname relic_cache +relic_cache_max_size <- function() { + fs_bytes(Sys.getenv( + "RELIC_CACHE_MAX_SIZE", + getOption("relic.cache.max.size", + default = "20 MB" + ) + )) +} + +#' @export +#' @rdname relic_cache +relic_cache_max_age <- function() { + parse_age(Sys.getenv( + "RELIC_CACHE_MAX_AGE", + getOption("relic.cache.max.age", + default = Inf + ) + )) +} + +#' @export +#' @rdname relic_cache +relic_cache_cleanup_time <- function() { + parse_age(Sys.getenv( + "RELIC_CACHE_CLEANUP_TIME", + getOption("relic.cache.cleanup.time", + default = 1 + ) + )) +} + +parse_bool <- function(x) { + if (is.logical(x)) { + out <- x + } else if (is.numeric(x)) { + out <- (x != 0) + } else if (is.character(x)) { + x <- tolower(x) + if (x %in% c("true", "t", "yes", "y", "1")) { + out <- TRUE + } else if (x %in% c("false", "f", "no", "n", "0")) { + out <- FALSE + } + } else { + abort("Invalid boolean value: ", x) + } + out +} + + +# nolint start: cyclocomp_linter +parse_age <- function(x) { + if (is.na(x) || is.null(x) || !length(x) || !nzchar(x) || x == "Inf" || is.infinite(x)) { # nolint + return(as.difftime(Inf, units = "days")) + } else if (is.numeric(x)) { + return(as.difftime(x, units = "days")) + } + x <- strsplit(x, "\\s")[[1]] + units <- if (is.na(x[2])) "days" else x[2] + as.difftime(as.numeric(x[1]), units = units) +} +# nolint end + +cache_sha <- function() { + path(dir_create(relic_cache(), "sha")) +} + +cache_git <- function() { + path(dir_create(relic_cache(), "git")) +} + +cache_gh <- function() { + path(dir_create(relic_cache(), "gh")) +} + +cache_s3 <- function() { + path(dir_create(relic_cache(), "s3")) +} + + +relic_git_cache_path <- function(relic) { + path(relic_cache(), relic@commit$sha, relic@path) +} diff --git a/R/commits_between.R b/R/commits_between.R deleted file mode 100644 index 3dc145d..0000000 --- a/R/commits_between.R +++ /dev/null @@ -1,87 +0,0 @@ -#' List commits between two git commits -#' -#' Given two commit objects, find the commits between them. -#' -#' "Between" is bit of a slippery concept in git, since there may be multiple -#' paths between two commits. Internally it uses [git2r::commits()] which, in -#' turn using the [revwalk -#' API](https://libgit2.org/libgit2/#HEAD/group/revwalk), which is similar to -#' [git rev-list](https://git-scm.com/docs/git-rev-list). Essentially, it walks -#' back from the `to/from` commit (whichever is the descendant) until it reaches -#' the other. If there are multiple paths between the two commits, it will -#' return the commits on all paths. -#' -#' @param from A commit object or revision string. A string of the form -#' 'from...to' may also be passed, in which case the commits between the two -#' revision strings are used and the `to` argument is ignored. If a list of commits -#' is passed, these commits are used rather than calculating the commits between. -#' @param to A commit object or reference -#' @param filter_file The path to a file relative to the git directory. If not NULL, -#' only commits modifying this file will be returned. Note that modifying -#' commits that occurred before the file was given its present name are not -#' returned. -#' @param repo The path to the git repository -#' @return A list of commits -#' @export -commits_between <- function(from, to = NULL, filter_file = NULL, repo = ".") { - repo <- as_repo(repo) - if (is.list(from) && all(vapply(from, is_commit, logical(1)))) { - return(commits) - } - range <- commits_range(from, to, repo) - if (is.null(range[[2]])) { - return(list(range[[1]])) - } - - # Check which is a descendant of the other - to_descendant <- descendant_of(commit = range[[2]], ancestor = range[[1]]) - from_descendant <- descendant_of(commit = range[[1]], ancestor = range[[2]]) - if (!to_descendant && !from_descendant) stop("The two commits are not related") - descendant <- if (to_descendant) range[[2]] else range[[1]] - ancestor <- if (to_descendant) range[[1]] else range[[2]] - - # Find a branch the descendant is on - branches <- branches(repo, flags = "local") - branch_matches <- vapply(branches, function(branch) { - tip <- lookup_commit(branch) - identical(tip, descendant) || - descendant_of(commit = tip, ancestor = descendant) - }, logical(1)) - if (!any(branch_matches)) stop("The descendant commit is not on any branch") - ref_branch <- names(branches)[branch_matches][[1]] - - commits <- commits( - repo = repo, - ref = ref_branch, - path = path - ) - # Filter out commits that are not descendants of the ancestor or - # are descendants of the descendant - commits <- Filter(function(commit) { - identical(commit, descendant) || - identical(commit, ancestor) || - (descendant_of(commit = commit, ancestor = ancestor) && - !descendant_of(commit = commit, ancestor = descendant)) - }, commits) - - if (to_descendant) commits <- rev(commits) - - commits -} - -#' Parse 1 or 2 commits or revision strings into two commits -#' @noRd -commits_range <- function(from, to, repo) { - if (is.character(from)) { - refs <- strsplit(from, "...", fixed = TRUE)[[1]] - if (length(refs) == 2) { - from <- revparse_single(repo, refs[[1]]) - to <- revparse_single(repo, refs[[2]]) - } - } - # Look up the commit objects if references are given - if (is.character(from)) from <- revparse_single(repo, from) - if (is.character(to)) to <- revparse_single(repo, to) - - list(from, to) -} diff --git a/R/dir_ls_version.R b/R/dir_ls_version.R deleted file mode 100644 index ce4e64c..0000000 --- a/R/dir_ls_version.R +++ /dev/null @@ -1,65 +0,0 @@ -#' List files and folders in a git repository -#' -#' @inheritParams file_read_version -#' @param all If TRUE hidden files are also returned -#' @param recurse If TRUE recurse fully, if a positive number the number of levels to recurse -#' @param type One or more of "any", "file", "directory", "symlink", or "submodule" -#' @param regexp A regular expression (e.g. '\\.csv$'⁠) passed on to grep() to filter paths. -#' @param glob A wildcard aka globbing pattern (e.g. '*.csv'⁠) passed on to grep() to filter paths -#' @param invert If TRUE, return paths that do not match the pattern or glob -#' @return A character vector of paths -#' @export -dir_ls_version <- function(path = ".", ref = "HEAD", all = FALSE, recurse = FALSE, type = "any", regexp = NULL, glob = NULL, invert = FALSE, repo = ".") { - commit <- as_commit(ref, repo) - tree <- get_obj_at_commit(path, commit) - if (!is_tree(tree)) abort("Path is not a directory") - if (!is.numeric(recurse)) recurse <- if (recurse) Inf else 0 - - modelist <- list( - file = c("100644", "100755"), - directory = "040000", - symlink = "120000", - submodules = "160000" - ) - modes <- if (type == "any") unlist(modelist) else unlist(modelist[type]) - - paths <- dir_ls_recurse(tree, recurse, modes) - - if (!all) { - paths <- path_filter(paths, regexp = "^[^\\.].*$") - } - paths <- path_filter(paths, glob = glob, regexp = regexp, invert = invert) - paths -} - -dir_ls_recurse <- function(tree, recurse, modes) { - paths <- tree$name[sprintf("%06o", tree$filemode) %in% modes] - if (recurse > 0) { - dirs <- tree$name[tree$type == "tree"] - if (!length(dirs)) { - return(paths) - } - trees <- tree[tree$type == "tree"] - if (is_tree(trees)) trees <- list(trees) - new_paths <- lapply(trees, function(x) dir_ls_recurse(x, recurse - 1, modes)) - for (i in seq_along(dirs)) new_paths[[i]] <- path(dirs[i], new_paths[[i]]) - paths <- c(paths, unlist(new_paths)) - } - path(paths) -} - -#' @inheritParams commits_between -#' @export -#' @rdname dir_ls_version -dir_ls_versions <- function(path, from = "HEAD", to = NULL, all = FALSE, recurse = FALSE, type = "any", regexp = NULL, glob = NULL, repo = ".") { - commits <- commits_between(from = from, to = to, repo = repo) - versions <- list() - for (i in seq_along(commits)) { - versions[[i]] <- dir_ls_version(path, ref = commits[[i]], all = all, recurse = recurse, type = type, regexp = regexp, glob = glob, repo = repo) - } - names(versions) <- vapply(commits, sha, character(1)) - versions -} - - -## TODO make a dir_tree_version that prints diff --git a/R/example_repo.R b/R/example_repo.R index 0b8fb2c..7eda372 100644 --- a/R/example_repo.R +++ b/R/example_repo.R @@ -9,11 +9,11 @@ #' @param overwrite Whether to overwrite the directory if it already exists. #' @return The path to the created repository #' @export -#' @examplesIf rlang::is_installed("targets") +#' @examplesIf rlang::is_installed("targets") && rlang::is_interactive() #' -#' example_repo <- create_example_repo(s3 = FALSE) -#' fs::dir_ls(example_repo, all = TRUE) -#' dir_ls_version(".", "initial-target-file", repo = example_repo) +#' example_repo <- create_example_repo(s3 = FALSE) +#' fs::dir_ls(example_repo, all = TRUE) +#' dir_ls_version(".", "initial-target-file", repo = example_repo) create_example_repo <- function(dir = fs::file_temp("relic_example_"), reporter = "silent", s3 = TRUE, overwrite = TRUE) { check_installed(c("targets", "glue", "withr")) if (dir_exists(dir)) if (overwrite) dir_delete(dir) else abort("Directory already exists") @@ -47,10 +47,10 @@ create_example_repo <- function(dir = fs::file_temp("relic_example_"), reporter targets::tar_make(reporter = reporter) stamp("longer-cars") - if(s3) { - insert_lines_at( - "_targets.R", 2, - 'Sys.setenv( + if (s3) { + insert_lines_at( + "_targets.R", 2, + 'Sys.setenv( AWS_ACCESS_KEY_ID="minioadmin", AWS_SECRET_ACCESS_KEY="minioadmin", AWS_DEFAULT_REGION="us-east-1" @@ -64,9 +64,10 @@ create_example_repo <- function(dir = fs::file_temp("relic_example_"), reporter )), repository = "aws" ) - ') - targets::tar_make(reporter = reporter) - stamp("setup-s3") + ' + ) + targets::tar_make(reporter = reporter) + stamp("setup-s3") } dir diff --git a/R/example_utils_txt.R b/R/example_utils_txt.R index 6c4ccc0..8a90e9d 100644 --- a/R/example_utils_txt.R +++ b/R/example_utils_txt.R @@ -24,7 +24,6 @@ append_to_file <- function(path, text) { overwrite_at_line <- function(path, line, text) { file_lines <- readLines(path) text <- strsplit(glue::glue(text, .open = "<<", .close = ">>", ), "\n")[[1]] - # text_lines <- seq_along(text) + line - 1 for (i in seq_along(text)) { file_lines[line + i - 1] <- text[i] } @@ -45,6 +44,6 @@ delete_lines <- function(path, lines) { insert_lines_at <- function(path, line, text) { file_lines <- readLines(path) text <- strsplit(glue::glue(text, .open = "<<", .close = ">>", ), "\n")[[1]] - file_lines <- c(file_lines[1:(line-1)], text, file_lines[(line):length(file_lines)]) + file_lines <- c(file_lines[1:(line - 1)], text, file_lines[(line):length(file_lines)]) writeLines(file_lines, path) } diff --git a/R/extract.R b/R/extract.R deleted file mode 100644 index d1993f8..0000000 --- a/R/extract.R +++ /dev/null @@ -1,71 +0,0 @@ -# Lower-level functions to extract files from a commit -get_obj_at_commit <- function(path, commit) { - splits <- path_split(path)[[1]] - obj <- tree(commit) - if (path == ".") { - return(obj) - } - - for (i in seq_along(splits)) { - parent <- obj - obj <- obj[splits[i]] - if (rlang::is_empty(obj)) { - return(NULL) - } - } - - if (is_blob(obj)) { - return(relic(obj, mode = parent$filemode[parent$id == obj$sha])) - } else { - return(obj) - } -} - -read_blob <- function(blob) { - content(blob, split = FALSE, raw = git2r::is_binary(blob)) -} - -#' Write out the contents of a blob to a file -#' Will always create the path and overwrite existing files -#' @noRd -blob_to_file <- function(relic, path, mode = attr(relic, "mode")) { - contents <- read_blob(relic) - dir_create(path_dir(path)) - if (mode == "120000") { - link_create(contents, path) - } else if (mode %in% c("100644", "100755")) { - if (is.raw(contents)) { - writeBin(contents, path) - } else { - cat(contents, file = path) - } - if (mode == "100755") { - file_chmod(path, mode = "+x") - } - } else { - abort("Unknown blob mode: ", mode, ". Must be one of 100644, 100755, 120000") - } - - path -} - -tree_to_dir <- function(obj, path, recurse = TRUE) { - dir_create(path) - if (!is.numeric(recurse)) recurse <- (if (recurse) Inf else 0) + 1 - while (recurse > 0) { - recurse <- recurse - 1 - for (i in seq_along(obj)) { - obj_i <- obj[i] - name_i <- tree$name[i] - path_i <- path(dir, name_i) - if (is_tree(obj_i)) { - dir_create(path_i) - tree_to_dir(obj_i, path_i, recurse) - } else if (is_blob(obj_i)) { - blob_to_file(obj_i, path_i, sprintf("%06o", tree$filemode[i])) - } else { - abort("Object is not a git blob or tree") - } - } - } -} diff --git a/R/extract_relic.R b/R/extract_relic.R new file mode 100644 index 0000000..07749d2 --- /dev/null +++ b/R/extract_relic.R @@ -0,0 +1,33 @@ +extract_relic <- function(relic, ...) { + UseMethod("extract_relic") +} + +#' @export +extract_relic.default <- function(relic, to, ...) { + stop("Unknown relic type.") +} + +#' @export +extract_relic.relic_git_blob <- function(relic, to, ...) { + git_blob_to_file(relic$obj, to, relic$mode) +} + +#' @export +extract_relic.relic_git_tree <- function(relic, to, recurse = TRUE, ...) { + git_tree_to_dir(relic$obj, to, recurse) +} + +#' @export +extract_relic.relic_github_blob <- function(relic, to, ...) { + github_blob_to_file(relic$path, to, relic$mode, relic$commit_sha, relic$repo$owner, relic$repo$repo) +} + +#' @export +extract_relic.relic_github_tree <- function(relic, to, recurse = TRUE, ...) { + github_tree_to_dir(relic$path, relic$obj, to, recurse, relic$commit_sha, relic$repo$owner, relic$repo$repo) +} + +#' @export +extract_relic.relic_s3_obj <- function(relic, to, ...) { + s3_obj_to_file(relic$path, relic$commit_sha, relic$bucket, to, relic$endpoint, relic$region) +} diff --git a/R/file_copy_version.R b/R/file_copy_version.R deleted file mode 100644 index d358a9b..0000000 --- a/R/file_copy_version.R +++ /dev/null @@ -1,91 +0,0 @@ -#' Copy files from git history -#' -#' Copy files from git history to disk. Directories are copied recursively. -#' -#' The default behavior is to copy the file to a cache directory named after -#' both the commit and and the file path. These paths are universally unique, -#' so if `use_cache = TRUE`, the file will not be re-copied if it already exists -#' at that path. -#' -#' @inheritParams file_read_version -#' @param name the name to give the file when it is copied to disk. Defaults to -#' the original name of the file or directory. -#' @param dir The path to the directory where the file should be copied. -#' Defaults to a system-level temporary directory named after the SHA of the -#' git commit. Will be created if does not exist. -#' @param full_name If TRUE, the file will be written to the full relative path -#' of its location in the git repository, below `dir`. Defaults to TRUE if -#' both `name` and `dir` are NULL, FALSE otherwise. -#' @param recurse If `path` is a directory, should should it be copied recursively? If -#' numeric, how many levels deep should it be copied? Defaults to TRUE. -#' @param use_cache Should the cache be used? Defaults to TRUE. -#' @return -#' - For `file_copy_version()`, A character vector of paths to the copied files or directories. -#' - For `file_copy_versions()`, a list of character vectors of paths. -#' Where files do not exist in a commit, `NA` values are returned. -#' @export -file_copy_version <- function(path, ref = "HEAD", name = NULL, dir = NULL, full_name = NULL, recurse = TRUE, repo = ".", use_cache = TRUE) { - path(vapply(path, \(x) file_copy_version_single(x, ref, name, dir, full_name, recurse, repo, use_cache), character(1))) -} - -#' @inheritParams commits_between -#' @export -#' @rdname file_copy_version -file_copy_versions <- function(path, from = "HEAD", to = NULL, name = NULL, dir = NULL, full_name = NULL, recurse = TRUE, repo = ".", use_cache = TRUE) { - commits <- commits_between(from = from, to = to, repo = repo) - versions <- lapply(commits, \(x) file_copy_version(path, x, name, dir, full_name, recurse, repo, use_cache)) - names(versions) <- vapply(commits, sha, character(1)) - versions -} - -file_copy_version_single <- function(path, ref, name, dir, full_name, recurse, repo, use_cache) { - commit <- as_commit(ref, repo) - - if (is.null(full_name)) full_name <- is.null(name) && is.null(dir) - if (is.null(name)) name <- path_file(path) - if (is.null(dir)) dir <- path(relic_cache(), sha(commit)) - if (full_name) dir <- path(dir, path_dir(path)) - - out_path <- path_norm(path(dir, name)) - - obj <- get_obj_at_commit(path, commit) - if (is_none(obj)) { - return(NA_character_) - } - - if (is_tree(obj)) { - dir_create(out_path) - return(file_copy_recursive(obj, out_path, recurse, repo, use_cache)) - } else if (is_blob(obj)) { - if (!file_exists(out_path) || !use_cache) { - blob_to_file(obj, out_path, attr(obj, "mode")) - } - } else { - abort("Path is not a file or directory") - } - - out_path -} - -file_copy_recursive <- function(obj, dir, recurse, repo, use_cache) { - if (!is.numeric(recurse)) recurse <- if (recurse) Inf else 0 - - if (recurse > 0) { - for (i in seq_along(obj)) { - obj_i <- obj[i] - name_i <- obj$name[i] - path_i <- path(dir, name_i) - if (inherits(obj_i, "git_tree")) { - dir_create(path_i) - file_copy_recursive(obj_i, dir = path_i, recurse = recurse - 1) - } else if (inherits(obj_i, "git_blob")) { - if (!file_exists(path_i) || !use_cache) { - blob_to_file(obj_i, path_i) - } - } else { - abort("Object is not a git blob or tree") - } - } - } - dir -} diff --git a/R/file_read_version.R b/R/file_read_version.R deleted file mode 100644 index 6f0eead..0000000 --- a/R/file_read_version.R +++ /dev/null @@ -1,41 +0,0 @@ -#' Read in files from git history -#' -#' Reads the contents of a file or files from a git repository into memory. If -#' the file does not exist, or is a directory, NULL will be returned. -#' -#' @param path A path or vector of paths of files and/or folders to extract, -#' relative to the git directory -#' @param ref A git commit SHA, tag, branch or other [revision -#' string](https://git-scm.com/docs/git-rev-parse.html#_specifying_revisions): -#' such as "HEAD~1", or a [git2r::commit()] object. Defaults to "HEAD". -#' @param repo The path to the git repository or a [git2r::repository()] object. -#' @export -#' @rdname file_read_version -#' @return -#' - For `file_read_version()`, a list containing the file contents of each path, named by the path. Each will be a length-1 character vector, or raw vector for binary files. -#' - For `file_read_versions()`, a list list of lists, named by the commit SHA. -file_read_version <- function(path, ref = "HEAD", repo = ".") { - contents <- lapply(path, \(x) file_read_version_single(x, ref, repo)) - names(contents) <- path - contents -} - -#' @export -#' @inheritParams commits_between -#' @rdname file_read_version -file_read_versions <- function(path, from = "HEAD", to = NULL, repo = ".") { - commits <- commits_between(from, to, repo) - versions <- lapply(commits, \(x) file_read_version(path, x, repo)) - names(versions) <- vapply(commits, sha, character(1)) - versions -} - -file_read_version_single <- function(path, ref, repo) { - commit <- as_commit(ref, repo) - obj <- get_obj_at_commit(path, commit) - if (is_tree(obj) || rlang::is_empty(obj)) { - return(NULL) - } else if (is_blob(obj)) { - return(read_blob(obj)) - } -} diff --git a/R/get_file_version.R b/R/get_file_version.R new file mode 100644 index 0000000..58e2949 --- /dev/null +++ b/R/get_file_version.R @@ -0,0 +1,39 @@ +#' Get in files from versioned repository +#' +#' Fetches the contents of a file from a versioned repository - a local git +#' repository, a GitHub repository, or an S3 bucket. Always fetches the file +#' to a local cache and returns the path to it. +#' +#' @param path Path of a file relative to the git directory or bucket +#' @param ref A git commit SHA, tag, branch or other [revision +#' string](https://git-scm.com/docs/git-rev-parse.html#_specifying_revisions): +#' such as "HEAD~1", "main@{2023-02-26 18:30:00}", or "branch@{yesterday}". +#' Can also be a [git2r::commit()] object. For S3 buckets, a VersionID. +#' Defaults to "HEAD", which also means the latest version in an S3 bucket. +#' @param repo The repository to get the file from. This can be a local git +#' directory, a GitHub repository (URL or "owner/repo" string), or an S3 bucket +#' indicated by "s3://bucket-name". Defaults to the current working directory. +#' @param endpoint,region For S3 buckets, the endpoint and region of the bucket. +#' If NULL, the default endpoint and region in local config or environment variables are used. +#' (Usually `us-east-1` and `s3.amazonaws.com`.) +#' @export +#' @return A [path][fs::path()] to the file in the local cache. +get_file_version <- function(path, ref = "HEAD", repo = ".", endpoint = NULL, region = NULL) { + if (!is_scalar_character(path)) { + stop("path must be a single string") + } + + # Check if the file is in the cache already + in_cache <- path_in_cache(path, ref) + if (!is.na(in_cache)) { + return(in_cache) + } + relic <- make_relic(path, ref, repo, endpoint, region) + + relic_cache_path <- path(cache_sha(), relic$commit_sha, relic$path) + if (relic$type == "blob" && file_exists(relic_cache_path)) { + return(relic_cache_path) + } else { + extract_relic(relic, relic_cache_path) + } +} diff --git a/R/git_to_disk.R b/R/git_to_disk.R new file mode 100644 index 0000000..594817a --- /dev/null +++ b/R/git_to_disk.R @@ -0,0 +1,48 @@ +git_blob_to_file <- function(blob, to, mode) { + contents <- content(blob, split = FALSE, raw = TRUE) + write_with_mode(contents, to, mode) + to +} + +git_tree_to_dir <- function(tree, to, recurse, skip = TRUE) { + if (!is.numeric(recurse)) recurse <- (if (recurse) Inf else 0) + 1 + if (recurse > 0) { + for (i in seq_along(tree)) { + obj_i <- tree[i] + name_i <- tree$name[i] + path_i <- path(to, name_i) + if (is_tree(obj_i)) { + dir_create(path_i) + git_tree_to_dir(obj_i, path_i, recurse = recurse - 1) + } else if (is_blob(obj_i)) { + if (!skip || !file_exists(path_i)) { + git_blob_to_file(obj_i, path_i, tree$filemode[i]) + } + } else { + abort("Object is not a git blob or tree") + } + } + } + to +} + +write_with_mode <- function(contents, to, mode, read_only = TRUE) { + dir_create(path_dir(to)) + if (mode == 40960L) { + link_create(rawToChar(contents), to) + } else if (mode %in% c(33188L, 33261L)) { + writeBin(contents, to) + } else { + abort(paste0( + "Unknown blob mode: ", mode, + ", Must be one of 40960L, 33188L, 33261L. Submodules not yet supported." + )) + } + if (read_only) { + file_chmod(to, mode = "u-w") + } + if (mode == 33261L) { + file_chmod(to, mode = "+x") + } + to +} diff --git a/R/github_to_disk.R b/R/github_to_disk.R new file mode 100644 index 0000000..446d13c --- /dev/null +++ b/R/github_to_disk.R @@ -0,0 +1,56 @@ +github_blob_to_file <- function(path, to, mode, commit_sha, owner, repo) { + contents <- as.raw(gh("/repos/{owner}/{repo}/contents/{path}", + owner = owner, repo = repo, path = path, ref = commit_sha, + .accept = "application/vnd.github.v3.raw" + )) + write_with_mode(contents, to, mode) + to +} + +github_tree_to_dir <- function(path, tree, to, recurse, commit_sha, owner, repo, skip = TRUE) { + if (!is.numeric(recurse)) recurse <- (if (recurse) Inf else 0) + 1 + if (recurse > 0) { + for (i in seq_along(tree)) { + obj_i <- tree[[i]] + name_i <- obj_i$name + to_path_i <- path(to, name_i) + abs_path_i <- path(path, name_i) + if (obj_i$type == "tree") { + dir_create(to_path_i) + next_tree <- get_github_tree(obj_i$oid, owner, repo) + github_tree_to_dir(abs_path_i, next_tree, to_path_i, recurse = recurse - 1, commit_sha, owner, repo) + } else if (obj_i$type == "blob") { + if (!skip || !file_exists(to_path_i)) { + github_blob_to_file(abs_path_i, to_path_i, obj_i$mode, commit_sha, owner, repo) + } + } else { + abort("Object is not a git blob or tree") + } + } + } + to +} + + +get_github_tree <- function(oid, owner, repo) { + vars <- list(owner = owner, repo = repo, expression = oid) + gh_response <- gql(gh_tree_query(), variables = vars) + gh_response$data$repository$object$entries +} + +gh_tree_query <- function() { + "query ($owner: String!, $repo: String!, $expression: String!) { + repository(owner: $owner, name: $repo) { + object(expression: $expression) { + ... on Tree { + entries { + name + mode + type + oid + } + } + } + } + }" +} diff --git a/R/make_relic.R b/R/make_relic.R new file mode 100644 index 0000000..bf765e4 --- /dev/null +++ b/R/make_relic.R @@ -0,0 +1,130 @@ +make_relic <- function(path, ref, repo, ...) { + repo <- as_relic_repo(repo) + switch(class(repo), + relic_git_repo = make_relic_git(path, ref, repo), + relic_github_repo = make_relic_github(path, ref, repo), + relic_s3_repo = make_relic_s3(path, ref, repo, ...) + ) +} + +make_relic_git <- function(path, ref, repo) { + commit <- as_commit(ref, repo) + path <- path(path) + splits <- path_split(path)[[1]] + obj <- tree(commit) + if (path == ".") { + mode <- 16384L + type <- "tree" + } else { + for (i in seq_along(splits)) { + parent <- obj + obj <- obj[splits[i]] + if (rlang::is_empty(obj)) abort(paste0("File '", path, "' does not exist at '", ref, "' in ", repo)) + } + mode <- parent$filemode[parent$name == splits[i]] + type <- parent$type[parent$name == splits[i]] + } + structure(list( + commit_sha = commit$sha, + obj = obj, + path = path, + mode = mode, + type = type, + repo = structure(commit$repo$path, class = "relic_git_repo") + ), class = paste0("relic_git_", type)) +} + +make_relic_github <- function(path, ref, repo) { + vars <- list( + owner = repo$owner, + repo = repo$repo, + expression = ref, + path = path + ) + gh_response <- gql(query = gh_relic_query(), variables = vars) + structure(list( + commit_sha = gh_response$data$repository$object$oid, + obj = gh_response$data$repository$object$file$object$entries, + path = path(gh_response$data$repository$object$file$path), + mode = gh_response$data$repository$object$file$mode, + type = gh_response$data$repository$object$file$type, + repo = structure(list( + owner = gh_response$data$repository$owner$login, + repo = gh_response$data$repository$name + ), class = "relic_github_repo") + ), class = paste0("relic_github_", gh_response$data$repository$object$file$type)) +} + +# For now only fetching single items. Future might fetch multiple, allowing for +# version to be set by date +make_relic_s3 <- function(key, version, bucket, endpoint = NULL, region = NULL) { + check_installed("paws.storage") + s3 <- paws.storage::s3( + endpoint = endpoint, + region = region + ) + + versions <- list() + truncated <- TRUE + while (truncated) { + versions_resp <- s3$list_object_versions( + Bucket = bucket, + Prefix = key, + VersionIdMarker = if (!length(versions)) NULL else versions_resp$NextVersionIdMarker, + KeyMarker = if (!length(versions)) NULL else versions_resp$NextKeyMarker + ) + versions <- c(versions, versions_resp$Versions) + truncated <- versions_resp$IsTruncated + } + version_ids <- vapply(versions, function(x) x$VersionId, character(1)) + if (is.null(version) || + tolower(version) %in% c("latest", "current", "head", "null")) { + version_id <- version_ids[vapply(versions, function(x) x$IsLatest, logical(1))] + } else { + version_id <- version_ids[substr(version_ids, 1, nchar(version)) == version] + } + + structure(list( + path = key, + commit_sha = version_id, + bucket = bucket, + endpoint = endpoint, + region = region, + type = "blob" + ), class = "relic_s3_obj") +} + +gh_relic_query <- function() { + "query ($owner: String!, $repo: String!, $expression: String!, $path: String!) { + repository(owner: $owner, name: $repo) { + url + owner { + login + } + name + object(expression: $expression) { + ... on Commit { + oid + file(path: $path) { + path + name + mode + type + oid + object { + ... on Tree { + entries { + path + name + mode + type + oid + } + } + } + } + } + } + } + }" +} diff --git a/R/path_in_cache.R b/R/path_in_cache.R new file mode 100644 index 0000000..96b377d --- /dev/null +++ b/R/path_in_cache.R @@ -0,0 +1,37 @@ +#' Given a ref checks if it exists in the cache and returns the full SHA +#' @noRd +ref_in_cache <- function(ref) { + if (!is_scalar_character(ref)) { + abort("ref must be a single character string") + } + + if (nchar(ref) < 4) { + abort("ref must be at least 4 characters long") + } + + sha_dirs <- path_file(dir_ls(cache_sha(), type = "directory", recurse = FALSE)) + sha_dirs <- sha_dirs[substr(sha_dirs, 1, nchar(ref)) == ref] + + if (length(sha_dirs) > 1) { + abort("Ambiguous lookup: multiple matches found for ref ", ref, "in cache.") + } else if (length(sha_dirs) == 0) { + return(NA_character_) + } else { + return(sha_dirs) + } +} + +#' returns path to a file in the cache, and NA if it doesn't exist, vectorized. +#' Only works for files, not directories, which also return NA +#' @noRd +#' +path_in_cache <- function(path, ref) { + ref <- ref_in_cache(ref) + if (is.na(ref)) { + return(rep(NA_character_, length(path))) + } + paths <- path(cache_sha(), ref, path) + paths[!file_exists(paths) | is_dir(paths)] <- NA_character_ + + paths +} diff --git a/R/relic-package.R b/R/relic-package.R index 4759dfd..5787ec6 100644 --- a/R/relic-package.R +++ b/R/relic-package.R @@ -5,6 +5,11 @@ ## usethis namespace: start #' @import fs git2r -#' @importFrom rlang warn abort inform check_installed +#' @importFrom rlang warn abort inform check_installed is_scalar_character +#' @importFrom gh gh ## usethis namespace: end NULL + +.onLoad <- function(...) { + if (!is.infinite(relic_cache_cleanup_time())) relic_cache_regular_cleanup() +} diff --git a/R/relic.R b/R/relic.R deleted file mode 100644 index 3fc8763..0000000 --- a/R/relic.R +++ /dev/null @@ -1,27 +0,0 @@ -#' An internal S3 class of git objects with additional metadata -#' @param x an object -is_relic <- function(x) inherits(x, "relic") && !is.null(attr(x, "mode")) - -#' @export -#' @noRd -relic <- function(x, ...) { - UseMethod("relic") -} - -relic.git_tree <- function(x, i, ...) { - blob <- x[i] - if (!is_blob(blob)) abort("Object is not a git blob") - structure( - blob, - mode = if (is.integer(mode)) sprintf("%06o", mode) else mode, - class = c("relic", "git_blob") - ) -} - -relic.git_blob <- function(x, mode, ...) { - structure( - x, - mode = if (is.integer(mode)) sprintf("%06o", mode) else mode, - class = c("relic", "git_blob") - ) -} diff --git a/R/s3_to_disk.R b/R/s3_to_disk.R new file mode 100644 index 0000000..0af716a --- /dev/null +++ b/R/s3_to_disk.R @@ -0,0 +1,6 @@ +s3_obj_to_file <- function(key, version_id, bucket, to, endpoint = NULL, region = NULL) { + s3 <- paws.storage::s3(endpoint = endpoint, region = region) + dir_create(path_dir(to)) + s3$download_file(Bucket = bucket, Key = key, Filename = to, VersionId = version_id) + to +} diff --git a/R/tar_exists_version.R b/R/tar_exists_version.R deleted file mode 100644 index 3714616..0000000 --- a/R/tar_exists_version.R +++ /dev/null @@ -1,24 +0,0 @@ -#' Check if a target exists in a version of the pipeline -#' -#' @inheritParams tar_read_version -#' @export -tar_exists_version <- function(name, ref = "HEAD", branches = NULL, repo = ".", store = targets::tar_path_store()) { - check_installed("targets") - name <- targets::tar_deparse_language(substitute(name)) - tar_exists_version_raw(name, ref, branches, repo, store) -} - -#' @export -#' @rdname tar_exists_version -tar_exists_version_raw <- function(name, ref = "HEAD", branches = NULL, repo = ".", store = targets::tar_path_store()) { - check_installed("targets") - repo <- as_repo(repo) - ref <- as_commit(ref, repo) - path_store <- file_copy_version(store, ref, repo = repo, recurse = FALSE) - meta <- tar_meta_version(ref = ref, store = path_store) - target_meta <- meta[meta$name == name, ] - if (!nrow(target_meta)) { - return(FALSE) - } - TRUE -} diff --git a/R/tar_meta_version.R b/R/tar_meta_version.R index 6aebd9c..8e03626 100644 --- a/R/tar_meta_version.R +++ b/R/tar_meta_version.R @@ -1,15 +1,31 @@ -#' Read a target project's metadata from git history +#' Read a target project's metadata from repository history +#' +#' This function extracts [targets metadata][targets::tar_meta()] from versioned history. +#' In most cases this is a git repository, but it can also be an S3 cloud bucket +#' for a project using cloud versioning and storing the metadata file in the cloud. +#' (See `repository_meta` in [targets::tar_option_set()]). #' -#' @export #' @param ref A git commit SHA, tag, branch or other [revision #' string](https://git-scm.com/docs/git-rev-parse.html#_specifying_revisions): -#' such as "HEAD~1", or a [git2r::commit()] object. Defaults to "HEAD". -#' @param store Path to the targets store within the project. Defaults to the current project's store. -#' @param repo The path to the git repository or a [git2r::repository()] object. +#' such as "HEAD~1", "main@{2023-02-26 18:30:00}", or "branch@{yesterday}". +#' Can also be a [git2r::commit()] object. For S3 buckets, a VersionID. +#' Defaults to "HEAD", which also means the latest version in an S3 bucket. +#' @param store Path to the targets store within the project. Defaults to `"_targets`, or the current project's [store name][targets::tar_path_store()] if `repo = "."`. +#' @param repo The repository to get the file from. This can be a local git +#' directory, a GitHub repository (URL or "owner/repo" string), or an S3 bucket +#' indicated by "s3://bucket-name". Defaults to the current working directory. +#' @param endpoint,region For S3 buckets, the endpoint and region of the bucket. #' @param ... Arguments passed to [targets::tar_meta()] -tar_meta_version <- function(ref = "HEAD", ..., store = targets::tar_path_store(), repo = ".") { +#' @return A data frame with one row per target/object. See [targets::tar_meta()] for details. +#' @export +tar_meta_version <- function(ref = "HEAD", store = NULL, repo = ".", endpoint = NULL, region = NULL, ...) { check_installed("targets") - path_meta <- file_copy_version(path(store, "meta", "meta"), ref, repo = repo) - meta <- targets::tar_meta(..., store = store) + if (is.null(store)) { + store <- if (repo == ".") targets::tar_config_get("store") else "_targets" + } + metafile <- get_file_version(path(store, "meta", "meta"), ref, repo = repo, endpoint, region) + path_store <- path_dir(path_dir(metafile)) + meta <- targets::tar_meta(..., store = path_store) + attr(meta, "path_store") <- path_store meta } diff --git a/R/tar_read_version.R b/R/tar_read_version.R index 333beb4..7ccd6c4 100644 --- a/R/tar_read_version.R +++ b/R/tar_read_version.R @@ -7,74 +7,96 @@ #' #' For cloud targets, the target metadata is read from git history and then #' this metadata is used to download the target from the cloud. For this to work, -#' cloud storage must be set up with versioning. Note that the cloud configuration -#' will use same bucket/endpoint/credentials set in the _current_ environment. +#' cloud storage must be set up with versioning. Note that targets metadata +#' includes the bucket, endpoint, and region of a S3-stored target, but you must still +#' provide an AWS access key and secret as environment variables. If these differ +#' from the credentials used for your current project or environment, can use +#' [withr::with_envvar()] to temporarily set the credentials. #' #' @param name Name of the target. `tar_read_version()` can take a symbol, `tar_read_raw_version()` requires a character. -#' @param branches Integer of indices of the (targets) branches to load if the target is a pattern. -#' @param store Path to the targets store within the project. Defaults to the current project's store. -#' @inheritParams file_read_version +#' @param ref A git commit SHA, tag, branch or other [revision +#' string](https://git-scm.com/docs/git-rev-parse.html#_specifying_revisions): +#' such as "HEAD~1", "main@{2023-02-26 18:30:00}", or "branch@{yesterday}". +#' Can also be a [git2r::commit()] object. For S3 buckets, a VersionID. +#' Defaults to "HEAD", which also means the latest version in an S3 bucket. +#' @param store Path to the targets store within the project. Defaults to `"_targets`, or the current project's [store name][targets::tar_path_store()] if `repo = "."`. +#' @param repo The repository to get the file from. This can be a local git +#' directory, a GitHub repository (URL or "owner/repo" string), or an S3 bucket +#' indicated by "s3://bucket-name". Defaults to the current working directory. #' @export -#' @return The target's return values, loaded files in the git file/⁠, or the paths to the custom files and directories if format = "file" was set. -#' If the target is not found at the commit, NULL is returned. See [tar_exists_version()] to check for the presence of a target. -tar_read_version <- function(name, ref = "HEAD", branches = NULL, extract_files = TRUE, repo = ".", store = targets::tar_path_store()) { +#' @return The target's value. If the target is of `format = "file"`, this will be the path to the file in the [relic cache][relic_cache()]. +tar_read_version <- function(name, ref = "HEAD", repo = ".", store = targets::tar_path_store()) { check_installed("targets") name <- targets::tar_deparse_language(substitute(name)) - tar_read_raw_version(name, branches, ref, extract_files, repo, store) -} - -#' @export -#' @inheritParams commits_between -#' @rdname tar_read_version -tar_read_versions <- function(name, from = "HEAD", to = NULL, branches = NULL, extract_files = TRUE, repo = ".", store = targets::tar_path_store()) { - check_installed("targets") - name <- targets::tar_deparse_language(substitute(name)) - tar_read_raw_versions(name, from, to, branches, extract_files, repo, store) + tar_read_raw_version(name, ref, repo, store) } #' @rdname tar_read_version -#' @param extract_files If TRUE, targets of type "file" will be extracted to a temporary directory and these paths will be returned. If FALSE, the paths as stored in the target will be returned unmodified #' @export -tar_read_raw_version <- function(name, ref = "HEAD", branches = NULL, extract_files = TRUE, repo = ".", store = targets::tar_path_store()) { +tar_read_raw_version <- function(name, ref = "HEAD", repo = ".", store = NULL) { check_installed("targets") - repo <- as_repo(repo) - ref <- as_commit(ref, repo) - path_store <- file_copy_version(store, ref, repo = repo, recurse = FALSE) - meta <- tar_meta_version(ref = ref, store = path_store) - target_meta <- meta[meta$name == name, ] - if (!nrow(target_meta)) { - return(NULL) + meta <- tar_meta_version(ref = ref, store = store, repo = repo) + path_store <- attr(meta, "path_store") + index <- meta$name == name + if (!any(index) || sum(index) > 1) { + abort("Target '", name, "' not found in the targets store (or multiple found).") } - # For local targets - if (target_meta$repository == "local") { - if (!target_meta$format == "file") { - if (target_meta$type == "pattern") { - file_copy_version(path(targets::tar_path_objects_dir(path_store), target_meta$children[[1]]), ref, repo = repo) - } else { - file_copy_version(path(targets::tar_path_objects_dir(path_store), name), ref, repo = repo) - } - target <- targets::tar_read_raw(name, meta = meta, store = path_store) - } else if (target_meta$format == "file") { - file_path <- targets::tar_read_raw(name, meta = meta, store = path_store) - if (extract_files) { - target <- file_copy_version(file_path, ref, repo = repo) - } else { - target <- file_path - } - } - } else { - targets <- targets::tar_read_raw(name, meta = meta, store = path_store) + record <- meta[max(which(index)), , drop = FALSE] + + if (record$type == "pattern") { + abort("Branched targets not supported yet") } + + target <- switch(record$repository, + local = read_target_aws(record, path_store = path_store), + aws = read_target_aws(record, path_store = path_store), + abort("Unknown targets repository type: ", record$repository) + ) + target } -#' @rdname tar_read_version -#' @export -tar_read_raw_versions <- function(name, from = "HEAD", to = NULL, branches = NULL, extract_files = TRUE, repo = ".", store = targets::tar_path_store()) { - check_installed("targets") - repo <- as_repo(repo) - commits <- commits_between(from, to, repo) - versions <- lapply(commits, \(x) tar_read_raw_version(name, x, branches, extract_files, repo, store)) - names(versions) <- vapply(commits, sha, character(1)) - versions +read_target_aws <- function(record, path_store) { + aws_loc <- aws_loc_from_meta_path(record$path[[1]]) + local_target_path <- get_file_version( + path = aws_loc$key, ref = aws_loc$version, + repo = paste0("s3://", aws_loc$bucket), + endpoint = aws_loc$endpoint, region = aws_loc$region + ) + record_local <- record + record_local$path <- list(local_target_path) + record_local$repository <- "local" + targets::tar_read_raw(record_local$name, + meta = record_local, + store = path_dir(path_dir(local_target_path)) + ) +} + +read_target_local <- function(record, path_store) { + # For local targets + local_target_path <- get_file_version( + path = record$path[[1]], ref = record$version, + repo = record$repository + ) + record_local <- record + record_local$path <- list(local_target_path) + + targets::tar_read_raw(record_local$name, + meta = record_local, + store = path_store + ) +} + +aws_loc_from_meta_path <- function(path) { + splits <- strsplit(path, "=") + aws_loc <- structure(lapply(splits, function(x) x[[2]]), + .Names = vapply(splits, function(x) x[[1]], character(1)) + ) + if (!is.null(aws_loc$endpoint)) { + aws_loc$endpoint <- rawToChar(openssl::base64_decode(aws_loc$endpoint)) + if (aws_loc$endpoint == "NUL") { + aws_loc["endpoint"] <- list(NULL) + } + } + aws_loc } diff --git a/R/utils.R b/R/utils.R index 27edde4..11ea521 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,50 +1,47 @@ -#' The relic cache directory -#' -#' Get the relic cache directory, which can be specified as either the R option -#' `relic.cache.dir` or the environment variable `RELIC_CACHE_DIR`. (The -#' environment variable has higher priority). If neither -#' is set, the default is set by tools::R_user_dir("relic", "cache"). -#' @export -#' @return The path to the relic cache directory -#' @examples -#' relic_cache() -relic_cache <- function() { - Sys.getenv( - "RELIC_CACHE_DIR", - getOption("relic.cache.dir", - default = tools::R_user_dir("relic", "cache") - ) - ) -} - -#' @export -#' @rdname relic_cache -relic_cache_clear <- function() { - dir_delete(relic_cache()) -} - - -#' Wrappers to convert objects to git2r objects but allow git2r objects to pass though +#' Wrappers to convert objects to git2r objects but allow git2r objects to pass +#' though #' @noRd -as_repo <- function(x) { +as_relic_repo <- function(x) { if (inherits(x, "git_repository")) { - x + repo <- structure(path(x$path), class = "relic_git_repo") + } else if (is.character(x) && dir_exists(x)) { + repo <- structure(path(discover_repository(x)), class = "relic_git_repo") + } else if (!is.null(gh_repo <- github_owner_repo(x))) { + repo <- structure(gh_repo, class = "relic_github_repo") + } else if (is.character(x) && substr(x, 1, 5) == "s3://") { + repo <- structure(substr(x, 6, nchar(x)), class = "relic_s3_repo") } else { - repository(x) + abort("Unable to find repository for ", x) } + return(repo) } +github_owner_repo <- function(url) { + gh_regex <- "^(git@github.com:|https?://github.com/)?(?[^/]+)/(?[^/]+)(\\.git|/.*)?$" + owner_repo <- regmatches(url, regexec(gh_regex, url, perl = TRUE))[[1]][c("owner", "repo")] + if (any(is.na(owner_repo))) { + return(NULL) + } + as.list(owner_repo) +} + + #' @rdname as_repo #' @noRd as_commit <- function(x, repo = ".") { if (is_commit(x)) { x } else if (is.character(x)) { - repo <- as_repo(repo) revparse_single(repo, x) } else { commit(x) } } -is_none <- function(x) length(x) == 0 +gql <- function(query, ...) { + gh( + endpoint = "POST /graphql", query = query, + .send_headers = c("X-Github-Next-Global-ID" = "1"), + ... + ) +} diff --git a/inst/WORDLIST b/inst/WORDLIST index 58e7392..0d821b6 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -1,5 +1,6 @@ CMD codecov +config csv dir doltr @@ -13,9 +14,11 @@ MinIO ORCID pkgcheck recurse +repo retarting revwalk scm SHA submodule symlink +VersionID diff --git a/inst/cache_timestamp.rds b/inst/cache_timestamp.rds new file mode 100644 index 0000000000000000000000000000000000000000..b71bc3ecbc33e5fd5f1710d9f8d356792b76f4c6 GIT binary patch literal 95 zcmb2|=3oE==I#ec2?+^l35jVb32CfGk`d0%cS>{{l4>0I7C%1g!T(Y3m=hbDn3}-+ w#@0sRB^s9M8+S;Y5MW4BTcJ?hP`@GFrHFINX|`w67$W~OuygK_O9h$>0J~EkzyJUM literal 0 HcmV?d00001 diff --git a/man/commits_between.Rd b/man/commits_between.Rd deleted file mode 100644 index 3250d67..0000000 --- a/man/commits_between.Rd +++ /dev/null @@ -1,38 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/commits_between.R -\name{commits_between} -\alias{commits_between} -\title{List commits between two git commits} -\usage{ -commits_between(from, to = NULL, filter_file = NULL, repo = ".") -} -\arguments{ -\item{from}{A commit object or revision string. A string of the form -'from...to' may also be passed, in which case the commits between the two -revision strings are used and the \code{to} argument is ignored. If a list of commits -is passed, these commits are used rather than calculating the commits between.} - -\item{to}{A commit object or reference} - -\item{filter_file}{The path to a file relative to the git directory. If not NULL, -only commits modifying this file will be returned. Note that modifying -commits that occurred before the file was given its present name are not -returned.} - -\item{repo}{The path to the git repository} -} -\value{ -A list of commits -} -\description{ -Given two commit objects, find the commits between them. -} -\details{ -"Between" is bit of a slippery concept in git, since there may be multiple -paths between two commits. Internally it uses \code{\link[git2r:commits]{git2r::commits()}} which, in -turn using the \href{https://libgit2.org/libgit2/#HEAD/group/revwalk}{revwalk API}, which is similar to -\href{https://git-scm.com/docs/git-rev-list}{git rev-list}. Essentially, it walks -back from the \code{to/from} commit (whichever is the descendant) until it reaches -the other. If there are multiple paths between the two commits, it will -return the commits on all paths. -} diff --git a/man/create_example_repo.Rd b/man/create_example_repo.Rd index 0febe5e..0592062 100644 --- a/man/create_example_repo.Rd +++ b/man/create_example_repo.Rd @@ -29,10 +29,10 @@ This function generates a repository with a commit history that can be used for testing and examples. } \examples{ -\dontshow{if (rlang::is_installed("targets")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +\dontshow{if (rlang::is_installed("targets") && rlang::is_interactive()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} - example_repo <- create_example_repo(s3 = FALSE) - fs::dir_ls(example_repo, all = TRUE) - dir_ls_version(".", "initial-target-file", repo = example_repo) +example_repo <- create_example_repo(s3 = FALSE) +fs::dir_ls(example_repo, all = TRUE) +dir_ls_version(".", "initial-target-file", repo = example_repo) \dontshow{\}) # examplesIf} } diff --git a/man/dir_ls_version.Rd b/man/dir_ls_version.Rd deleted file mode 100644 index 0002078..0000000 --- a/man/dir_ls_version.Rd +++ /dev/null @@ -1,65 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dir_ls_version.R -\name{dir_ls_version} -\alias{dir_ls_version} -\alias{dir_ls_versions} -\title{List files and folders in a git repository} -\usage{ -dir_ls_version( - path = ".", - ref = "HEAD", - all = FALSE, - recurse = FALSE, - type = "any", - regexp = NULL, - glob = NULL, - invert = FALSE, - repo = "." -) - -dir_ls_versions( - path, - from = "HEAD", - to = NULL, - all = FALSE, - recurse = FALSE, - type = "any", - regexp = NULL, - glob = NULL, - repo = "." -) -} -\arguments{ -\item{path}{A path or vector of paths of files and/or folders to extract, -relative to the git directory} - -\item{ref}{A git commit SHA, tag, branch or other \href{https://git-scm.com/docs/git-rev-parse.html#_specifying_revisions}{revision string}: -such as "HEAD~1", or a \code{\link[git2r:commit]{git2r::commit()}} object. Defaults to "HEAD".} - -\item{all}{If TRUE hidden files are also returned} - -\item{recurse}{If TRUE recurse fully, if a positive number the number of levels to recurse} - -\item{type}{One or more of "any", "file", "directory", "symlink", or "submodule"} - -\item{regexp}{A regular expression (e.g. '\\.csv$'⁠) passed on to grep() to filter paths.} - -\item{glob}{A wildcard aka globbing pattern (e.g. '*.csv'⁠) passed on to grep() to filter paths} - -\item{invert}{If TRUE, return paths that do not match the pattern or glob} - -\item{repo}{The path to the git repository or a \code{\link[git2r:repository]{git2r::repository()}} object.} - -\item{from}{A commit object or revision string. A string of the form -'from...to' may also be passed, in which case the commits between the two -revision strings are used and the \code{to} argument is ignored. If a list of commits -is passed, these commits are used rather than calculating the commits between.} - -\item{to}{A commit object or reference} -} -\value{ -A character vector of paths -} -\description{ -List files and folders in a git repository -} diff --git a/man/file_copy_version.Rd b/man/file_copy_version.Rd deleted file mode 100644 index 92efce6..0000000 --- a/man/file_copy_version.Rd +++ /dev/null @@ -1,78 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/file_copy_version.R -\name{file_copy_version} -\alias{file_copy_version} -\alias{file_copy_versions} -\title{Copy files from git history} -\usage{ -file_copy_version( - path, - ref = "HEAD", - name = NULL, - dir = NULL, - full_name = NULL, - recurse = TRUE, - repo = ".", - use_cache = TRUE -) - -file_copy_versions( - path, - from = "HEAD", - to = NULL, - name = NULL, - dir = NULL, - full_name = NULL, - recurse = TRUE, - repo = ".", - use_cache = TRUE -) -} -\arguments{ -\item{path}{A path or vector of paths of files and/or folders to extract, -relative to the git directory} - -\item{ref}{A git commit SHA, tag, branch or other \href{https://git-scm.com/docs/git-rev-parse.html#_specifying_revisions}{revision string}: -such as "HEAD~1", or a \code{\link[git2r:commit]{git2r::commit()}} object. Defaults to "HEAD".} - -\item{name}{the name to give the file when it is copied to disk. Defaults to -the original name of the file or directory.} - -\item{dir}{The path to the directory where the file should be copied. -Defaults to a system-level temporary directory named after the SHA of the -git commit. Will be created if does not exist.} - -\item{full_name}{If TRUE, the file will be written to the full relative path -of its location in the git repository, below \code{dir}. Defaults to TRUE if -both \code{name} and \code{dir} are NULL, FALSE otherwise.} - -\item{recurse}{If \code{path} is a directory, should should it be copied recursively? If -numeric, how many levels deep should it be copied? Defaults to TRUE.} - -\item{repo}{The path to the git repository or a \code{\link[git2r:repository]{git2r::repository()}} object.} - -\item{use_cache}{Should the cache be used? Defaults to TRUE.} - -\item{from}{A commit object or revision string. A string of the form -'from...to' may also be passed, in which case the commits between the two -revision strings are used and the \code{to} argument is ignored. If a list of commits -is passed, these commits are used rather than calculating the commits between.} - -\item{to}{A commit object or reference} -} -\value{ -\itemize{ -\item For \code{file_copy_version()}, A character vector of paths to the copied files or directories. -\item For \code{file_copy_versions()}, a list of character vectors of paths. -Where files do not exist in a commit, \code{NA} values are returned. -} -} -\description{ -Copy files from git history to disk. Directories are copied recursively. -} -\details{ -The default behavior is to copy the file to a cache directory named after -both the commit and and the file path. These paths are universally unique, -so if \code{use_cache = TRUE}, the file will not be re-copied if it already exists -at that path. -} diff --git a/man/file_read_version.Rd b/man/file_read_version.Rd deleted file mode 100644 index a91c3df..0000000 --- a/man/file_read_version.Rd +++ /dev/null @@ -1,37 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/file_read_version.R -\name{file_read_version} -\alias{file_read_version} -\alias{file_read_versions} -\title{Read in files from git history} -\usage{ -file_read_version(path, ref = "HEAD", repo = ".") - -file_read_versions(path, from = "HEAD", to = NULL, repo = ".") -} -\arguments{ -\item{path}{A path or vector of paths of files and/or folders to extract, -relative to the git directory} - -\item{ref}{A git commit SHA, tag, branch or other \href{https://git-scm.com/docs/git-rev-parse.html#_specifying_revisions}{revision string}: -such as "HEAD~1", or a \code{\link[git2r:commit]{git2r::commit()}} object. Defaults to "HEAD".} - -\item{repo}{The path to the git repository or a \code{\link[git2r:repository]{git2r::repository()}} object.} - -\item{from}{A commit object or revision string. A string of the form -'from...to' may also be passed, in which case the commits between the two -revision strings are used and the \code{to} argument is ignored. If a list of commits -is passed, these commits are used rather than calculating the commits between.} - -\item{to}{A commit object or reference} -} -\value{ -\itemize{ -\item For \code{file_read_version()}, a list containing the file contents of each path, named by the path. Each will be a length-1 character vector, or raw vector for binary files. -\item For \code{file_read_versions()}, a list list of lists, named by the commit SHA. -} -} -\description{ -Reads the contents of a file or files from a git repository into memory. If -the file does not exist, or is a directory, NULL will be returned. -} diff --git a/man/get_file_version.Rd b/man/get_file_version.Rd new file mode 100644 index 0000000..e0e9fdb --- /dev/null +++ b/man/get_file_version.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_file_version.R +\name{get_file_version} +\alias{get_file_version} +\title{Get in files from versioned repository} +\usage{ +get_file_version( + path, + ref = "HEAD", + repo = ".", + endpoint = NULL, + region = NULL +) +} +\arguments{ +\item{path}{Path of a file relative to the git directory or bucket} + +\item{ref}{A git commit SHA, tag, branch or other \href{https://git-scm.com/docs/git-rev-parse.html#_specifying_revisions}{revision string}: +such as "HEAD~1", "main@{2023-02-26 18:30:00}", or "branch@{yesterday}". +Can also be a \code{\link[git2r:commit]{git2r::commit()}} object. For S3 buckets, a VersionID. +Defaults to "HEAD", which also means the latest version in an S3 bucket.} + +\item{repo}{The repository to get the file from. This can be a local git +directory, a GitHub repository (URL or "owner/repo" string), or an S3 bucket +indicated by "s3://bucket-name". Defaults to the current working directory.} + +\item{endpoint, region}{For S3 buckets, the endpoint and region of the bucket. +If NULL, the default endpoint and region in local config or environment variables are used. +(Usually \code{us-east-1} and \code{s3.amazonaws.com}.)} +} +\value{ +A \link[fs:path]{path} to the file in the local cache. +} +\description{ +Fetches the contents of a file from a versioned repository - a local git +repository, a GitHub repository, or an S3 bucket. Always fetches the file +to a local cache and returns the path to it. +} diff --git a/man/is_relic.Rd b/man/is_relic.Rd deleted file mode 100644 index 707f9cd..0000000 --- a/man/is_relic.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/relic.R -\name{is_relic} -\alias{is_relic} -\title{An internal S3 class of git objects with additional metadata} -\usage{ -is_relic(x) -} -\arguments{ -\item{x}{an object} -} -\description{ -An internal S3 class of git objects with additional metadata -} diff --git a/man/relic_cache.Rd b/man/relic_cache.Rd index a115da1..20eb4e2 100644 --- a/man/relic_cache.Rd +++ b/man/relic_cache.Rd @@ -1,22 +1,62 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R +% Please edit documentation in R/cache.R \name{relic_cache} \alias{relic_cache} -\alias{relic_cache_clear} -\title{The relic cache directory} +\alias{relic_cache_delete} +\alias{relic_cache_cleanup} +\alias{relic_cache_regular_cleanup} +\alias{relic_cache_max_size} +\alias{relic_cache_max_age} +\alias{relic_cache_cleanup_time} +\title{The relic cache} \usage{ relic_cache() -relic_cache_clear() +relic_cache_delete() + +relic_cache_cleanup( + max_age = relic_cache_max_age(), + max_size = relic_cache_max_size() +) + +relic_cache_regular_cleanup(cleanup_time = relic_cache_cleanup_time()) + +relic_cache_max_size() + +relic_cache_max_age() + +relic_cache_cleanup_time() +} +\arguments{ +\item{max_age}{The maximum age of files to keep in the cache, as a \link[base:difftime]{difftime} +object. Files older than this will be deleted. Defaults to \code{Inf}. Can be +set with the environment variable \code{RELIC_CACHE_MAX_AGE} or +\code{options("relic.cache.max.age")}, which take numeric time in days or a +string with units, e.g., "1 day" or "2 weeks".} + +\item{max_size}{The maximum size of the cache, as a string that can be parsed +by \code{\link[fs:fs_bytes]{fs::fs_bytes()}}. Defaults to "20 MB". Can be set with the environment +variable \code{RELIC_CACHE_MAX_SIZE} or \code{options("relic.cache.max.size")}. +Cached files will be deleted from oldest to youngest until the cache size +is under this limit.} + +\item{cleanup_time}{The time between cache cleanups, as a \link[base:difftime]{difftime} object. +Defaults to 1 day. Can be set with the environment variable +\code{RELIC_CACHE_CLEANUP_TIME} or \code{options("relic.cache.cleanup.time")}, which +take numeric time in days or a string with units, e.g., "1 day" or "2 +weeks". If set to "Inf", no cleanup will be performed at startup.} } \value{ The path to the relic cache directory } \description{ -Get the relic cache directory, which can be specified as either the R option -\code{relic.cache.dir} or the environment variable \code{RELIC_CACHE_DIR}. (The -environment variable has higher priority). If neither -is set, the default is set by tools::R_user_dir("relic", "cache"). +The relic cache directory stores files that have been retrieved +from both local and remote repositories to avoid repeated extractions or +downloads. Its location can be set with the environment variable +\code{RELIC_CACHE_DIR} or \code{options("relic.cache.dir")}, and it defaults to the +user cache directory. The cache is cleaned up regularly at package startup, +but can also be cleaned up manually with \code{relic_cache_cleanup()} or cleared +entirely with \code{relic_cache_delete()}. } \examples{ relic_cache() diff --git a/man/tar_exists_version.Rd b/man/tar_exists_version.Rd deleted file mode 100644 index 8348e34..0000000 --- a/man/tar_exists_version.Rd +++ /dev/null @@ -1,38 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/tar_exists_version.R -\name{tar_exists_version} -\alias{tar_exists_version} -\alias{tar_exists_version_raw} -\title{Check if a target exists in a version of the pipeline} -\usage{ -tar_exists_version( - name, - ref = "HEAD", - branches = NULL, - repo = ".", - store = targets::tar_path_store() -) - -tar_exists_version_raw( - name, - ref = "HEAD", - branches = NULL, - repo = ".", - store = targets::tar_path_store() -) -} -\arguments{ -\item{name}{Name of the target. \code{tar_read_version()} can take a symbol, \code{tar_read_raw_version()} requires a character.} - -\item{ref}{A git commit SHA, tag, branch or other \href{https://git-scm.com/docs/git-rev-parse.html#_specifying_revisions}{revision string}: -such as "HEAD~1", or a \code{\link[git2r:commit]{git2r::commit()}} object. Defaults to "HEAD".} - -\item{branches}{Integer of indices of the (targets) branches to load if the target is a pattern.} - -\item{repo}{The path to the git repository or a \code{\link[git2r:repository]{git2r::repository()}} object.} - -\item{store}{Path to the targets store within the project. Defaults to the current project's store.} -} -\description{ -Check if a target exists in a version of the pipeline -} diff --git a/man/tar_meta_version.Rd b/man/tar_meta_version.Rd index 70da2a5..00e999e 100644 --- a/man/tar_meta_version.Rd +++ b/man/tar_meta_version.Rd @@ -2,25 +2,39 @@ % Please edit documentation in R/tar_meta_version.R \name{tar_meta_version} \alias{tar_meta_version} -\title{Read a target project's metadata from git history} +\title{Read a target project's metadata from repository history} \usage{ tar_meta_version( ref = "HEAD", - ..., - store = targets::tar_path_store(), - repo = "." + store = NULL, + repo = ".", + endpoint = NULL, + region = NULL, + ... ) } \arguments{ \item{ref}{A git commit SHA, tag, branch or other \href{https://git-scm.com/docs/git-rev-parse.html#_specifying_revisions}{revision string}: -such as "HEAD~1", or a \code{\link[git2r:commit]{git2r::commit()}} object. Defaults to "HEAD".} +such as "HEAD~1", "main@{2023-02-26 18:30:00}", or "branch@{yesterday}". +Can also be a \code{\link[git2r:commit]{git2r::commit()}} object. For S3 buckets, a VersionID. +Defaults to "HEAD", which also means the latest version in an S3 bucket.} -\item{...}{Arguments passed to \code{\link[targets:tar_meta]{targets::tar_meta()}}} +\item{store}{Path to the targets store within the project. Defaults to \verb{"_targets}, or the current project's \link[targets:tar_path_store]{store name} if \code{repo = "."}.} + +\item{repo}{The repository to get the file from. This can be a local git +directory, a GitHub repository (URL or "owner/repo" string), or an S3 bucket +indicated by "s3://bucket-name". Defaults to the current working directory.} -\item{store}{Path to the targets store within the project. Defaults to the current project's store.} +\item{endpoint, region}{For S3 buckets, the endpoint and region of the bucket.} -\item{repo}{The path to the git repository or a \code{\link[git2r:repository]{git2r::repository()}} object.} +\item{...}{Arguments passed to \code{\link[targets:tar_meta]{targets::tar_meta()}}} +} +\value{ +A data frame with one row per target/object. See \code{\link[targets:tar_meta]{targets::tar_meta()}} for details. } \description{ -Read a target project's metadata from git history +This function extracts \link[targets:tar_meta]{targets metadata} from versioned history. +In most cases this is a git repository, but it can also be an S3 cloud bucket +for a project using cloud versioning and storing the metadata file in the cloud. +(See \code{repository_meta} in \code{\link[targets:tar_option_set]{targets::tar_option_set()}}). } diff --git a/man/tar_read_version.Rd b/man/tar_read_version.Rd index 9ac1da1..b3dcf03 100644 --- a/man/tar_read_version.Rd +++ b/man/tar_read_version.Rd @@ -2,73 +2,34 @@ % Please edit documentation in R/tar_read_version.R \name{tar_read_version} \alias{tar_read_version} -\alias{tar_read_versions} \alias{tar_read_raw_version} -\alias{tar_read_raw_versions} \title{Read a target's value from a git repository} \usage{ tar_read_version( name, ref = "HEAD", - branches = NULL, - extract_files = TRUE, repo = ".", store = targets::tar_path_store() ) -tar_read_versions( - name, - from = "HEAD", - to = NULL, - branches = NULL, - extract_files = TRUE, - repo = ".", - store = targets::tar_path_store() -) - -tar_read_raw_version( - name, - ref = "HEAD", - branches = NULL, - extract_files = TRUE, - repo = ".", - store = targets::tar_path_store() -) - -tar_read_raw_versions( - name, - from = "HEAD", - to = NULL, - branches = NULL, - extract_files = TRUE, - repo = ".", - store = targets::tar_path_store() -) +tar_read_raw_version(name, ref = "HEAD", repo = ".", store = NULL) } \arguments{ \item{name}{Name of the target. \code{tar_read_version()} can take a symbol, \code{tar_read_raw_version()} requires a character.} \item{ref}{A git commit SHA, tag, branch or other \href{https://git-scm.com/docs/git-rev-parse.html#_specifying_revisions}{revision string}: -such as "HEAD~1", or a \code{\link[git2r:commit]{git2r::commit()}} object. Defaults to "HEAD".} - -\item{branches}{Integer of indices of the (targets) branches to load if the target is a pattern.} - -\item{extract_files}{If TRUE, targets of type "file" will be extracted to a temporary directory and these paths will be returned. If FALSE, the paths as stored in the target will be returned unmodified} - -\item{repo}{The path to the git repository or a \code{\link[git2r:repository]{git2r::repository()}} object.} - -\item{store}{Path to the targets store within the project. Defaults to the current project's store.} +such as "HEAD~1", "main@{2023-02-26 18:30:00}", or "branch@{yesterday}". +Can also be a \code{\link[git2r:commit]{git2r::commit()}} object. For S3 buckets, a VersionID. +Defaults to "HEAD", which also means the latest version in an S3 bucket.} -\item{from}{A commit object or revision string. A string of the form -'from...to' may also be passed, in which case the commits between the two -revision strings are used and the \code{to} argument is ignored. If a list of commits -is passed, these commits are used rather than calculating the commits between.} +\item{repo}{The repository to get the file from. This can be a local git +directory, a GitHub repository (URL or "owner/repo" string), or an S3 bucket +indicated by "s3://bucket-name". Defaults to the current working directory.} -\item{to}{A commit object or reference} +\item{store}{Path to the targets store within the project. Defaults to \verb{"_targets}, or the current project's \link[targets:tar_path_store]{store name} if \code{repo = "."}.} } \value{ -The target's return values, loaded files in the git file/⁠, or the paths to the custom files and directories if format = "file" was set. -If the target is not found at the commit, NULL is returned. See \code{\link[=tar_exists_version]{tar_exists_version()}} to check for the presence of a target. +The target's value. If the target is of \code{format = "file"}, this will be the path to the file in the \link[=relic_cache]{relic cache}. } \description{ Reads the content of targets from a git repository. Target metadata and @@ -79,6 +40,9 @@ extracted and the paths to the extracted files are returned. \details{ For cloud targets, the target metadata is read from git history and then this metadata is used to download the target from the cloud. For this to work, -cloud storage must be set up with versioning. Note that the cloud configuration -will use same bucket/endpoint/credentials set in the \emph{current} environment. +cloud storage must be set up with versioning. Note that targets metadata +includes the bucket, endpoint, and region of a S3-stored target, but you must still +provide an AWS access key and secret as environment variables. If these differ +from the credentials used for your current project or environment, can use +\code{\link[withr:with_envvar]{withr::with_envvar()}} to temporarily set the credentials. } diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R new file mode 100644 index 0000000..15bb856 --- /dev/null +++ b/tests/testthat/helper.R @@ -0,0 +1,24 @@ +test_package_root <- function() { + x <- tryCatch( + rprojroot::find_package_root_file(), + error = function(e) NULL + ) + + if (!is.null(x)) { + return(x) + } + + pkg <- testthat::testing_package() + x <- tryCatch( + rprojroot::find_package_root_file( + path = file.path("..", "..", "00_pkg_src", pkg) + ), + error = function(e) NULL + ) + + if (!is.null(x)) { + return(x) + } + + stop("Cannot find package root") +} diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R index 2cbfc79..0e2fe28 100644 --- a/tests/testthat/setup.R +++ b/tests/testthat/setup.R @@ -1,16 +1,15 @@ -Sys.setenv("RELIC_TEST_S3"="true") - withr::local_envvar( - "R_USER_CACHE_DIR" = tempdir()) + "R_USER_CACHE_DIR" = tempdir() +) ## Run a MinIO server in the background to test S3 object storage with `targets` -if(nzchar(Sys.getenv("RELIC_TEST_S3"))) { - s3_dir <- file_temp("s3_cache") - dir_create(s3_dir) +if (nzchar(Sys.getenv("RELIC_TEST_S3"))) { + s3_dir <- fs::file_temp("s3_cache") + fs::dir_create(s3_dir) # Set minioclient directory to package directory so it is cached with packages mc_dir <- file.path(find.package("minioclient"), "mc_bin") - dir_create(mc_dir) + fs::dir_create(mc_dir) withr::local_options(list(minioclient.dir = mc_dir, minioserver.dir = mc_dir)) minioclient::install_mc() minioclient::install_minio_server() @@ -28,7 +27,7 @@ if(nzchar(Sys.getenv("RELIC_TEST_S3"))) { s3_repo <- create_example_repo(s3 = TRUE) - if(rlang::is_interactive()) { + if (rlang::is_interactive()) { s3_srv$kill() dir_delete(s3_dir) } else { @@ -36,14 +35,8 @@ if(nzchar(Sys.getenv("RELIC_TEST_S3"))) { withr::defer(dir_delete(s3_dir), testthat::teardown_env()) withr::defer(dir_delete(s3_repo), testthat::teardown_env()) } - } ## Create an example repository for testing ex_repo <- create_example_repo(s3 = FALSE) withr::defer(dir_delete(ex_repo), testthat::teardown_env()) - - - - - diff --git a/tests/testthat/test-cache.R b/tests/testthat/test-cache.R index df41bbb..8c7a2c4 100644 --- a/tests/testthat/test-cache.R +++ b/tests/testthat/test-cache.R @@ -1,8 +1,7 @@ test_that("A relic cache dir set by environent variable overrides one set by options", { - env_dir <- tempfile() - options_dir <- tempfile() + env_dir <- path_norm(tempfile()) + options_dir <- path_norm(tempfile()) withr::local_envvar(RELIC_CACHE_DIR = env_dir) withr::local_options(relic.cache.dir = options_dir) expect_equal(relic_cache(), env_dir) }) - diff --git a/tests/testthat/test-file_read_version.R b/tests/testthat/test-file_read_version.R index 93e6555..47bf1e7 100644 --- a/tests/testthat/test-file_read_version.R +++ b/tests/testthat/test-file_read_version.R @@ -1,11 +1,9 @@ -test_that("files are read in properly", { - +test_that("files are found in properly", { withr::local_dir(ex_repo) - bin_file <- file_read_version("_targets/objects/cars", "first-targets-run", repo = ex_repo)[[1]] - text_file <- file_read_version("_targets.R", "first-targets-run", repo = ex_repo)[[1]] - no_file <- file_read_version("_targets/objects/cars", "initial-target-file", repo = ex_repo)[[1]] - expect_type(bin_file, "raw") - expect_type(text_file, "character") - expect_null(no_file) + bin_file <- get_file_version("_targets/objects/cars", "first-targets-run", repo = ex_repo) + text_file <- get_file_version("_targets.R", "first-targets-run", repo = ex_repo) + expect_error(get_file_version("_targets/objects/cars", "initial-target-file", repo = ex_repo)) + expect_true(file_exists(bin_file)) + expect_true(file_exists(text_file)) }) diff --git a/tests/testthat/tests-strict.R b/tests/testthat/tests-strict.R new file mode 100644 index 0000000..33ae7d9 --- /dev/null +++ b/tests/testthat/tests-strict.R @@ -0,0 +1,53 @@ +strict_tests <- nzchar(Sys.getenv("RELIC_TEST_STRICT")) +pkgroot <- test_package_root() + +test_that("Linting", { + testthat::skip_on_cran() + testthat::skip_on_covr() + lints <- lintr::lint_package(pkgroot) + has_lints <- length(lints) > 0L + if (has_lints) { + lint_output <- format(lints) + } + if (strict_tests) { + testthat::expect(!has_lints, paste0( + "Not lint free\n", + format(lints) + )) + } else { + if (has_lints) warning("Lints found, run `lintr::lint_package()` to see them.") + expect_true(TRUE) + } +}) + +test_that("Spelling", { + testthat::skip_on_cran() + testthat::skip_on_covr() + typos <- spelling::spell_check_package(pkgroot) + has_typos <- nrow(typos) > 0L + if (strict_tests) { + expect(!has_typos, paste0( + "Not typo free: \n", + capture.output(typos) + )) + } else { + if (has_typos) warning("Typos found, run `spelling::spell_check_package()` to see them.") + expect_true(TRUE) + } +}) + +test_that("Styling", { + testthat::skip_on_cran() + testthat::skip_on_covr() + styler_output <- capture.output(styler_data <- styler::style_pkg(dry = "on")) + has_style_changes <- any(styler_data$changed) + if (strict_tests) { + expect(!has_style_changes, paste0( + "Style changes to make: \n", + collapse(styler_output, "\n") + )) + } else { + if (has_style_changes) warning("Style changes found, run `styler::style_pkg()` to fix them.") + expect_true(TRUE) + } +}) From 4d1cd5f62842c15970e33113f3750e93a9814256 Mon Sep 17 00:00:00 2001 From: Noam Ross Date: Mon, 27 Nov 2023 18:08:02 -0500 Subject: [PATCH 2/8] Update docs --- .github/CONTRIBUTING.md | 5 +---- README.Rmd | 2 +- README.md | 2 +- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 1229a20..b455e94 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -7,9 +7,6 @@ This outlines how to propose a change to `relic`. - `relic` is a high-level interface designed for working with git repositories of data science workflows. It is not intended to be a general-purpose client for git. -- `relic`'s primary features are for extracting and comparing files and data - from git history and running code within and across git history. Functions - in the package are generally _read-only_ and do not commit or modify history. - `relic` aims to have relatively few dependencies for its core functions (`git2r` and low-level packages such as `fs` and `rlang`). For extended functionality, other packages may be used, but these should live under `Suggests:` and @@ -21,7 +18,7 @@ This outlines how to propose a change to `relic`. - `relic` has specific support for workflows using [`targets`](https://books.ropensci.org/targets/). Similar functionality for other workflow managers may be considered in the future, as may high-level - interfaces for dealing with other versioned data such as S3 buckets. + interfaces for dealing with other versioned data. - `relic` uses `git2r` to interface with git/libgit2. In general `relic` functions should not call `libgit2` directly nor call `git` via the command line. If `git2r` does not expose needed functionality in `libgit2`, consider making diff --git a/README.Rmd b/README.Rmd index 62a4101..0adc15e 100644 --- a/README.Rmd +++ b/README.Rmd @@ -46,7 +46,7 @@ especially well when used in conjunction with [cloud-based, versioned object sto You can install the development version of `relic` like so: ```r -install.packages("relic", repos = c("https://ecohealthalliance.r-universe.dev")) +devtools::install_github("ecohealthalliance/relic") ``` diff --git a/README.md b/README.md index 3a3613b..a7bd7b5 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ storage](https://books.ropensci.org/targets/cloud-storage.html). You can install the development version of `relic` like so: ``` r -install.packages("relic", repos = c("https://ecohealthalliance.r-universe.dev")) +devtools::install_github("ecohealthalliance/relic") ``` ## Related work From 9931605a99b00399b1b0ba2eea2f2b3248dd4e47 Mon Sep 17 00:00:00 2001 From: Noam Ross Date: Mon, 27 Nov 2023 18:18:59 -0500 Subject: [PATCH 3/8] Fix pkgcheck dependency --- .github/workflows/R-CMD-check.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index f26309a..9d150cc 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -36,7 +36,7 @@ jobs: use-public-rspm: true - uses: r-lib/actions/setup-r-dependencies@v2 with: - extra-packages: any::sessioninfo, any::rcmdcheck, any::pkgdepends, any::covr, any::ropensci-review-tools/pkgcheck, any::riskmetric + extra-packages: any::sessioninfo, any::rcmdcheck, any::pkgdepends, any::covr, any::riskmetric, github::ropensci-review-tools/pkgcheck, needs: check - name: Run just package tests and record test coverage run: | From 05a79257c4d7a986f3a358ed5f2768fba5ead55e Mon Sep 17 00:00:00 2001 From: Noam Ross Date: Mon, 27 Nov 2023 18:31:34 -0500 Subject: [PATCH 4/8] Move to CRAN git2r version --- DESCRIPTION | 1 - 1 file changed, 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 988e125..b4a1c27 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -40,7 +40,6 @@ VignetteBuilder: knitr Remotes: cboettig/minioclient#14, - ropensci/git2r Config/testthat/edition: 3 Encoding: UTF-8 Language: en-US From a47898da00716b4d473a85e58d6352cedc2dc948 Mon Sep 17 00:00:00 2001 From: Noam Ross Date: Mon, 27 Nov 2023 18:32:52 -0500 Subject: [PATCH 5/8] Ignore timestamp file --- .Rbuildignore | 1 + .gitignore | 1 + inst/cache_timestamp.rds | Bin 95 -> 0 bytes 3 files changed, 2 insertions(+) delete mode 100644 inst/cache_timestamp.rds diff --git a/.Rbuildignore b/.Rbuildignore index 9663f4e..e0372e9 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -12,5 +12,6 @@ ^pkgdown$ ^inst/scratch$ ^inst/mc$ +^ints/cache_timestamp.rds ^check$ ^artifacts$ diff --git a/.gitignore b/.gitignore index 5b7d215..663a0f7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ inst/doc inst/scratch +inst/cache_timestamp.rds inst/mc docs tests/testthat/*.log diff --git a/inst/cache_timestamp.rds b/inst/cache_timestamp.rds deleted file mode 100644 index b71bc3ecbc33e5fd5f1710d9f8d356792b76f4c6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 95 zcmb2|=3oE==I#ec2?+^l35jVb32CfGk`d0%cS>{{l4>0I7C%1g!T(Y3m=hbDn3}-+ w#@0sRB^s9M8+S;Y5MW4BTcJ?hP`@GFrHFINX|`w67$W~OuygK_O9h$>0J~EkzyJUM From 6a52691d47d540ae37374cabf22870d32776024b Mon Sep 17 00:00:00 2001 From: Noam Ross Date: Mon, 27 Nov 2023 18:50:31 -0500 Subject: [PATCH 6/8] Update docs --- man/relic-package.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/relic-package.Rd b/man/relic-package.Rd index 9861f9d..3d4621c 100644 --- a/man/relic-package.Rd +++ b/man/relic-package.Rd @@ -8,7 +8,7 @@ \description{ \if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}} -The 'relic' package provides tools for extracting files and objects from the history of a git repository. It is a high-level interface designed to enable comparison of objects in reproducible research workflows, especially pipelines that use the 'targets' package. +The 'relic' package provides tools for extracting files and objects from the revision history, including local and remote git repositories and S3 buckets. It is a high-level interface designed to enable comparison of objects in reproducible research workflows, especially pipelines that use the 'targets' package. } \seealso{ Useful links: From 132ffd2d7b45bb29f8842e6afda9dfd5db950f34 Mon Sep 17 00:00:00 2001 From: Noam Ross Date: Mon, 27 Nov 2023 19:03:08 -0500 Subject: [PATCH 7/8] Restore github git2r --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index b4a1c27..988e125 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -40,6 +40,7 @@ VignetteBuilder: knitr Remotes: cboettig/minioclient#14, + ropensci/git2r Config/testthat/edition: 3 Encoding: UTF-8 Language: en-US From 8b8efa3e76fc7caac52eff781f5038665c3c96b2 Mon Sep 17 00:00:00 2001 From: Noam Ross Date: Mon, 27 Nov 2023 19:23:09 -0500 Subject: [PATCH 8/8] Remove styling test --- R/cache.R | 2 +- tests/testthat/test-cache.R | 4 ++-- tests/testthat/tests-strict.R | 16 +--------------- 3 files changed, 4 insertions(+), 18 deletions(-) diff --git a/R/cache.R b/R/cache.R index dc4c9d7..046162f 100644 --- a/R/cache.R +++ b/R/cache.R @@ -12,7 +12,7 @@ #' @examples #' relic_cache() relic_cache <- function() { - path_norm(dir_create(Sys.getenv( + dir_create(path_tidy(Sys.getenv( "RELIC_CACHE_DIR", getOption("relic.cache.dir", default = tools::R_user_dir("relic", "cache") diff --git a/tests/testthat/test-cache.R b/tests/testthat/test-cache.R index 8c7a2c4..02f8ca0 100644 --- a/tests/testthat/test-cache.R +++ b/tests/testthat/test-cache.R @@ -1,6 +1,6 @@ test_that("A relic cache dir set by environent variable overrides one set by options", { - env_dir <- path_norm(tempfile()) - options_dir <- path_norm(tempfile()) + env_dir <- path_tidy(tempfile()) + options_dir <- path_tidy(tempfile()) withr::local_envvar(RELIC_CACHE_DIR = env_dir) withr::local_options(relic.cache.dir = options_dir) expect_equal(relic_cache(), env_dir) diff --git a/tests/testthat/tests-strict.R b/tests/testthat/tests-strict.R index 33ae7d9..fe18869 100644 --- a/tests/testthat/tests-strict.R +++ b/tests/testthat/tests-strict.R @@ -36,18 +36,4 @@ test_that("Spelling", { } }) -test_that("Styling", { - testthat::skip_on_cran() - testthat::skip_on_covr() - styler_output <- capture.output(styler_data <- styler::style_pkg(dry = "on")) - has_style_changes <- any(styler_data$changed) - if (strict_tests) { - expect(!has_style_changes, paste0( - "Style changes to make: \n", - collapse(styler_output, "\n") - )) - } else { - if (has_style_changes) warning("Style changes found, run `styler::style_pkg()` to fix them.") - expect_true(TRUE) - } -}) +