From 6084e0afa86da04fef52c41eafff4aa005177a84 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 29 May 2024 23:40:21 -0300 Subject: [PATCH] docs: Update readme for release (#42) --- DESCRIPTION | 2 +- NAMESPACE | 2 + R/array.R | 102 ++++++++++++++++++++++++------- R/pkg-nanoarrow.R | 16 ++--- R/pkg-wk.R | 3 + R/type.R | 38 +++++++++++- README.Rmd | 48 ++++++++++++++- README.md | 112 ++++++++++++++++++++++++++++++++++- man/geoarrow-package.Rd | 4 +- man/geoarrow_schema_parse.Rd | 15 ++++- tests/testthat/test-array.R | 49 ++++++++++++--- tests/testthat/test-type.R | 25 ++++++++ 12 files changed, 374 insertions(+), 42 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 54920b4..ede40f9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -30,7 +30,7 @@ Roxygen: list(markdown = TRUE) RoxygenNote: 7.2.3 Imports: nanoarrow (>= 0.5.0), - wk (>= 0.6.0) + wk (>= 0.9.0) LinkingTo: wk Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index 34f3db7..9bd1c48 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -35,6 +35,7 @@ S3method(wk_handle,geoarrow_vctr) S3method(wk_is_geodesic,geoarrow_vctr) export(as_geoarrow_array) export(as_geoarrow_array_stream) +export(as_geoarrow_schema) export(as_geoarrow_vctr) export(geoarrow_handle) export(geoarrow_large_wkb) @@ -51,6 +52,7 @@ export(geoarrow_wkb) export(geoarrow_wkt) export(geoarrow_writer) export(infer_geoarrow_schema) +export(is_geoarrow_schema) export(na_extension_geoarrow) export(na_extension_large_wkb) export(na_extension_large_wkt) diff --git a/R/array.R b/R/array.R index 60c3b85..9f9337f 100644 --- a/R/array.R +++ b/R/array.R @@ -19,6 +19,8 @@ as_geoarrow_array <- function(x, ..., schema = NULL) { as_geoarrow_array.default <- function(x, ..., schema = NULL) { if (is.null(schema)) { schema <- infer_geoarrow_schema(x) + } else { + schema <- nanoarrow::as_nanoarrow_schema(schema) } wk::wk_handle(x, geoarrow_writer(schema)) @@ -26,23 +28,47 @@ as_geoarrow_array.default <- function(x, ..., schema = NULL) { #' @export as_geoarrow_array.nanoarrow_array <- function(x, ..., schema = NULL) { - schema_src <- nanoarrow::infer_nanoarrow_schema(x) + x_schema <- nanoarrow::infer_nanoarrow_schema(x) + + if (!is.null(schema)) { + schema <- nanoarrow::as_nanoarrow_schema(schema) + } + + # If this is not already a geoarrow array, see if we can interpret it as one + if (!is_geoarrow_schema(x_schema)) { + x_schema <- as_geoarrow_schema(x_schema) + + # Try to copy metadata from requested schema if present (e.g., so that + # a CRS or edge type can be assigned) + request_metadata <- schema$metadata[["ARROW:extension:metadata"]] + if (!is.null(request_metadata)) { + x_schema$metadata[["ARROW:extension:metadata"]] <- request_metadata + } + + # Reinterpret the array + x <- reinterpret_array(x, x_schema) + } + + if (is.null(schema)) { - schema <- infer_geoarrow_schema(x) + return(x) } - parsed <- geoarrow_schema_parse(schema) - parsed_src <- geoarrow_schema_parse(schema_src) - if (identical(parsed, parsed_src)) { - x - } else { - stream <- geoarrow_kernel_call_scalar( - "as_geoarrow", - nanoarrow::basic_array_stream(list(x), schema = schema_src, validate = FALSE), - list("type" = parsed$id) - ) - stream$get_next() + # If the source and request type are the same, return x + x_schema_parsed <- geoarrow_schema_parse(x_schema) + schema_parsed <- geoarrow_schema_parse(schema) + if (identical(x_schema_parsed, schema_parsed)) { + return(x) } + + # Otherwise, let as_geoarrow() sort this out + stream <- geoarrow_kernel_call_scalar( + "as_geoarrow", + nanoarrow::basic_array_stream(list(x), schema = x_schema, validate = FALSE), + list("type" = schema_parsed$id) + ) + + stream$get_next() } #' @export @@ -58,6 +84,10 @@ as_geoarrow_array_stream <- function(x, ..., schema = NULL) { #' @export as_geoarrow_array_stream.default <- function(x, ..., schema = NULL) { + if (!is.null(schema)) { + schema <- nanoarrow::as_nanoarrow_schema(schema) + } + nanoarrow::basic_array_stream( list(as_geoarrow_array(x, schema = schema)), schema = schema @@ -66,30 +96,60 @@ as_geoarrow_array_stream.default <- function(x, ..., schema = NULL) { #' @export as_geoarrow_array_stream.nanoarrow_array_stream <- function(x, ..., schema = NULL) { + x_schema <- x$get_schema() + + if (!is.null(schema)) { + schema <- nanoarrow::as_nanoarrow_schema(schema) + } + + # If this is not already a geoarrow array, see if we can interpret it as one + if (!is_geoarrow_schema(x_schema)) { + x_schema <- as_geoarrow_schema(x_schema) + + # Try to copy metadata from requested schema if present (e.g., so that + # a CRS or edge type can be assigned) + request_metadata <- schema$metadata[["ARROW:extension:metadata"]] + if (!is.null(request_metadata)) { + x_schema$metadata[["ARROW:extension:metadata"]] <- request_metadata + } + + # Reinterpret the array + x <- reinterpret_stream(x, x_schema) + } + + # If there is no requested schema, just return the stream (i.e., don't attempt to + # consume a stream to find its optimal output type) if (is.null(schema)) { return(x) } - x_schema <- x$get_schema() + # If the source is the same as the destination, return it x_schema_parsed <- geoarrow_schema_parse(x_schema) schema_parsed <- geoarrow_schema_parse(schema) if (identical(x_schema_parsed, schema_parsed)) { return(x) } - collected <- nanoarrow::collect_array_stream( - x, - schema = x_schema, - validate = FALSE - ) - geoarrow_kernel_call_scalar( "as_geoarrow", - nanoarrow::basic_array_stream(collected), + reinterpret_stream(x, x_schema, validate = FALSE), list("type" = schema_parsed$id) ) } +reinterpret_array <- function(array, schema, validate = TRUE) { + array2 <- nanoarrow::nanoarrow_allocate_array() + nanoarrow::nanoarrow_pointer_export(array, array2) + nanoarrow::nanoarrow_array_set_schema(array2, schema, validate = validate) + array2 +} + +reinterpret_stream <- function(stream, schema, validate = TRUE) { + # Eventually we should stream this somehow instead of collecting it all + chunks <- nanoarrow::collect_array_stream(stream, schema = schema, validate = FALSE) + nanoarrow::basic_array_stream(chunks, schema, validate = validate) +} + geoarrow_array_from_buffers <- function(schema, buffers) { schema <- nanoarrow::as_nanoarrow_schema(schema) extension_name <- schema$metadata[["ARROW:extension:name"]] diff --git a/R/pkg-nanoarrow.R b/R/pkg-nanoarrow.R index 9d185ad..bc75123 100644 --- a/R/pkg-nanoarrow.R +++ b/R/pkg-nanoarrow.R @@ -2,13 +2,7 @@ # Runs before coverage starts on load # nocov start register_geoarrow_extension <- function() { - all_ext_name <- c( - "geoarrow.wkt", "geoarrow.wkb", "geoarrow.point", "geoarrow.linestring", - "geoarrow.polygon", "geoarrow.multipoint", "geoarrow.mutlilinestring", - "geoarrow.multipolygon" - ) - - for (ext_name in all_ext_name) { + for (ext_name in all_extension_names()) { nanoarrow::register_nanoarrow_extension( ext_name, nanoarrow::nanoarrow_extension_spec(subclass = "geoarrow_extension_spec") @@ -50,3 +44,11 @@ as_nanoarrow_array_extension.geoarrow_extension_spec <- function( convert_array.geoarrow_vctr <- function(array, to, ...) { as_geoarrow_vctr(array, schema = nanoarrow::as_nanoarrow_schema(to)) } + +all_extension_names <- function() { + c( + "geoarrow.wkt", "geoarrow.wkb", "geoarrow.point", "geoarrow.linestring", + "geoarrow.polygon", "geoarrow.multipoint", "geoarrow.mutlilinestring", + "geoarrow.multipolygon" + ) +} diff --git a/R/pkg-wk.R b/R/pkg-wk.R index 43f578f..2b0586b 100644 --- a/R/pkg-wk.R +++ b/R/pkg-wk.R @@ -26,6 +26,7 @@ wk_is_geodesic.geoarrow_vctr <- function(x) { #' @export as_geoarrow_array.wk_wkt <- function(x, ..., schema = NULL) { if (!is.null(schema)) { + schema <- nanoarrow::as_nanoarrow_schema(schema) if (!identical(schema$metadata[["ARROW:extension:name"]], "geoarrow.wkt")) { return(NextMethod()) } @@ -48,6 +49,7 @@ as_geoarrow_array.wk_wkt <- function(x, ..., schema = NULL) { #' @export as_geoarrow_array.wk_wkb <- function(x, ..., schema = NULL) { if (!is.null(schema)) { + schema <- nanoarrow::as_nanoarrow_schema(schema) if (!identical(schema$metadata[["ARROW:extension:name"]], "geoarrow.wkb")) { return(NextMethod()) } @@ -70,6 +72,7 @@ as_geoarrow_array.wk_wkb <- function(x, ..., schema = NULL) { #' @export as_geoarrow_array.wk_xy <- function(x, ..., schema = NULL) { if (!is.null(schema)) { + schema <- nanoarrow::as_nanoarrow_schema(schema) if (!identical(schema$metadata[["ARROW:extension:name"]], "geoarrow.point")) { return(NextMethod()) } diff --git a/R/type.R b/R/type.R index d82c669..17f5c86 100644 --- a/R/type.R +++ b/R/type.R @@ -202,6 +202,8 @@ geoarrow_multipolygon <- function(dimensions = "XY", #' #' @param schema A [nanoarrow_schema][nanoarrow::as_nanoarrow_schema] #' @param extension_name An extension name to use if schema is a storage type. +#' @param infer_from_storage Attempt to guess an extension name if schema is not +#' a geoarrow extension type. #' #' @return A list of parsed properties #' @export @@ -209,15 +211,49 @@ geoarrow_multipolygon <- function(dimensions = "XY", #' @examples #' geoarrow_schema_parse(na_extension_geoarrow("POINT")) #' -geoarrow_schema_parse <- function(schema, extension_name = NULL) { +geoarrow_schema_parse <- function(schema, extension_name = NULL, + infer_from_storage = FALSE) { schema <- nanoarrow::as_nanoarrow_schema(schema) if (!is.null(extension_name)) { extension_name <- as.character(extension_name)[1] + } else if (infer_from_storage && is.null(schema$metadata[["ARROW:extension:name"]])) { + # Only a few storage types have unambiguous representations + extension_name <- switch( + schema$format, + "z" = , + "Z" = "geoarrow.wkb", + "u" = , + "U" = "geoarrow.wkt", + "+s" = "geoarrow.point" + ) } .Call(geoarrow_c_schema_parse, schema, extension_name) } +#' @rdname geoarrow_schema_parse +#' @export +is_geoarrow_schema <- function(schema) { + schema <- nanoarrow::as_nanoarrow_schema(schema) + ext <- schema$metadata[["ARROW:extension:name"]] + !is.null(ext) && (ext %in% all_extension_names()) +} + +#' @rdname geoarrow_schema_parse +#' @export +as_geoarrow_schema <- function(schema) { + schema <- nanoarrow::as_nanoarrow_schema(schema) + + # If this is already a geoarrow extension type, we're ready! + if (is_geoarrow_schema(schema)) { + return(schema) + } + + # Otherwise, try to infer + parsed <- geoarrow_schema_parse(schema, infer_from_storage = TRUE) + na_extension_geoarrow_internal(parsed$id, NULL, parsed$edge_type) +} + na_extension_geoarrow_internal <- function(type_id, crs, edges) { metadata <- na_extension_metadata_internal(crs, edges) schema <- nanoarrow::nanoarrow_allocate_schema() diff --git a/README.Rmd b/README.Rmd index 018b9cb..9fbd9ac 100644 --- a/README.Rmd +++ b/README.Rmd @@ -38,8 +38,54 @@ pak::pak("geoarrow/geoarrow-r") ## Example -This is a basic example which shows you how to solve a common problem: +The geoarrow package implements conversions to/from various geospatial types (e.g., sf, sfc, s2, wk) with various Arrow representations (e.g., arrow, nanoarrow). The most useful conversions are between the **arrow** and **sf** packages, which in most cases allow sf objects to be passed to **arrow** functions directly after `library(geoarrow)` or `requireNamespace("geoarrow")` has been called. ```{r example} library(geoarrow) +library(arrow, warn.conflicts = FALSE) +library(sf) + +nc <- read_sf(system.file("gpkg/nc.gpkg", package = "sf")) +tf <- tempfile(fileext = ".parquet") + +nc |> + tibble::as_tibble() |> + write_parquet(tf) + +open_dataset(tf) |> + dplyr::filter(startsWith(NAME, "A")) |> + dplyr::select(NAME, geom) |> + st_as_sf() +``` + +By default, arrow objects are converted to a neutral wrapper around chunked Arrow memory, which in turn implements conversions to most spatial types: + +```{r} +df <- read_parquet(tf) +df$geom +st_as_sfc(df$geom) +``` + +The entry point to creating arrays is `as_geoarrow_vctr()`: + +```{r} +as_geoarrow_vctr(c("POINT (0 1)", "POINT (2 3)")) ``` + +By default these do not attempt to create a new storage type; however, you can request a storage type or infer one from the data: + +```{r} +as_geoarrow_vctr(c("POINT (0 1)", "POINT (2 3)"), schema = geoarrow_native("POINT")) + +vctr <- as_geoarrow_vctr(c("POINT (0 1)", "POINT (2 3)")) +as_geoarrow_vctr(vctr, schema = infer_geoarrow_schema(vctr)) +``` + +There are a number of files to use as examples at that can be read with `arrow::read_ipc_file()`: + +```{r} +url <- "https://github.com/geoarrow/geoarrow-data/releases/download/v0.1.0/ns-water-basin_point.arrow" +tab <- read_ipc_file(url, as_data_frame = FALSE) +tab$geometry$type +``` + diff --git a/README.md b/README.md index 28efd84..7b469dc 100644 --- a/README.md +++ b/README.md @@ -36,8 +36,118 @@ pak::pak("geoarrow/geoarrow-r") ## Example -This is a basic example which shows you how to solve a common problem: +The geoarrow package implements conversions to/from various geospatial +types (e.g., sf, sfc, s2, wk) with various Arrow representations (e.g., +arrow, nanoarrow). The most useful conversions are between the **arrow** +and **sf** packages, which in most cases allow sf objects to be passed +to **arrow** functions directly after `library(geoarrow)` or +`requireNamespace("geoarrow")` has been called. ``` r library(geoarrow) +library(arrow, warn.conflicts = FALSE) +#> Warning: package 'arrow' was built under R version 4.3.3 +library(sf) +#> Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE + +nc <- read_sf(system.file("gpkg/nc.gpkg", package = "sf")) +tf <- tempfile(fileext = ".parquet") + +nc |> + tibble::as_tibble() |> + write_parquet(tf) + +open_dataset(tf) |> + dplyr::filter(startsWith(NAME, "A")) |> + dplyr::select(NAME, geom) |> + st_as_sf() +#> Simple feature collection with 6 features and 1 field +#> Geometry type: MULTIPOLYGON +#> Dimension: XY +#> Bounding box: xmin: -82.07776 ymin: 34.80792 xmax: -79.23799 ymax: 36.58965 +#> Geodetic CRS: NAD27 +#> NAME geom +#> 1 Ashe MULTIPOLYGON (((-81.47276 3... +#> 2 Alleghany MULTIPOLYGON (((-81.23989 3... +#> 3 Avery MULTIPOLYGON (((-81.94135 3... +#> 4 Alamance MULTIPOLYGON (((-79.24619 3... +#> 5 Alexander MULTIPOLYGON (((-81.10889 3... +#> 6 Anson MULTIPOLYGON (((-79.91995 3... +``` + +By default, arrow objects are converted to a neutral wrapper around +chunked Arrow memory, which in turn implements conversions to most +spatial types: + +``` r +df <- read_parquet(tf) +df$geom +#> +#> [1] +#> [2] +#> [3] +#> [4] +#> [5] +#> [6] +#> [7] +#> [8] +#> [9] +#> [10] +#> [11] +#> [12] +#> [13] +#> [14] +#> [15] +#> [16] +#> [17] +#> [18] +#> [19] +#> [20] +#> ...and 80 more values +st_as_sfc(df$geom) +#> Geometry set for 100 features +#> Geometry type: MULTIPOLYGON +#> Dimension: XY +#> Bounding box: xmin: -84.32385 ymin: 33.88199 xmax: -75.45698 ymax: 36.58965 +#> Geodetic CRS: NAD27 +#> First 5 geometries: +#> MULTIPOLYGON (((-81.47276 36.23436, -81.54084 3... +#> MULTIPOLYGON (((-81.23989 36.36536, -81.24069 3... +#> MULTIPOLYGON (((-80.45634 36.24256, -80.47639 3... +#> MULTIPOLYGON (((-76.00897 36.3196, -76.01735 36... +#> MULTIPOLYGON (((-77.21767 36.24098, -77.23461 3... +``` + +The entry point to creating arrays is `as_geoarrow_vctr()`: + +``` r +as_geoarrow_vctr(c("POINT (0 1)", "POINT (2 3)")) +#> +#> [1] +``` + +By default these do not attempt to create a new storage type; however, +you can request a storage type or infer one from the data: + +``` r +as_geoarrow_vctr(c("POINT (0 1)", "POINT (2 3)"), schema = geoarrow_native("POINT")) +#> +#> [1] + +vctr <- as_geoarrow_vctr(c("POINT (0 1)", "POINT (2 3)")) +as_geoarrow_vctr(vctr, schema = infer_geoarrow_schema(vctr)) +#> +#> [1] +``` + +There are a number of files to use as examples at + that can be read with +`arrow::read_ipc_file()`: + +``` r +url <- "https://github.com/geoarrow/geoarrow-data/releases/download/v0.1.0/ns-water-basin_point.arrow" +tab <- read_ipc_file(url, as_data_frame = FALSE) +tab$geometry$type +#> GeometryExtensionType +#> geoarrow.multipoint