diff --git a/src/DimensionalData.jl b/src/DimensionalData.jl
index 9eb0c0ae2..0b176b383 100644
--- a/src/DimensionalData.jl
+++ b/src/DimensionalData.jl
@@ -84,6 +84,7 @@ const DD = DimensionalData
 # Common
 include("interface.jl")
 include("name.jl")
+include("table_ops.jl")
 
 # Arrays
 include("array/array.jl")
diff --git a/src/array/array.jl b/src/array/array.jl
index 5acfd5ea9..f02449401 100644
--- a/src/array/array.jl
+++ b/src/array/array.jl
@@ -334,7 +334,7 @@ end
     DimArray <: AbstractDimArray
 
     DimArray(data, dims, refdims, name, metadata)
-    DimArray(data, dims::Tuple; refdims=(), name=NoName(), metadata=NoMetadata())
+    DimArray(data, dims::Tuple; refdims=(), name=NoName(), metadata=NoMetadata(), selector=Near())
 
 The main concrete subtype of [`AbstractDimArray`](@ref).
 
@@ -344,12 +344,13 @@ moves dimensions to reference dimension `refdims` after reducing operations
 
 ## Arguments
 
-- `data`: An `AbstractArray`.
+- `data`: An `AbstractArray` or a table with coordinate columns corresponding to `dims`.
 - `dims`: A `Tuple` of `Dimension`
 - `name`: A string name for the array. Shows in plots and tables.
 - `refdims`: refence dimensions. Usually set programmatically to track past
     slices and reductions of dimension for labelling and reconstruction.
 - `metadata`: `Dict` or `Metadata` object, or `NoMetadata()`
+- `selector`: The coordinate selector used when materializing from a table (default `Near()`).
 
 Indexing can be done with all regular indices, or with [`Dimension`](@ref)s
 and/or [`Selector`](@ref)s.
@@ -429,6 +430,36 @@ function DimArray(A::AbstractBasicDimArray;
     newdata = collect(data)
     DimArray(newdata, format(dims, newdata); refdims, name, metadata)
 end
+# Write a single column from a table with one or more coordinate columns to a DimArray
+function DimArray(table, dims; name=NoName(), selector=Near(), precision=6, missingval=missing, kw...)
+    # Confirm that the Tables interface is implemented, then switch to column access
+    Tables.istable(table) || throw(ArgumentError("`table` must satisfy the `Tables.jl` interface."))
+    columns = Tables.columns(table)
+
+    # Get array dimensions
+    dims = guess_dims(columns, dims, precision=precision)
+
+    # Determine row indices based on coordinate values
+    indices = coords_to_indices(columns, dims; selector=selector)
+
+    # Extract the data column corresponding to `name` (first non-coordinate column by default)
+    col = name isa NoName ? first(data_col_names(columns, dims)) : Symbol(name)
+    data = Tables.getcolumn(columns, col)
+
+    # Restore array data
+    array = restore_array(data, indices, dims, missingval)
+
+    # Return DimArray
+    return DimArray(array, dims; name=col, kw...)
+end
+# Same as above, but guess dimension names
+function DimArray(table; precision=6, kw...)
+    # Confirm that the Tables interface is implemented
+    Tables.istable(table) || throw(ArgumentError("`table` must satisfy the `Tables.jl` interface."))
+
+    # Guess dimensions; only `precision` is forwarded because `guess_dims` accepts no other keyword
+    return DimArray(table, guess_dims(Tables.columns(table); precision=precision); precision=precision, kw...)
+end
 
 """
     DimArray(f::Function, dim::Dimension; [name])
@@ -437,7 +468,7 @@ Apply function `f` across the values of the dimension `dim`
 the given dimension. Optionally provide a name for the result.
 """
 function DimArray(f::Function, dim::Dimension; name=Symbol(nameof(f), "(", name(dim), ")"))
-     DimArray(f.(val(dim)), (dim,); name)
+    DimArray(f.(val(dim)), (dim,); name)
 end
 
 const DimVector = DimArray{T,1} where T
diff --git a/src/stack/stack.jl b/src/stack/stack.jl
index e5dd2763b..6e47f1ccb 100644
--- a/src/stack/stack.jl
+++ b/src/stack/stack.jl
@@ -278,6 +278,7 @@ end
 """
     DimStack <: AbstractDimStack
 
+    DimStack(table, dims::Tuple; kw...)
     DimStack(data::AbstractDimArray...; kw...)
     DimStack(data::Tuple{Vararg{AbstractDimArray}}; kw...)
     DimStack(data::NamedTuple{Keys,Vararg{AbstractDimArray}}; kw...)
@@ -420,5 +421,16 @@ function DimStack(data::NamedTuple, dims::Tuple;
     all(map(d -> axes(d) == axes(first(data)), data)) || _stack_size_mismatch()
     DimStack(data, format(dims, first(data)), refdims, layerdims, metadata, layermetadata)
 end
+# Write each column from a table with one or more coordinate columns to a layer in a DimStack
+function DimStack(table, dims::Tuple; selector=DimensionalData.Contains(), kw...)
+    # Confirm that the Tables interface is implemented, then switch to column access
+    Tables.istable(table) || throw(ArgumentError("`table` must satisfy the `Tables.jl` interface."))
+    columns = Tables.columns(table)
+    data_cols = _data_cols(columns, dims)
+    indices = coords_to_indices(columns, dims; selector=selector)
+    # `restore_array` takes the fill value as its fourth positional argument, not as a keyword
+    layers = map(col -> restore_array(col, indices, dims, missing), data_cols)
+    return DimStack(layers, dims; kw...)
+end
 
 layerdims(s::DimStack{<:Any,<:Any,<:Any,<:Any,<:Any,<:Any,Nothing}, name::Symbol) = dims(s)
diff --git a/src/table_ops.jl b/src/table_ops.jl
new file mode 100644
index 000000000..73b3d2525
--- /dev/null
+++ b/src/table_ops.jl
@@ -0,0 +1,298 @@
+"""
+    restore_array(data::AbstractVector, indices::AbstractVector{<:Integer}, dims::Tuple, missingval)
+
+Restore a dimensional array from its tabular representation.
+
+# Arguments
+- `data`: An `AbstractVector` containing the flat data to be written to a `DimArray`.
+- `indices`: An `AbstractVector` containing the flat indices corresponding to each element in `data`.
+- `dims`: The dimensions of the destination `DimArray`.
+- `missingval`: The value to write for missing elements in `data`.
+
+# Returns
+An `Array` containing the ordered values in `data` with the size specified by `dims`.
+"""
+function restore_array(data::AbstractVector, indices::AbstractVector{<:Integer}, dims::Tuple, missingval)
+    # Pre-fill the destination so that cells without a matching row are never read uninitialised
+    dst_size = map(length, dims)
+    fill_value = _cast_missing(data, missingval)
+    dst = Vector{Union{eltype(data),typeof(fill_value)}}(undef, prod(dst_size))
+    fill!(dst, fill_value)
+
+    # Scatter each row's value to its flat index
+    dst[indices] .= data
+
+    # `identity.` narrows the element type again when no fill value remains; then reshape
+    return reshape(identity.(dst), dst_size)
+end
+
+"""
+    coords_to_indices(table, dims; selector=Near())
+
+Return the flat index of each row in `table` based on its associated coordinates.
+Dimension columns are determined from the name of each dimension in `dims`.
+It is assumed that the source/destination array has the same dimension order as `dims`.
+
+# Arguments
+- `table`: A table representation of a dimensional array.
+- `dims`: A `Tuple` of `Dimension` corresponding to the source/destination array.
+- `selector`: The selector type to use for non-numerical/irregular coordinates.
+
+# Example
+```julia
+julia> d = DimArray(rand(256, 256), (X, Y));
+
+julia> t = DimTable(d);
+
+julia> coords_to_indices(t, dims(d))
+65536-element Vector{Int64}:
+     1
+     2
+     ⋮
+ 65535
+ 65536
+```
+"""
+function coords_to_indices(table, dims::Tuple; selector=DimensionalData.Near())
+    return _coords_to_indices(table, dims, selector)
+end
+
+# Find the order of the table's rows according to the coordinate values
+_coords_to_indices(table, dims::Tuple, sel::DimensionalData.Selector) =
+    _coords_to_indices(_dim_cols(table, dims), dims, sel)
+function _coords_to_indices(coords::NamedTuple, dims::Tuple, sel::DimensionalData.Selector)
+    ords = _coords_to_ords(coords, dims, sel)
+    indices = _ords_to_indices(ords, dims)
+    return indices
+end
+
+"""
+    guess_dims(table; kw...)
+    guess_dims(table, dims; precision=6)
+
+Guesses the dimensions of an array based on the provided tabular representation.
+
+# Arguments
+- `table`: The input data table, which could be a `DataFrame`, `DimTable`, or any other Tables.jl compatible data structure.
+The dimensions will be inferred from the corresponding coordinate columns in the table.
+- `dims`: A `Tuple` of one or more dimensions to be inferred. If no dimensions are specified, then `guess_dims` will default
+to any available dimensions in the set `(:X, :Y, :Z, :Ti, :Band)`. Dimensions can be given as either a singular
+value or as a `Pair` with both the dimensions and corresponding order. The order will be inferred from the data
+when none is given. This should work for sorted coordinates, but will not be sufficient when the table's rows are
+out of order.
+
+# Keyword Arguments
+- `precision`: Specifies the number of digits to use for guessing dimensions (default = `6`).
+
+# Returns
+A tuple containing the inferred dimensions from the table.
+
+# Example
+```julia
+julia> xdims = X(LinRange{Float64}(610000.0, 661180.0, 2560));
+
+julia> ydims = Y(LinRange{Float64}(6.84142e6, 6.79024e6, 2560));
+
+julia> bdims = Dim{:Band}([:B02, :B03, :B04]);
+
+julia> d = DimArray(rand(UInt16, 2560, 2560, 3), (xdims, ydims, bdims));
+
+julia> t = DataFrame(d);
+
+julia> t_rand = Random.shuffle(t);
+
+julia> dims(d)
+↓ X Sampled{Float64} LinRange{Float64}(610000.0, 661180.0, 2560) ForwardOrdered Regular Points,
+→ Y Sampled{Float64} LinRange{Float64}(6.84142e6, 6.79024e6, 2560) ReverseOrdered Regular Points,
+↗ Band Categorical{Symbol} [:B02, :B03, :B04] ForwardOrdered
+
+julia> DD.guess_dims(t)
+↓ X Sampled{Float64} 610000.0:20.0:661180.0 ForwardOrdered Regular Points,
+→ Y Sampled{Float64} 6.84142e6:-20.0:6.79024e6 ReverseOrdered Regular Points,
+↗ Band Categorical{Symbol} [:B02, :B03, :B04] ForwardOrdered
+
+julia> DD.guess_dims(t, (X, Y, :Band))
+↓ X Sampled{Float64} 610000.0:20.0:661180.0 ForwardOrdered Regular Points,
+→ Y Sampled{Float64} 6.84142e6:-20.0:6.79024e6 ReverseOrdered Regular Points,
+↗ Band Categorical{Symbol} [:B02, :B03, :B04] ForwardOrdered
+
+julia> DD.guess_dims(t_rand, (X => DD.ForwardOrdered, Y => DD.ReverseOrdered, :Band => DD.ForwardOrdered))
+↓ X Sampled{Float64} 610000.0:20.0:661180.0 ForwardOrdered Regular Points,
+→ Y Sampled{Float64} 6.84142e6:-20.0:6.79024e6 ReverseOrdered Regular Points,
+↗ Band Categorical{Symbol} [:B02, :B03, :B04] ForwardOrdered
+```
+"""
+guess_dims(table; kw...) = guess_dims(table, _dim_col_names(table); kw...)
+function guess_dims(table, dims::Tuple; precision=6)
+    return map(dim -> _guess_dims(get_column(table, dim), dim, precision), dims)
+end
+
+"""
+    get_column(table, dim::Type{<:DD.Dimension})
+    get_column(table, dim::DD.Dimension)
+    get_column(table, dim::Symbol)
+    get_column(table, dim::Pair)
+
+Retrieve the coordinate data stored in the column specified by `dim`.
+
+# Arguments
+- `table`: The input data table, which could be a `DataFrame`, `DimTable`, or any other Tables.jl compatible data structure.
+- `dim`: A single dimension to be retrieved, which may be a `Symbol`, a `Dimension`, or a `Dimension => Order` pair.
+"""
+get_column(table, x::Type{<:DD.Dimension}) = Tables.getcolumn(table, DD.name(x))
+get_column(table, x::DD.Dimension) = Tables.getcolumn(table, DD.name(x))
+get_column(table, x::Symbol) = Tables.getcolumn(table, x)
+get_column(table, x::Pair) = get_column(table, first(x))
+
+"""
+    data_col_names(table, dims::Tuple)
+
+Return the names of all columns that don't match the dimensions given by `dims`.
+
+# Arguments
+- `table`: The input data table, which could be a `DataFrame`, `DimTable`, or any other Tables.jl compatible data structure.
+- `dims`: A `Tuple` of one or more `Dimensions`.
+"""
+function data_col_names(table, dims::Tuple)
+    dim_cols = DD.name(dims)
+    return filter(x -> !(x in dim_cols), Tables.columnnames(table))
+end
+
+_guess_dims(coords::AbstractVector, dim::DD.Dimension, args...) = dim
+_guess_dims(coords::AbstractVector, dim::Type{<:DD.Dimension}, args...) = _guess_dims(coords, DD.name(dim), args...)
+_guess_dims(coords::AbstractVector, dim::Pair, args...) = _guess_dims(coords, first(dim), last(dim), args...)
+function _guess_dims(coords::AbstractVector, dim::Symbol, ::Type{T}, precision::Int) where {T <: DD.Order}
+    return _guess_dims(coords, dim, T(), precision)
+end
+function _guess_dims(coords::AbstractVector, dim::Symbol, precision::Int)
+    dim_vals = _dim_vals(coords, precision)
+    order = _guess_dim_order(dim_vals)
+    span = _guess_dim_span(dim_vals, order, precision)
+    return _build_dim(dim_vals, dim, order, span)
+end
+function _guess_dims(coords::AbstractVector, dim::Symbol, order::DD.Order, precision::Int)
+    dim_vals = _dim_vals(coords, order, precision)
+    span = _guess_dim_span(dim_vals, order, precision)
+    return _build_dim(dim_vals, dim, order, span)
+end
+
+# Extract coordinate columns from table
+function _dim_cols(table, dims::Tuple)
+    dim_cols = DD.name(dims)
+    return NamedTuple{dim_cols}(Tables.getcolumn(table, col) for col in dim_cols)
+end
+
+# Extract dimension column names from the given table (use `_dim_cols` for the column data)
+_dim_col_names(table) = filter(x -> x in Tables.columnnames(table), (:X,:Y,:Z,:Ti,:Band))
+_dim_col_names(table, dims::Tuple) = DD.name(dims)
+
+# Extract data columns from table
+function _data_cols(table, dims::Tuple)
+    data_cols = data_col_names(table, dims)
+    return NamedTuple{Tuple(data_cols)}(Tables.getcolumn(table, col) for col in data_cols)
+end
+
+# Determine the ordinality of a set of coordinates
+_coords_to_ords(coords::AbstractVector, dim::Dimension, sel::DD.Selector) = _coords_to_ords(coords, dim, sel, DD.locus(dim), DD.span(dim))
+_coords_to_ords(coords::Tuple, dims::Tuple, sel::DD.Selector) = Tuple(_coords_to_ords(c, d, sel) for (c, d) in zip(coords, dims))
+_coords_to_ords(coords::NamedTuple, dims::Tuple, sel::DD.Selector) = _coords_to_ords(map(x -> coords[x], DD.name(dims)), dims, sel)
+
+# Determine the ordinality of a set of regularly spaced numerical coordinates
+function _coords_to_ords(
+    coords::AbstractVector{<:Real},
+    dim::Dimension,
+    ::DimensionalData.Near,
+    position::DimensionalData.Position,
+    span::DimensionalData.Regular)
+    step = DD.step(span)
+    float_ords = ((coords .- first(dim)) ./ step) .+ 1
+    int_ords = _round_ords(float_ords, position)
+    return clamp!(int_ords, 1, length(dim))
+end
+
+# Determine the ordinality of a set of categorical or irregular coordinates
+function _coords_to_ords(
+    coords::AbstractVector,
+    dim::Dimension,
+    sel::DimensionalData.Selector,
+    ::DimensionalData.Position,
+    ::DimensionalData.Span)
+    return map(c -> DimensionalData.selectindices(dim, rebuild(sel, c)), coords)
+end
+
+# Round coordinate ordinality to the appropriate integer given the specified locus
+_round_ords(ords::AbstractVector{<:Real}, ::DimensionalData.Start) = floor.(Int, ords)
+_round_ords(ords::AbstractVector{<:Real}, ::DimensionalData.Center) = round.(Int, ords)
+_round_ords(ords::AbstractVector{<:Real}, ::DimensionalData.End) = ceil.(Int, ords)
+
+# Extract dimension value from the given vector of coordinates
+_dim_vals(coords::AbstractVector, precision::Int) = _unique_vals(coords, precision)
+_dim_vals(coords::AbstractVector, ::DD.Order, precision::Int) = _unique_vals(coords, precision)
+_dim_vals(coords::AbstractVector, ::DD.ForwardOrdered, precision::Int) = sort!(_unique_vals(coords, precision))
+_dim_vals(coords::AbstractVector, ::DD.ReverseOrdered, precision::Int) = sort!(_unique_vals(coords, precision), rev=true)
+
+# Extract all unique coordinates from the given vector
+_unique_vals(coords::AbstractVector, ::Int) = unique(coords)
+_unique_vals(coords::AbstractVector{<:Real}, precision::Int) = round.(coords, digits=precision) |> unique
+
+# Determine if the given coordinates are forward ordered, reverse ordered, or unordered
+function _guess_dim_order(coords::AbstractVector)
+    try
+        if issorted(coords)
+            return DD.ForwardOrdered()
+        elseif issorted(coords, rev=true)
+            return DD.ReverseOrdered()
+        else
+            return DD.Unordered()
+        end
+    catch
+        return DD.Unordered()
+    end
+end
+
+# Estimate the span between consecutive coordinates
+_guess_dim_span(::AbstractVector, ::DD.Order, ::Int) = DD.Irregular()
+function _guess_dim_span(coords::AbstractVector{<:Real}, ::DD.Ordered, precision::Int)
+    length(coords) < 2 && return DD.Irregular()  # a single coordinate has no step to measure
+    steps = round.((@view coords[2:end]) .- (@view coords[1:end-1]), digits=precision)
+    span = argmin(abs, steps)
+    return all(isinteger, round.(steps ./ span, digits=precision)) ? DD.Regular(span) : DD.Irregular()
+end
+function _guess_dim_span(coords::AbstractVector{<:Dates.AbstractTime}, ::DD.Ordered, precision::Int)
+    length(coords) < 2 && return DD.Irregular()  # a single coordinate has no step to measure
+    steps = (@view coords[2:end]) .- (@view coords[1:end-1])
+    span = argmin(abs, steps)
+    return all(isinteger, round.(steps ./ span, digits=precision)) ? DD.Regular(span) : DD.Irregular()
+end
+
+function _build_dim(vals::AbstractVector, dim::Symbol, order::DD.Order, ::DD.Span)
+    return rebuild(name2dim(dim), DD.Categorical(vals, order=order))
+end
+function _build_dim(vals::AbstractVector{<:Union{Number,Dates.AbstractTime}}, dim::Symbol, order::DD.Order, span::DD.Irregular)
+    return rebuild(name2dim(dim), DD.Sampled(vals, order=order, span=span, sampling=DD.Points()))
+end
+function _build_dim(vals::AbstractVector{<:Union{Number,Dates.AbstractTime}}, dim::Symbol, order::DD.Order, span::DD.Regular)
+    n = round(Int, abs((last(vals) - first(vals)) / span.step) + 1)
+    dim_vals = StepRangeLen(first(vals), span.step, n)
+    return rebuild(name2dim(dim), DD.Sampled(dim_vals, order=order, span=span, sampling=DD.Points()))
+end
+
+# Determine the index from a tuple of coordinate orders
+function _ords_to_indices(ords, dims)
+    stride = 1
+    indices = ones(Int, length(ords[1]))
+    for (ord, dim) in zip(ords, dims)
+        indices .+= (ord .- 1) .* stride
+        stride *= length(dim)
+    end
+    return indices
+end
+
+_cast_missing(::AbstractArray, missingval::Missing) = missing
+function _cast_missing(::AbstractArray{T}, missingval) where {T}
+    try
+        return convert(T, missingval)
+    catch
+        return missingval
+    end
+end
diff --git a/test/tables.jl b/test/tables.jl
index b5bd416ea..23ea9eed5 100644
--- a/test/tables.jl
+++ b/test/tables.jl
@@ -1,4 +1,4 @@
-using DimensionalData, IteratorInterfaceExtensions, TableTraits, Tables, Test, DataFrames
+using DimensionalData, IteratorInterfaceExtensions, TableTraits, Tables, Test, DataFrames, Random
 using DimensionalData.Lookups, DimensionalData.Dimensions
 using DimensionalData: DimTable, DimExtensionArray
 
@@ -154,3 +154,48 @@ end
     @test Tables.columnnames(t3) == (:dimensions, :layer1, :layer2, :layer3)
     @test Tables.columnnames(t4) == (:band, :geometry, :value)
 end
+
+@testset "Materialize from table" begin
+    a = DimArray(rand(UInt8, 100, 100), (X(100:-1:1), Y(-250:5:249)))
+    b = DimArray(rand(Float32, 100, 100), (X(100:-1:1), Y(-250:5:249)))
+    c = DimArray(rand(Float64, 100, 100), (X(100:-1:1), Y(-250:5:249)))
+    ds = DimStack((a=a, b=b, c=c))
+    t = DataFrame(ds)
+    t1 = Random.shuffle(t)
+    t2 = t[101:end,:]
+
+    # Restore DimArray from shuffled table
+    @test all(DimArray(t1, dims(ds)) .== a)
+    @test all(DimArray(t1, dims(ds), name="a") .== a)
+    @test all(DimArray(t1, dims(ds), name="b") .== b)
+    @test all(DimArray(t1, dims(ds), name="c") .== c)
+
+    # Restore DimArray from table with missing rows
+    @test all(DimArray(t2, dims(ds), name="a")[Y(2:100)] .== a[Y(2:100)])
+    @test all(DimArray(t2, dims(ds), name="b")[Y(2:100)] .== b[Y(2:100)])
+    @test all(DimArray(t2, dims(ds), name="c")[Y(2:100)] .== c[Y(2:100)])
+    @test DimArray(t2, dims(ds), name="a")[Y(1)] .|> ismissing |> all
+    @test DimArray(t2, dims(ds), name="b")[Y(1)] .|> ismissing |> all
+    @test DimArray(t2, dims(ds), name="c")[Y(1)] .|> ismissing |> all
+    @test DimArray(t2, dims(ds), name="a")[Y(2:100)] .|> ismissing .|> (!) |> all
+    @test DimArray(t2, dims(ds), name="b")[Y(2:100)] .|> ismissing .|> (!) |> all
+    @test DimArray(t2, dims(ds), name="c")[Y(2:100)] .|> ismissing .|> (!) |> all
+
+    # Restore DimStack from shuffled table
+    restored_stack = DimStack(t1, dims(ds))
+    @test all(restored_stack.a .== ds.a)
+    @test all(restored_stack.b .== ds.b)
+    @test all(restored_stack.c .== ds.c)
+
+    # Restore DimStack from table with missing rows
+    restored_stack = DimStack(t2, dims(ds))
+    @test all(restored_stack.a[Y(2:100)] .== ds.a[Y(2:100)])
+    @test all(restored_stack.b[Y(2:100)] .== ds.b[Y(2:100)])
+    @test all(restored_stack.c[Y(2:100)] .== ds.c[Y(2:100)])
+    @test restored_stack.a[Y(1)] .|> ismissing |> all
+    @test restored_stack.b[Y(1)] .|> ismissing |> all
+    @test restored_stack.c[Y(1)] .|> ismissing |> all
+    @test restored_stack.a[Y(2:100)] .|> ismissing .|> (!) |> all
+    @test restored_stack.b[Y(2:100)] .|> ismissing .|> (!) |> all
+    @test restored_stack.c[Y(2:100)] .|> ismissing .|> (!) |> all
+end