From abfbf2d0f49e6777ef70a44f4b8143110346eeb8 Mon Sep 17 00:00:00 2001 From: JoshuaBillson <61667893+JoshuaBillson@users.noreply.github.com> Date: Tue, 19 Sep 2023 10:53:39 -0600 Subject: [PATCH] Overhaul DimTable (#536) * Add initial implementation of `mergedims` * Use values instead of layers * Simplify base method * Perform all checks in main method * Simplify mergedims for arrays * Handle case where different numbers of dims provided * Add mergedims to docs * Implemented WideDimTable * Removed Dead Code * mergedims now allows selection of specific dimensions * Dropped MergedDimColumn * cleaned up code * More cleanup * Added comparedims when constructing a DimTable from multiple AbstractDimArrays * mergedims can now accept dimensions that are not present in the stack/array * Implemented unmergedims * Removed unnecessary argument from undergedims * DimTable now preserves name of AbstractDimArray if present. * Added parent field to DimTable * Passed test cases * Added test cases and updated docs * Updated docs for DimTable * Updated docs and test cases * Removed Any[] --------- Co-authored-by: Seth Axen --- docs/src/reference.md | 2 + src/DimensionalData.jl | 2 +- src/Dimensions/primitives.jl | 1 + src/array/array.jl | 117 +++++++++++++++++++++++++ src/stack/stack.jl | 11 +++ src/tables.jl | 162 ++++++++++++++++++++++++----------- test/merged.jl | 24 ++++++ test/tables.jl | 23 +++++ 8 files changed, 289 insertions(+), 53 deletions(-) diff --git a/docs/src/reference.md b/docs/src/reference.md index 98cf1e167..fe0d3dbda 100644 --- a/docs/src/reference.md +++ b/docs/src/reference.md @@ -50,6 +50,8 @@ rebuild modify broadcast_dims broadcast_dims! +mergedims +unmergedims reorder Base.cat Base.map diff --git a/src/DimensionalData.jl b/src/DimensionalData.jl index 1600a41f6..89506640f 100644 --- a/src/DimensionalData.jl +++ b/src/DimensionalData.jl @@ -66,7 +66,7 @@ export dims, refdims, metadata, name, lookup, bounds export dimnum, hasdim, hasselection, otherdims # utils -export set, rebuild, reorder, modify, broadcast_dims, broadcast_dims! +export set, rebuild, reorder, modify, broadcast_dims, broadcast_dims!, mergedims, unmergedims const DD = DimensionalData diff --git a/src/Dimensions/primitives.jl b/src/Dimensions/primitives.jl index 44069442b..ba16c6d3a 100644 --- a/src/Dimensions/primitives.jl +++ b/src/Dimensions/primitives.jl @@ -12,6 +12,7 @@ or are at least rotations/transformations of the same type. """ @inline dimsmatch(dims, query) = dimsmatch(<:, dims, query) @inline function dimsmatch(f::Function, dims::Tuple, query::Tuple) + length(dims) == length(query) || return false all(map((d, l) -> dimsmatch(f, d, l), dims, query)) end @inline dimsmatch(f::Function, dim, query) = dimsmatch(f, typeof(dim), typeof(query)) diff --git a/src/array/array.jl b/src/array/array.jl index c2598fda6..db1d36101 100644 --- a/src/array/array.jl +++ b/src/array/array.jl @@ -576,3 +576,120 @@ end # Thed default constructor is DimArray dimconstructor(dims::DimTuple) = dimconstructor(tail(dims)) dimconstructor(dims::Tuple{}) = DimArray + +""" + mergedims(old_dims => new_dim) => Dimension + +Return a dimension `new_dim` whose indices are a [`MergedLookup`](@ref) of the indices of +`old_dims`. +""" +function mergedims((old_dims, new_dim)::Pair) + data = vec(DimPoints(_astuple(old_dims))) + return rebuild(basedims(new_dim), MergedLookup(data, old_dims)) +end + +""" + mergedims(dims, old_dims => new_dim, others::Pair...) => dims_new + +If dimensions `old_dims`, `new_dim`, etc. are found in `dims`, then return new `dims_new` +where all dims in `old_dims` have been combined into a single dim `new_dim`. +The returned dimension will keep only the name of `new_dim`. Its coords will be a +[`MergedLookup`](@ref) of the coords of the dims in `old_dims`. New dimensions are always +placed at the end of `dims_new`. `others` contains other dimension pairs to be merged. + +# Example +````jldoctest +julia> ds = (X(0:0.1:0.4), Y(10:10:100), Ti([0, 3, 4])); +julia> mergedims(ds, Ti => :time, (X, Y) => :space) +Dim{:time} MergedLookup{Tuple{Int64}} Tuple{Int64}[(0,), (3,), (4,)] Ti, +Dim{:space} MergedLookup{Tuple{Float64, Int64}} Tuple{Float64, Int64}[(0.0, 10), (0.1, 10), …, (0.3, 100), (0.4, 100)] X, Y +```` +""" +function mergedims(all_dims, dim_pairs::Pair...) + # filter out dims completely missing + dim_pairs = map(x -> _filter_dims(all_dims, first(x)) => last(x), dim_pairs) + dim_pairs_complete = filter(dim_pairs) do (old_dims,) + dims_present = dims(all_dims, _astuple(old_dims)) + isempty(dims_present) && return false + all(hasdim(dims_present, old_dims)) || throw(ArgumentError( + "Not all dimensions $old_dims found in $(map(basetypeof, all_dims))" + )) + return true + end + isempty(dim_pairs_complete) && return all_dims + dim_pairs_concrete = map(dim_pairs_complete) do (old_dims, new_dim) + return dims(all_dims, _astuple(old_dims)) => new_dim + end + # throw error if old dim groups overlap + old_dims_tuples = map(first, dim_pairs_concrete) + if !dimsmatch(_cat_tuples(old_dims_tuples...), combinedims(old_dims_tuples...)) + throw(ArgumentError("Dimensions to be merged are not all unique")) + end + return _mergedims(all_dims, dim_pairs_concrete...) +end + +""" + mergedims(A::AbstractDimArray, dim_pairs::Pair...) => AbstractDimArray + mergedims(A::AbstractDimStack, dim_pairs::Pair...) => AbstractDimStack + +Return a new array or stack whose dimensions are the result of [`mergedims(dims(A), dim_pairs)`](@ref). +""" +function mergedims(A::AbstractDimArray, dim_pairs::Pair...) + isempty(dim_pairs) && return A + all_dims = dims(A) + dims_new = mergedims(all_dims, dim_pairs...) + dimsmatch(all_dims, dims_new) && return A + dims_perm = _unmergedims(dims_new, map(last, dim_pairs)) + Aperm = PermutedDimsArray(A, dims_perm) + data_merged = reshape(data(Aperm), map(length, dims_new)) + rebuild(A, data_merged, dims_new) +end + +""" + unmergedims(merged_dims::Tuple{Vararg{Dimension}}) => Tuple{Vararg{Dimension}} + +Return the unmerged dimensions from a tuple of merged dimensions. However, the order of the original dimensions are not necessarily preserved. +""" +function unmergedims(merged_dims::Tuple{Vararg{Dimension}}) + reduce(map(dims, merged_dims), init=()) do acc, x + x isa Tuple ? (acc..., x...) : (acc..., x) + end +end + +""" + unmergedims(A::AbstractDimArray, original_dims) => AbstractDimArray + unmergedims(A::AbstractDimStack, original_dims) => AbstractDimStack + +Return a new array or stack whose dimensions are restored to their original prior to calling [`mergedims(A, dim_pairs)`](@ref). +""" +function unmergedims(A::AbstractDimArray, original_dims) + merged_dims = dims(A) + unmerged_dims = unmergedims(merged_dims) + reshaped = reshape(data(A), size(unmerged_dims)) + permuted = permutedims(reshaped, dimnum(unmerged_dims, original_dims)) + return DimArray(permuted, original_dims) +end + +function _mergedims(all_dims, dim_pair::Pair, dim_pairs::Pair...) + old_dims, new_dim = dim_pair + dims_to_merge = dims(all_dims, _astuple(old_dims)) + merged_dim = mergedims(dims_to_merge => new_dim) + all_dims_new = (otherdims(all_dims, dims_to_merge)..., merged_dim) + isempty(dim_pairs) && return all_dims_new + return _mergedims(all_dims_new, dim_pairs...) +end + +function _unmergedims(all_dims, merged_dims) + _merged_dims = dims(all_dims, merged_dims) + unmerged_dims = map(all_dims) do d + hasdim(_merged_dims, d) || return _astuple(d) + return dims(lookup(d)) + end + return _cat_tuples(unmerged_dims...) +end + +_unmergedims(all_dims, dim_pairs::Pair...) = _cat_tuples(replace(all_dims, dim_pairs...)) + +_cat_tuples(tuples...) = mapreduce(_astuple, (x, y) -> (x..., y...), tuples) + +_filter_dims(alldims, dims) = filter(dim -> hasdim(alldims, dim), dims) \ No newline at end of file diff --git a/src/stack/stack.jl b/src/stack/stack.jl index bb0dfbe5e..815938bac 100644 --- a/src/stack/stack.jl +++ b/src/stack/stack.jl @@ -157,6 +157,17 @@ for func in (:index, :lookup, :metadata, :sampling, :span, :bounds, :locus, :ord @eval ($func)(s::AbstractDimStack, args...) = ($func)(dims(s), args...) end +function mergedims(ds::AbstractDimStack, dim_pairs::Pair...) + isempty(dim_pairs) && return ds + vals = map(A -> mergedims(A, dim_pairs...), values(ds)) + rebuild_from_arrays(ds, vals) +end + +function unmergedims(s::AbstractDimStack, original_dims) + return map(A -> unmergedims(A, original_dims), s) +end + + """ DimStack <: AbstractDimStack diff --git a/src/tables.jl b/src/tables.jl index 036357889..9aa84d3d8 100644 --- a/src/tables.jl +++ b/src/tables.jl @@ -1,4 +1,11 @@ -# Tables.jl interface +""" + AbstractDimTable <: Tables.AbstractColumns + +Abstract supertype for dim tables +""" +abstract type AbstractDimTable <: Tables.AbstractColumns end + +# Tables.jl interface for AbstractDimStack and AbstractDimArray DimTableSources = Union{AbstractDimStack,AbstractDimArray} @@ -20,6 +27,16 @@ Tables.schema(s::AbstractDimStack) = Tables.schema(DimTable(s)) @inline Tables.getcolumn(t::DimTableSources, dim::DimOrDimType) = Tables.getcolumn(t, dimnum(t, dim)) +function _colnames(s::AbstractDimStack) + dimkeys = map(dim2key, (dims(s))) + # The data is always the last column/s + (dimkeys..., keys(s)...) +end + + +# DimColumn + + """ DimColumn{T,D<:Dimension} <: AbstractVector{T} @@ -56,8 +73,6 @@ end dim(c::DimColumn) = getfield(c, :dim) dimstride(c::DimColumn) = getfield(c, :dimstride) -# Simple Array interface - Base.length(c::DimColumn) = getfield(c, :length) @inline function Base.getindex(c::DimColumn, i::Int) Base.@boundscheck checkbounds(c, i) @@ -70,6 +85,10 @@ Base.axes(c::DimColumn) = (Base.OneTo(length(c)),) Base.vec(c::DimColumn{T}) where T = [c[i] for i in eachindex(c)] Base.Array(c::DimColumn) = vec(c) + +# DimArrayColumn + + struct DimArrayColumn{T,A<:AbstractDimArray{T},DS,DL,L} <: AbstractVector{T} data::A dimstrides::DS @@ -89,8 +108,6 @@ Base.parent(c::DimArrayColumn) = getfield(c, :data) dimstrides(c::DimArrayColumn) = getfield(c, :dimstrides) dimlengths(c::DimArrayColumn) = getfield(c, :dimlengths) -# Simple Array interface - Base.length(c::DimArrayColumn) = getfield(c, :length) @inline function Base.getindex(c::DimArrayColumn, i::Int) Base.@boundscheck checkbounds(c, i) @@ -107,21 +124,20 @@ Base.axes(c::DimArrayColumn) = (Base.OneTo(length(c)),) Base.vec(c::DimArrayColumn{T}) where T = [c[i] for i in eachindex(c)] Base.Array(c::DimArrayColumn) = vec(c) -""" - AbstractDimTable <: Tables.AbstractColumns -Abstract supertype for dim tables -""" -abstract type AbstractDimTable <: Tables.AbstractColumns end +# DimTable + """ DimTable <: AbstractDimTable - DimTable(A::AbstractDimArray) + DimTable(s::AbstractDimStack; mergedims=nothing) + DimTable(x::AbstractDimArray; layersfrom=nothing, mergedims=nothing) + DimTable(xs::Vararg{AbstractDimArray}; layernames=nothing, mergedims=nothing) -Construct a Tables.jl/TableTraits.jl compatible object out of an `AbstractDimArray`. +Construct a Tables.jl/TableTraits.jl compatible object out of an `AbstractDimArray` or `AbstractDimStack`. -This table will have a column for the array data and columns for each +This table will have columns for the array data and columns for each `Dimension` index, as a [`DimColumn`]. These are lazy, and generated as required. @@ -131,31 +147,77 @@ column name `:Ti`, and `Dim{:custom}` becomes `:custom`. To get dimension columns, you can index with `Dimension` (`X()`) or `Dimension` type (`X`) as well as the regular `Int` or `Symbol`. + +# Keywords +* `mergedims`: Combine two or more dimensions into a new dimension. +* `layersfrom`: Treat a dimension of an `AbstractDimArray` as layers of an `AbstractDimStack`. + +# Example +```jldoctest +julia> a = DimArray(rand(32,32,3), (X,Y,Dim{:band})); + +julia> DimTable(a, layersfrom=Dim{:band}, mergedims=(X,Y)=>:geometry) +DimTable with 1024 rows, 4 columns, and schema: + :geometry Tuple{Int64, Int64} + :band_1 Float64 + :band_2 Float64 + :band_3 Float64 +``` """ -struct DimTable{Keys,S,DC,DAC} <: AbstractDimTable - stack::S - dimcolumns::DC - dimarraycolumns::DAC -end -DimTable{K}(stack::S, dimcolumns::DC, strides::SD) where {K,S,DC,SD} = - DimTable{K,S,DC,SD}(stack, dimcolumns, strides) -DimTable(A::AbstractDimArray, As::AbstractDimArray...) = DimTable((A, As...)) -function DimTable(As::Tuple{AbstractDimArray,Vararg{AbstractDimArray}}...) - DimTable(DimStack(As...)) -end -function DimTable(s::AbstractDimStack) +struct DimTable <: AbstractDimTable + parent::AbstractDimArray + colnames::Vector{Symbol} + dimcolumns::Vector{DimColumn} + dimarraycolumns::Vector{DimArrayColumn} +end + +function DimTable(s::AbstractDimStack; mergedims=nothing) + s = isnothing(mergedims) ? s : DimensionalData.mergedims(s, mergedims) dims_ = dims(s) dimcolumns = map(d -> DimColumn(d, dims_), dims_) dimarraycolumns = map(A -> DimArrayColumn(A, dims_), s) keys = _colnames(s) - DimTable{keys}(s, dimcolumns, dimarraycolumns) + return DimTable(first(s), collect(keys), collect(dimcolumns), collect(dimarraycolumns)) +end + +function DimTable(xs::Vararg{AbstractDimArray}; layernames=nothing, mergedims=nothing) + # Check that dims are compatible + comparedims(xs...) + + # Construct Layer Names + layernames = isnothing(layernames) ? [Symbol("layer_$i") for i in eachindex(xs)] : layernames + + # Construct DimColumns + xs = isnothing(mergedims) ? xs : map(x -> DimensionalData.mergedims(x, mergedims), xs) + dims_ = dims(first(xs)) + dimcolumns = collect(map(d -> DimColumn(d, dims_), dims_)) + dimnames = collect(map(dim2key, dims_)) + + # Construct DimArrayColumns + dimarraycolumns = collect(map(A -> DimArrayColumn(A, dims_), xs)) + + # Return DimTable + colnames = vcat(dimnames, layernames) + return DimTable(first(xs), colnames, dimcolumns, dimarraycolumns) +end + +function DimTable(x::AbstractDimArray; layersfrom=nothing, mergedims=nothing) + if !isnothing(layersfrom) && any(hasdim(x, layersfrom)) + nlayers = size(x, layersfrom) + layers = [(@view x[layersfrom(i)]) for i in 1:nlayers] + layernames = Symbol.(["$(dim2key(layersfrom))_$i" for i in 1:nlayers]) + return DimTable(layers..., layernames=layernames, mergedims=mergedims) + else + s = name(x) == NoName() ? DimStack((;value=x)) : DimStack(x) + return DimTable(s, mergedims=mergedims) + end end dimcolumns(t::DimTable) = getfield(t, :dimcolumns) dimarraycolumns(t::DimTable) = getfield(t, :dimarraycolumns) -dims(t::DimTable) = dims(parent(t)) +colnames(t::DimTable) = Tuple(getfield(t, :colnames)) -Base.parent(t::DimTable) = getfield(t, :stack) +Base.parent(t::DimTable) = getfield(t, :parent) for func in (:dims, :val, :index, :lookup, :metadata, :order, :sampling, :span, :bounds, :locus, :name, :label, :units) @@ -163,20 +225,18 @@ for func in (:dims, :val, :index, :lookup, :metadata, :order, :sampling, :span, end -# Tables interface - Tables.istable(::DimTable) = true Tables.columnaccess(::Type{<:DimTable}) = true Tables.columns(t::DimTable) = t -Tables.columnnames(c::DimTable{Keys}) where Keys = Keys -function Tables.schema(t::DimTable{Keys}) where Keys - s = parent(t) - types = (map(eltype, dims(s))..., map(eltype, parent(s))...) - Tables.Schema(Keys, types) +Tables.columnnames(c::DimTable) = colnames(c) + +function Tables.schema(t::DimTable) + types = vcat([map(eltype, dimcolumns(t))...], [map(eltype, dimarraycolumns(t))...]) + Tables.Schema(colnames(t), types) end -@inline function Tables.getcolumn(t::DimTable{Keys}, i::Int) where Keys - nkeys = length(Keys) +@inline function Tables.getcolumn(t::DimTable, i::Int) + nkeys = length(colnames(t)) if i > length(dims(t)) dimarraycolumns(t)[i - length(dims(t))] elseif i > 0 && i < nkeys @@ -185,38 +245,36 @@ end throw(ArgumentError("There is no table column $i")) end end + @inline function Tables.getcolumn(t::DimTable, dim::DimOrDimType) dimcolumns(t)[dimnum(t, dim)] end -# Retrieve a column by name -@inline function Tables.getcolumn(t::DimTable{Keys}, key::Symbol) where Keys - if key in keys(dimarraycolumns(t)) - dimarraycolumns(t)[key] + +@inline function Tables.getcolumn(t::DimTable, key::Symbol) + keys = colnames(t) + i = findfirst(==(key), keys) + if isnothing(i) + throw(ArgumentError("There is no table column $key")) else - dimcolumns(t)[dimnum(dims(t), key)] + return Tables.getcolumn(t, i) end end + @inline function Tables.getcolumn(t::DimTable, ::Type{T}, i::Int, key::Symbol) where T Tables.getcolumn(t, key) end -function _colnames(s::AbstractDimStack) - dimkeys = map(dim2key, (dims(s))) - # The data is always the last column/s - (dimkeys..., keys(s)...) -end - - # TableTraits.jl interface + function IteratorInterfaceExtensions.getiterator(x::DimTableSources) - return Tables.datavaluerows(Tables.columntable(x)) + return Tables.datavaluerows(Tables.dictcolumntable(x)) end IteratorInterfaceExtensions.isiterable(::DimTableSources) = true TableTraits.isiterabletable(::DimTableSources) = true function IteratorInterfaceExtensions.getiterator(t::DimTable) - return Tables.datavaluerows(Tables.columntable(t)) + return Tables.datavaluerows(Tables.dictcolumntable(t)) end IteratorInterfaceExtensions.isiterable(::DimTable) = true -TableTraits.isiterabletable(::DimTable) = true +TableTraits.isiterabletable(::DimTable) = true \ No newline at end of file diff --git a/test/merged.jl b/test/merged.jl index 5f00d3bb4..cdf18780d 100644 --- a/test/merged.jl +++ b/test/merged.jl @@ -64,3 +64,27 @@ end @test occursin("Y", sp) @test occursin("Z", sp) end + +@testset "unmerge" begin + a = DimArray(rand(32, 32, 3), (X,Y,Dim{:band})) + merged = mergedims(a, (X,Y)=>:geometry) + unmerged = unmergedims(merged, dims(a)) + perm_unmerged = unmergedims(permutedims(merged, (2,1)), dims(a)) + + # Test Merge + @test hasdim(merged, Dim{:band}) + @test hasdim(merged, Dim{:geometry}) + @test !hasdim(merged, X) + @test !hasdim(merged, Y) + @test size(merged) == (3, 32 * 32) + + # Test Unmerge + @test hasdim(unmerged, X) + @test hasdim(unmerged, Y) + @test hasdim(unmerged, Dim{:band}) + @test !hasdim(unmerged, Dim{:geometry}) + @test dims(unmerged) == dims(a) + @test size(unmerged) == size(a) + @test all(a .== unmerged) + @test all(a .== perm_unmerged) +end \ No newline at end of file diff --git a/test/tables.jl b/test/tables.jl index 9ee13be6e..651c72769 100644 --- a/test/tables.jl +++ b/test/tables.jl @@ -142,3 +142,26 @@ end @test Tables.columntable(a) == (a = [1],) @test Tables.columntable(ds) == (a = [1], b = [2]) end + +@testset "DimTable layersfrom" begin + a = DimArray(rand(32, 32, 5, 3), (X,Y,Dim{:band},Ti)) + t1 = DimTable(a) + t2 = DimTable(a, layersfrom=Dim{:band}) + @test Tables.columnnames(t1) == (:X, :Y, :band, :Ti, :value) + @test Tables.columnnames(t2) == (:X, :Y, :Ti, :band_1, :band_2, :band_3, :band_4, :band_5) + @test length(t1.X) == (32 * 32 * 5 * 3) + @test length(t2.X) == (32 * 32 * 3) +end + +@testset "DimTable mergelayers" begin + a = DimStack([DimArray(rand(32, 32, 3), (X,Y,Ti)) for _ in 1:3]) + b = DimArray(rand(32, 32, 3), (X,Y,Dim{:band})) + t1 = DimTable(a, mergedims=(:X,:Y)=>:geometry) + t2 = DimTable(a, mergedims=(:X,:Y,:Z)=>:geometry) # Merge missing dimension + t3 = DimTable(a, mergedims=(X,:Y,Ti)=>:dimensions) # Mix symbols and dimensions + t4 = DimTable(b, mergedims=(:X,:Y)=>:geometry) # Test DimArray + @test Tables.columnnames(t1) == (:Ti, :geometry, :layer1, :layer2, :layer3) + @test Tables.columnnames(t2) == (:Ti, :geometry, :layer1, :layer2, :layer3) + @test Tables.columnnames(t3) == (:dimensions, :layer1, :layer2, :layer3) + @test Tables.columnnames(t4) == (:band, :geometry, :value) +end \ No newline at end of file