Skip to content

Commit

Permalink
Plugged binary layout
Browse files Browse the repository at this point in the history
  • Loading branch information
JohanMabille committed Dec 17, 2024
1 parent 1998e0a commit 36d448b
Show file tree
Hide file tree
Showing 11 changed files with 752 additions and 37 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,9 @@ namespace sparrow
case data_type::LIST:
case data_type::STRUCT:
case data_type::STRING:
case data_type::LARGE_STRING:
case data_type::BINARY:
case data_type::LARGE_BINARY:
case data_type::FIXED_WIDTH_BINARY:
case data_type::LARGE_LIST:
case data_type::LIST_VIEW:
Expand Down
6 changes: 6 additions & 0 deletions include/sparrow/layout/dispatch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,12 @@ namespace sparrow
return func(unwrap_array<primitive_array<float64_t>>(ar));
case data_type::STRING:
return func(unwrap_array<string_array>(ar));
case data_type::LARGE_STRING:
return func(unwrap_array<big_string_array>(ar));
case data_type::BINARY:
return func(unwrap_array<binary_array>(ar));
case data_type::LARGE_BINARY:
return func(unwrap_array<big_binary_array>(ar));
case data_type::RUN_ENCODED:
return func(unwrap_array<run_end_encoded_array>(ar));
case data_type::LIST:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "sparrow/layout/layout_utils.hpp"
#include "sparrow/layout/variable_size_binary_layout/variable_size_binary_iterator.hpp"
#include "sparrow/layout/variable_size_binary_layout/variable_size_binary_reference.hpp"
#include "sparrow/types/data_traits.hpp"
#include "sparrow/utils/repeat_container.hpp"

namespace sparrow
Expand Down Expand Up @@ -57,7 +58,7 @@ namespace sparrow
};

template <>
struct variable_size_binary_format<std::vector<std::byte>, std::int32_t>
struct variable_size_binary_format<std::vector<byte_t>, std::int32_t>
{
static std::string format()
{
Expand All @@ -66,7 +67,7 @@ namespace sparrow
};

template <>
struct variable_size_binary_format<std::vector<std::byte>, std::int64_t>
struct variable_size_binary_format<std::vector<byte_t>, std::int64_t>
{
static std::string format()
{
Expand All @@ -78,8 +79,54 @@ namespace sparrow
// Forward declaration of the variable-size binary layout so that the aliases
// below can name it before the class itself is defined.
template <std::ranges::sized_range T, class CR, layout_offset OT>
class variable_size_binary_array_impl;

using string_array = variable_size_binary_array_impl<std::string, std::string_view, std::int32_t>;
// Traits describing how binary values (std::vector<byte_t>) are stored and referenced.
using binary_traits = arrow_traits<std::vector<byte_t>>;

// Arrays of std::string values, using std::int32_t / std::int64_t offsets respectively.
using string_array = variable_size_binary_array_impl<std::string, std::string_view, std::int32_t>;
using big_string_array = variable_size_binary_array_impl<std::string, std::string_view, std::int64_t>;
// Arrays of binary values, using std::int32_t / std::int64_t offsets respectively.
using binary_array = variable_size_binary_array_impl<binary_traits::value_type, binary_traits::const_reference, std::int32_t>;
using big_binary_array = variable_size_binary_array_impl<binary_traits::value_type, binary_traits::const_reference, std::int64_t>;

namespace detail
{
    /**
     * Associates a variable-size binary array type with its data_type
     * enumerator. The primary template is intentionally left undefined:
     * only the array types specialized below are supported.
     */
    template <typename Array>
    struct get_data_type_from_array;

    /// string_array maps to data_type::STRING.
    template <>
    struct get_data_type_from_array<sparrow::string_array>
    {
        static constexpr sparrow::data_type get()
        {
            return sparrow::data_type::STRING;
        }
    };

    /// big_string_array maps to data_type::LARGE_STRING.
    template <>
    struct get_data_type_from_array<sparrow::big_string_array>
    {
        static constexpr sparrow::data_type get()
        {
            return sparrow::data_type::LARGE_STRING;
        }
    };

    /// binary_array maps to data_type::BINARY.
    template <>
    struct get_data_type_from_array<sparrow::binary_array>
    {
        static constexpr sparrow::data_type get()
        {
            return sparrow::data_type::BINARY;
        }
    };

    /// big_binary_array maps to data_type::LARGE_BINARY.
    template <>
    struct get_data_type_from_array<sparrow::big_binary_array>
    {
        static constexpr sparrow::data_type get()
        {
            return sparrow::data_type::LARGE_BINARY;
        }
    };
}

/**
* Checks whether T is a string_array type.
Expand All @@ -92,6 +139,18 @@ namespace sparrow
*/
template <class T>
constexpr bool is_big_string_array_v = std::same_as<T, big_string_array>;

/**
 * Compile-time flag: true only when T is exactly the binary_array type.
 */
template <typename T>
constexpr bool is_binary_array_v = std::same_as<T, binary_array>;

/**
 * Compile-time flag: true only when T is exactly the big_binary_array type.
 */
template <typename T>
constexpr bool is_big_binary_array_v = std::same_as<T, big_binary_array>;

template <std::ranges::sized_range T, class CR, layout_offset OT>
struct array_inner_types<variable_size_binary_array_impl<T, CR, OT>> : array_inner_types_base
Expand Down Expand Up @@ -145,6 +204,10 @@ namespace sparrow
class variable_size_binary_array_impl final
: public mutable_array_bitmap_base<variable_size_binary_array_impl<T, CR, OT>>
{
private:

static_assert(sizeof(std::ranges::range_value_t<T>) == sizeof(std::uint8_t),
"Only sequences of types with the same size as uint8_t are supported");
public:

using self_type = variable_size_binary_array_impl<T, CR, OT>;
Expand Down Expand Up @@ -424,13 +487,26 @@ namespace sparrow
const auto shift_val_abs = static_cast<size_type>(std::abs(shift_byte_count));
const auto new_data_buffer_size = shift_byte_count < 0 ? data_buffer.size() - shift_val_abs
: data_buffer.size() + shift_val_abs;
data_buffer.resize(new_data_buffer_size);
// Move elements to make space for the new value
std::move_backward(
data_buffer.begin() + offset_end,
data_buffer.end() - shift_byte_count,
data_buffer.end()
);

if (shift_byte_count > 0)
{
data_buffer.resize(new_data_buffer_size);
// Move elements to make space for the new value
std::move_backward(
data_buffer.begin() + offset_end,
data_buffer.end() - shift_byte_count,
data_buffer.end()
);
}
else
{
std::move(
data_buffer.begin() + offset_end,
data_buffer.end(),
data_buffer.begin() + offset_end + shift_byte_count
);
data_buffer.resize(new_data_buffer_size);
}
// Adjust offsets for subsequent elements
std::for_each(
offset(index + 1),
Expand All @@ -441,8 +517,9 @@ namespace sparrow
}
);
}
auto tmp = std::views::transform(rhs, [](const auto& val) { return static_cast<std::uint8_t>(val); });
// Copy the new value into the buffer
std::copy(std::ranges::begin(rhs), std::ranges::end(rhs), data_buffer.begin() + offset_beg);
std::copy(std::ranges::begin(tmp), std::ranges::end(tmp), data_buffer.begin() + offset_beg);
}

template <std::ranges::sized_range T, class CR, layout_offset OT>
Expand Down
4 changes: 3 additions & 1 deletion include/sparrow/types/data_traits.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include "sparrow/types/data_type.hpp"
#include "sparrow/utils/nullable.hpp"
#include "sparrow/utils/vector_view.hpp"

namespace sparrow
{
Expand Down Expand Up @@ -57,8 +58,9 @@ namespace sparrow
/**
 * Traits for binary values held as std::vector<byte_t>.
 *
 * type_id is data_type::BINARY; the stale data_type::STRING definition is
 * removed because a second definition of type_id is ill-formed.
 * const_reference is a non-owning vector_view over constant bytes.
 */
template <>
struct arrow_traits<std::vector<byte_t>>
{
    static constexpr data_type type_id = data_type::BINARY;
    using value_type = std::vector<byte_t>;
    using const_reference = vector_view<const byte_t>;
};

template <>
Expand Down
54 changes: 49 additions & 5 deletions include/sparrow/types/data_type.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,10 @@ namespace sparrow
DOUBLE = 12,
// UTF8 variable-length string
STRING = 13,
LARGE_STRING = 14,
// Variable-length bytes (no guarantee of UTF8-ness)
BINARY = 14,
BINARY = 15,
LARGE_BINARY = 16,
// Number of nanoseconds since the UNIX epoch with an optional timezone.
// See: https://arrow.apache.org/docs/python/timestamps.html#timestamps
TIMESTAMP = 18,
Expand Down Expand Up @@ -221,11 +223,13 @@ namespace sparrow
case 'g':
return data_type::DOUBLE;
case 'u':
case 'U': // large string
return data_type::STRING;
case 'z': // binary
case 'Z': // large binary
case 'U':
return data_type::LARGE_STRING;
case 'z':
return data_type::BINARY;
case 'Z':
return data_type::LARGE_BINARY;
default:
return data_type::NA;
}
Expand Down Expand Up @@ -414,8 +418,12 @@ namespace sparrow
return "g";
case data_type::STRING:
return "u";
case data_type::LARGE_STRING:
return "U";
case data_type::BINARY:
return "z";
case data_type::LARGE_BINARY:
    // "Z" is the format string for large binary (see the 'Z' case of the parsing switch).
    return "Z";
case data_type::TIMESTAMP:
return "tDm";
case data_type::LIST:
Expand Down Expand Up @@ -470,6 +478,42 @@ namespace sparrow
}
}

/// Computes the number of bytes required to store `size` values of the
/// provided primitive data type.
///
/// Boolean values are counted as bit-packed: eight values per byte, rounded
/// up to a whole byte. Every other supported type takes `sizeof` of its
/// storage type per value.
///
/// @tparam T Integral type of the element count.
/// @param data_type The primitive type of the stored values (asserted with
///        data_type_is_primitive).
/// @param size The number of values; must not be negative.
/// @returns The number of bytes required to store the values.
/// @throws std::runtime_error if the data type is not handled below.
template <std::integral T>
constexpr std::size_t primitive_bytes_count(data_type data_type, T size)
{
    SPARROW_ASSERT_TRUE(data_type_is_primitive(data_type));
    if constexpr (std::signed_integral<T>)
    {
        // A negative count would wrap to a huge unsigned value below.
        SPARROW_ASSERT_TRUE(size >= 0);
    }
    // TODO: Replace static_cast<std::size_t> by the 32 bit fix check function
    const auto count = static_cast<std::size_t>(size);
    constexpr std::size_t bits_per_byte = 8;
    switch (data_type)
    {
        case data_type::BOOL:
            // Integer ceiling division: exact for every count, cannot
            // overflow, and usable in constant expressions (std::ceil on a
            // double is neither exact above 2^53 nor constexpr before C++23).
            return count / bits_per_byte + (count % bits_per_byte != 0 ? 1 : 0);
        case data_type::UINT8:
        case data_type::INT8:
            return count;
        case data_type::UINT16:
            return sizeof(std::uint16_t) * count;
        case data_type::INT16:
            return sizeof(std::int16_t) * count;
        case data_type::UINT32:
            return sizeof(std::uint32_t) * count;
        case data_type::INT32:
            return sizeof(std::int32_t) * count;
        case data_type::UINT64:
            return sizeof(std::uint64_t) * count;
        case data_type::INT64:
            return sizeof(std::int64_t) * count;
        case data_type::HALF_FLOAT:
            return sizeof(float16_t) * count;
        case data_type::FLOAT:
            return sizeof(float32_t) * count;
        case data_type::DOUBLE:
            return sizeof(float64_t) * count;
        default:
            throw std::runtime_error("Unsupported data type");
    }
}

class list_value;
class struct_value;
Expand All @@ -491,7 +535,7 @@ namespace sparrow
float32_t,
float64_t,
std::string,
// std::vector<byte_t>,
std::vector<byte_t>,
sparrow::timestamp,
// TODO: add missing fundamental types here
list_value,
Expand Down
70 changes: 70 additions & 0 deletions include/sparrow/utils/vector_view.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
// Man Group Operations Limited
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <algorithm>
#include <compare>
#include <ranges>
#include <span>
#include <type_traits>
#include <vector>

namespace sparrow
{
/**
* The class vector_view describes an object that can refer to a constant contiguous
* sequence of T with the first element of the sequence at position zero. It is similar
* to string_view, but for arbitrary T. You can consider it as a span or range supporting
* const operations only, and comparison operators.
*/
template <class T>
class vector_view : public std::span<T>
{
public:

using base_type = std::span<T>;
using value_type = typename base_type::value_type;

using base_type::base_type;

explicit operator std::vector<value_type>() const noexcept
{
return std::vector<value_type>(this->begin(), this->end());
}
};

template <class T>
constexpr bool operator==(const vector_view<T>& lhs, const vector_view<T>& rhs)
{
return std::ranges::equal(lhs, rhs);
}

template <class T>
constexpr bool operator==(const vector_view<T>& lhs, const std::vector<std::decay_t<T>>& rhs)
{
return std::ranges::equal(lhs, rhs);
}

template <class T>
constexpr std::compare_three_way_result<T>
operator<=>(const vector_view<T>& lhs, const vector_view<T>& rhs)
{
return std::lexicographical_compare_three_way(lhs.begin(), lhs.end(), rhs.begin(), rhs.end());
}

template <class T>
constexpr std::compare_three_way_result<T>
operator<=>(const vector_view<T>& lhs, const std::vector<std::decay_t<T>>& rhs)
{
return std::lexicographical_compare_three_way(lhs.begin(), lhs.end(), rhs.begin(), rhs.end());
}
}
6 changes: 6 additions & 0 deletions src/array_factory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,12 @@ namespace sparrow
return detail::make_wrapper_ptr<struct_array>(std::move(proxy));
case data_type::STRING:
return detail::make_wrapper_ptr<string_array>(std::move(proxy));
case data_type::LARGE_STRING:
return detail::make_wrapper_ptr<big_string_array>(std::move(proxy));
case data_type::BINARY:
return detail::make_wrapper_ptr<binary_array>(std::move(proxy));
case data_type::LARGE_BINARY:
return detail::make_wrapper_ptr<big_binary_array>(std::move(proxy));
case data_type::RUN_ENCODED:
return detail::make_wrapper_ptr<run_end_encoded_array>(std::move(proxy));
case data_type::DENSE_UNION:
Expand Down
3 changes: 2 additions & 1 deletion test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ else()
test_arrow_array_schema_utils.cpp
test_arrow_array.cpp
test_arrow_schema.cpp
test_binary_array.cpp
test_bit.cpp
test_buffer_adaptor.cpp
test_buffer.cpp
Expand All @@ -78,13 +79,13 @@ else()
test_record_batch.cpp
test_repeat_container.cpp
test_run_end_encoded_array.cpp
test_string_array.cpp
test_struct_array.cpp
test_traits.cpp
test_union_array.cpp
test_utils_buffers.cpp
test_utils_offsets.cpp
test_utils.hpp
test_variable_size_binary_array.cpp
test_variable_size_binary_view_array.cpp
test_nested_comperators.cpp
test_builder_utils.cpp
Expand Down
Loading

0 comments on commit 36d448b

Please sign in to comment.