Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Plugged binary layout #312

Merged
merged 3 commits into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,9 @@ namespace sparrow
case data_type::LIST:
case data_type::STRUCT:
case data_type::STRING:
case data_type::LARGE_STRING:
case data_type::BINARY:
case data_type::LARGE_BINARY:
case data_type::FIXED_WIDTH_BINARY:
case data_type::LARGE_LIST:
case data_type::LIST_VIEW:
Expand Down
6 changes: 6 additions & 0 deletions include/sparrow/layout/dispatch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,12 @@ namespace sparrow
return func(unwrap_array<primitive_array<float64_t>>(ar));
case data_type::STRING:
return func(unwrap_array<string_array>(ar));
case data_type::LARGE_STRING:
return func(unwrap_array<big_string_array>(ar));
case data_type::BINARY:
return func(unwrap_array<binary_array>(ar));
case data_type::LARGE_BINARY:
return func(unwrap_array<big_binary_array>(ar));
case data_type::RUN_ENCODED:
return func(unwrap_array<run_end_encoded_array>(ar));
case data_type::LIST:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "sparrow/layout/layout_utils.hpp"
#include "sparrow/layout/variable_size_binary_layout/variable_size_binary_iterator.hpp"
#include "sparrow/layout/variable_size_binary_layout/variable_size_binary_reference.hpp"
#include "sparrow/types/data_traits.hpp"
#include "sparrow/utils/repeat_container.hpp"

namespace sparrow
Expand Down Expand Up @@ -57,7 +58,7 @@ namespace sparrow
};

template <>
struct variable_size_binary_format<std::vector<std::byte>, std::int32_t>
struct variable_size_binary_format<std::vector<byte_t>, std::int32_t>
{
static std::string format()
{
Expand All @@ -66,7 +67,7 @@ namespace sparrow
};

template <>
struct variable_size_binary_format<std::vector<std::byte>, std::int64_t>
struct variable_size_binary_format<std::vector<byte_t>, std::int64_t>
{
static std::string format()
{
Expand All @@ -78,8 +79,54 @@ namespace sparrow
template <std::ranges::sized_range T, class CR, layout_offset OT>
class variable_size_binary_array_impl;

using string_array = variable_size_binary_array_impl<std::string, std::string_view, std::int32_t>;
// Traits of the binary value type std::vector<byte_t>; its value_type and
// const_reference members parameterize the binary array aliases below.
using binary_traits = arrow_traits<std::vector<byte_t>>;

// Concrete variable-size binary layouts. The template arguments are, in order,
// the value type, the const reference type and the offset type: the plain
// aliases use std::int32_t offsets, the "big" aliases use std::int64_t offsets.
using string_array = variable_size_binary_array_impl<std::string, std::string_view, std::int32_t>;
using big_string_array = variable_size_binary_array_impl<std::string, std::string_view, std::int64_t>;
using binary_array = variable_size_binary_array_impl<binary_traits::value_type, binary_traits::const_reference, std::int32_t>;
using big_binary_array = variable_size_binary_array_impl<binary_traits::value_type, binary_traits::const_reference, std::int64_t>;

namespace detail
{
    /**
     * Maps a variable-size binary array type to its sparrow::data_type.
     *
     * Only the explicit specializations below are defined; each one exposes a
     * static constexpr get() returning the enumerator tied to that array type.
     */
    template <class Array>
    struct get_data_type_from_array;

    /// string_array -> data_type::STRING
    template <>
    struct get_data_type_from_array<sparrow::string_array>
    {
        static constexpr auto get() -> sparrow::data_type
        {
            return sparrow::data_type::STRING;
        }
    };

    /// big_string_array -> data_type::LARGE_STRING
    template <>
    struct get_data_type_from_array<sparrow::big_string_array>
    {
        static constexpr auto get() -> sparrow::data_type
        {
            return sparrow::data_type::LARGE_STRING;
        }
    };

    /// binary_array -> data_type::BINARY
    template <>
    struct get_data_type_from_array<sparrow::binary_array>
    {
        static constexpr auto get() -> sparrow::data_type
        {
            return sparrow::data_type::BINARY;
        }
    };

    /// big_binary_array -> data_type::LARGE_BINARY
    template <>
    struct get_data_type_from_array<sparrow::big_binary_array>
    {
        static constexpr auto get() -> sparrow::data_type
        {
            return sparrow::data_type::LARGE_BINARY;
        }
    };
}

/**
* Checks whether T is a string_array type.
Expand All @@ -92,6 +139,18 @@ namespace sparrow
*/
template <class T>
constexpr bool is_big_string_array_v = std::same_as<T, big_string_array>;

/**
* Checks whether T is a binary_array type.
*/
template <class T>
constexpr bool is_binary_array_v = std::same_as<T, binary_array>;

/**
* Checks whether T is a big_binary_array type.
*/
template <class T>
constexpr bool is_big_binary_array_v = std::same_as<T, big_binary_array>;

template <std::ranges::sized_range T, class CR, layout_offset OT>
struct array_inner_types<variable_size_binary_array_impl<T, CR, OT>> : array_inner_types_base
Expand Down Expand Up @@ -145,6 +204,10 @@ namespace sparrow
class variable_size_binary_array_impl final
: public mutable_array_bitmap_base<variable_size_binary_array_impl<T, CR, OT>>
{
private:

static_assert(sizeof(std::ranges::range_value_t<T>) == sizeof(std::uint8_t),
"Only sequences of types with the same size as uint8_t are supported");
public:

using self_type = variable_size_binary_array_impl<T, CR, OT>;
Expand Down Expand Up @@ -424,13 +487,26 @@ namespace sparrow
const auto shift_val_abs = static_cast<size_type>(std::abs(shift_byte_count));
const auto new_data_buffer_size = shift_byte_count < 0 ? data_buffer.size() - shift_val_abs
: data_buffer.size() + shift_val_abs;
data_buffer.resize(new_data_buffer_size);
// Move elements to make space for the new value
std::move_backward(
data_buffer.begin() + offset_end,
data_buffer.end() - shift_byte_count,
data_buffer.end()
);

if (shift_byte_count > 0)
{
data_buffer.resize(new_data_buffer_size);
// Move elements to make space for the new value
std::move_backward(
data_buffer.begin() + offset_end,
data_buffer.end() - shift_byte_count,
data_buffer.end()
);
}
else
{
std::move(
data_buffer.begin() + offset_end,
data_buffer.end(),
data_buffer.begin() + offset_end + shift_byte_count
);
data_buffer.resize(new_data_buffer_size);
}
// Adjust offsets for subsequent elements
std::for_each(
offset(index + 1),
Expand All @@ -441,8 +517,9 @@ namespace sparrow
}
);
}
auto tmp = std::views::transform(rhs, [](const auto& val) { return static_cast<std::uint8_t>(val); });
// Copy the new value into the buffer
std::copy(std::ranges::begin(rhs), std::ranges::end(rhs), data_buffer.begin() + offset_beg);
std::copy(std::ranges::begin(tmp), std::ranges::end(tmp), data_buffer.begin() + offset_beg);
}

template <std::ranges::sized_range T, class CR, layout_offset OT>
Expand Down
4 changes: 3 additions & 1 deletion include/sparrow/types/data_traits.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include "sparrow/types/data_type.hpp"
#include "sparrow/utils/nullable.hpp"
#include "sparrow/utils/vector_view.hpp"

namespace sparrow
{
Expand Down Expand Up @@ -57,8 +58,9 @@ namespace sparrow
template <>
struct arrow_traits<std::vector<byte_t>>
{
static constexpr data_type type_id = data_type::STRING;
static constexpr data_type type_id = data_type::BINARY;
using value_type = std::vector<byte_t>;
using const_reference = vector_view<const byte_t>;
};

template <>
Expand Down
35 changes: 30 additions & 5 deletions include/sparrow/types/data_type.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,10 @@ namespace sparrow
DOUBLE = 12,
// UTF8 variable-length string
STRING = 13,
LARGE_STRING = 14,
// Variable-length bytes (no guarantee of UTF8-ness)
BINARY = 14,
BINARY = 15,
LARGE_BINARY = 16,
// Number of nanoseconds since the UNIX epoch with an optional timezone.
// See: https://arrow.apache.org/docs/python/timestamps.html#timestamps
TIMESTAMP = 18,
Expand Down Expand Up @@ -242,11 +244,13 @@ namespace sparrow
case 'g':
return data_type::DOUBLE;
case 'u':
case 'U': // large string
return data_type::STRING;
case 'z': // binary
case 'Z': // large binary
case 'U':
return data_type::LARGE_STRING;
case 'z':
return data_type::BINARY;
case 'Z':
return data_type::LARGE_BINARY;
default:
return data_type::NA;
}
Expand Down Expand Up @@ -435,8 +439,12 @@ namespace sparrow
return "g";
case data_type::STRING:
return "u";
case data_type::LARGE_STRING:
return "U";
case data_type::BINARY:
return "z";
case data_type::LARGE_BINARY:
return "Z";
case data_type::TIMESTAMP:
return "tDm";
case data_type::LIST:
Expand Down Expand Up @@ -511,7 +519,7 @@ namespace sparrow
float32_t,
float64_t,
std::string,
// std::vector<byte_t>,
std::vector<byte_t>,
sparrow::timestamp,
// TODO: add missing fundamental types here
list_value,
Expand Down Expand Up @@ -740,8 +748,12 @@ namespace std
return "double";
case STRING:
return "String";
case LARGE_STRING:
return "Large string";
case BINARY:
return "Binary";
case LARGE_BINARY:
return "Large binary";
case TIMESTAMP:
return "Timestamp";
case LIST:
Expand Down Expand Up @@ -800,6 +812,19 @@ namespace std
}
};

/**
 * Formatter for std::byte: the byte is converted to int and written with the
 * default "{}" integer formatting. No format specification is interpreted.
 *
 * NOTE(review): std::byte is a standard library type, not a program-defined one;
 * confirm that specializing std::formatter for it is acceptable for the
 * toolchains this project targets.
 */
template <>
struct formatter<std::byte>
{
    /// Consumes nothing: hands back the start of the format specification as is.
    constexpr auto parse(std::format_parse_context& parse_ctx)
    {
        return parse_ctx.begin();  // Simple implementation
    }

    /// Writes the integer value of `value` to the context's output.
    auto format(const std::byte& value, std::format_context& format_ctx) const
    {
        const int as_integer = std::to_integer<int>(value);
        return std::format_to(format_ctx.out(), "{}", as_integer);
    }
};
}

#endif
98 changes: 98 additions & 0 deletions include/sparrow/utils/vector_view.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
// Man Group Operations Limited
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <algorithm>
#include <compare>
#include <ranges>
#include <span>
#include <type_traits>
#include <vector>

#if defined(__cpp_lib_format)
# include <format>
#endif

namespace sparrow
{
/**
 * vector_view is a non-owning view over a contiguous sequence of T whose first
 * element sits at position zero. It plays the role string_view plays for
 * characters, but for an arbitrary element type: think of it as a span meant
 * for read-only use that additionally offers comparison operators.
 *
 * It derives publicly from std::span<T> and inherits all of its constructors.
 */
template <class T>
class vector_view : public std::span<T>
{
public:

    using base_type = std::span<T>;
    using value_type = typename base_type::value_type;

    // Reuse every std::span<T> constructor unchanged.
    using base_type::base_type;

    /**
     * Copies the viewed elements into a freshly constructed std::vector.
     * The conversion is explicit, so the copy never happens silently.
     *
     * NOTE(review): declared noexcept although building the vector allocates;
     * an allocation failure would therefore terminate - confirm this is intended.
     */
    explicit operator std::vector<value_type>() const noexcept
    {
        std::vector<value_type> copied(base_type::begin(), base_type::end());
        return copied;
    }
};

/**
 * Equality of two views: true when both have the same number of elements and
 * every pair of corresponding elements compares equal.
 */
template <class T>
constexpr bool operator==(const vector_view<T>& lhs, const vector_view<T>& rhs)
{
    return std::equal(lhs.begin(), lhs.end(), rhs.begin(), rhs.end());
}

/**
 * Equality between a view and a std::vector of the decayed element type: true
 * when both have the same number of elements and every pair of corresponding
 * elements compares equal.
 */
template <class T>
constexpr bool operator==(const vector_view<T>& lhs, const std::vector<std::decay_t<T>>& rhs)
{
    return std::equal(lhs.begin(), lhs.end(), rhs.begin(), rhs.end());
}

template <class T>
constexpr std::compare_three_way_result<T>
operator<=>(const vector_view<T>& lhs, const vector_view<T>& rhs)
{
return std::lexicographical_compare_three_way(lhs.begin(), lhs.end(), rhs.begin(), rhs.end());
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm if you can use the std here, it's means that we don't need the sparrow::lexicographical_compare_three_way implementation anymore (because we use newer version of Xcode I guess).
We should remove this code in another PR.

}

template <class T>
constexpr std::compare_three_way_result<T>
operator<=>(const vector_view<T>& lhs, const std::vector<std::decay_t<T>>& rhs)
{
return std::lexicographical_compare_three_way(lhs.begin(), lhs.end(), rhs.begin(), rhs.end());
}
}

#if defined(__cpp_lib_format)
/**
 * Formatter for sparrow::vector_view<T>.
 *
 * The elements are written with their default "{}" formatting, separated by
 * ", " and enclosed in angle brackets, e.g. "<1, 2, 3>". An empty view is
 * written as "<>". No format specification is interpreted.
 */
template <class T>
struct std::formatter<sparrow::vector_view<T>>
{
    /// Consumes nothing: hands back the start of the format specification as is.
    constexpr auto parse(std::format_parse_context& ctx)
    {
        return ctx.begin();  // Simple implementation
    }

    /// Writes `vec` to the context's output and returns the advanced iterator.
    auto format(const sparrow::vector_view<T>& vec, std::format_context& ctx) const
    {
        // Thread one output iterator through all writes instead of re-reading
        // ctx.out(), which is never advanced inside this function.
        auto out = std::format_to(ctx.out(), "<");
        bool is_first = true;
        for (const auto& element : vec)
        {
            if (!is_first)
            {
                out = std::format_to(out, ", ");
            }
            out = std::format_to(out, "{}", element);
            is_first = false;
        }
        // No element is accessed when the view is empty (calling back() there
        // would be undefined behaviour).
        return std::format_to(out, ">");
    }
};
#endif
6 changes: 6 additions & 0 deletions src/array_factory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,12 @@ namespace sparrow
return detail::make_wrapper_ptr<struct_array>(std::move(proxy));
case data_type::STRING:
return detail::make_wrapper_ptr<string_array>(std::move(proxy));
case data_type::LARGE_STRING:
return detail::make_wrapper_ptr<big_string_array>(std::move(proxy));
case data_type::BINARY:
return detail::make_wrapper_ptr<binary_array>(std::move(proxy));
case data_type::LARGE_BINARY:
return detail::make_wrapper_ptr<big_binary_array>(std::move(proxy));
case data_type::RUN_ENCODED:
return detail::make_wrapper_ptr<run_end_encoded_array>(std::move(proxy));
case data_type::DENSE_UNION:
Expand Down
Loading
Loading