Commit: Merge branch 'apache:main' into apacheGH-39552

janiodev authored Jan 11, 2024
2 parents 59ba0b0 + 1a622ec, commit 9ef27de

Showing 42 changed files with 704 additions and 189 deletions.

c_glib/README.md (21 additions, 1 deletion)

@@ -142,6 +142,17 @@ $ meson compile -C c_glib.build
 $ sudo meson install -C c_glib.build
 ```
 
+> [!WARNING]
+>
+> When building Arrow GLib, it typically uses the Arrow C++ installed via Homebrew. However, this can lead to build failures
+> if there are mismatches between the changes in Arrow's GLib and C++ libraries. To resolve this, you may need to
+> reference the Arrow C++ library built locally. In such cases, use the `--cmake-prefix-path` option with the `meson setup`
+> command to explicitly specify the library path.
+>
+> ```console
+> $ meson setup c_glib.build c_glib --cmake-prefix-path=${arrow_cpp_install_prefix} -Dgtk_doc=true
+> ```
+
 Others:
 ```console
@@ -231,9 +242,18 @@ Now, you can run unit tests by the followings:
 
 ```console
 $ cd c_glib.build
-$ bundle exec ../c_glib/test/run-test.sh
+$ BUNDLE_GEMFILE=../c_glib/Gemfile bundle exec ../c_glib/test/run-test.sh
 ```
 
+
+> [!NOTE]
+>
+> If debugging is necessary, you can proceed using the `DEBUGGER` option as follows:
+>
+> ```console
+> $ DEBUGGER=lldb BUNDLE_GEMFILE=../c_glib/Gemfile bundle exec ../c_glib/test/run-test.sh
+> ```
+
 ## Common build problems
 
 ### build failed - /usr/bin/ld: cannot find -larrow

c_glib/test/run-test.sh (19 additions, 4 deletions)

@@ -34,14 +34,14 @@ for module in "${modules[@]}"; do
   module_build_dir="${build_dir}/${module}"
   if [ -d "${module_build_dir}" ]; then
     LD_LIBRARY_PATH="${module_build_dir}:${LD_LIBRARY_PATH}"
+    DYLD_LIBRARY_PATH="${module_build_dir}:${DYLD_LIBRARY_PATH}"
   fi
 done
 export LD_LIBRARY_PATH
+export DYLD_LIBRARY_PATH
 
 if [ "${BUILD}" != "no" ]; then
-  if [ -f "Makefile" ]; then
-    make -j8 > /dev/null || exit $?
-  elif [ -f "build.ninja" ]; then
+  if [ -f "build.ninja" ]; then
     ninja || exit $?
   fi
 fi
@@ -59,4 +59,19 @@ for module in "${modules[@]}"; do
 done
 export GI_TYPELIB_PATH
 
-${GDB} ruby ${test_dir}/run-test.rb "$@"
+if type rbenv > /dev/null 2>&1; then
+  RUBY="$(rbenv which ruby)"
+else
+  RUBY=ruby
+fi
+DEBUGGER_ARGS=()
+case "${DEBUGGER}" in
+  "gdb")
+    DEBUGGER_ARGS+=(--args)
+    ;;
+  "lldb")
+    DEBUGGER_ARGS+=(--one-line "env DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}")
+    DEBUGGER_ARGS+=(--)
+    ;;
+esac
+${DEBUGGER} "${DEBUGGER_ARGS[@]}" "${RUBY}" ${test_dir}/run-test.rb "$@"

cpp/cmake_modules/ThirdpartyToolchain.cmake (2 additions, 12 deletions)

@@ -1005,14 +1005,8 @@ if("${MAKE}" STREQUAL "")
   endif()
 endif()
 
-# Using make -j in sub-make is fragile
-# see discussion https://github.com/apache/arrow/pull/2779
-if(${CMAKE_GENERATOR} MATCHES "Makefiles")
-  set(MAKE_BUILD_ARGS "")
-else()
-  # limit the maximum number of jobs for ninja
-  set(MAKE_BUILD_ARGS "-j${NPROC}")
-endif()
+# Args for external projects using make.
+set(MAKE_BUILD_ARGS "-j${NPROC}")
 
 include(FetchContent)
 set(FC_DECLARE_COMMON_OPTIONS)
@@ -2042,10 +2036,6 @@ macro(build_jemalloc)
   endif()
 
   set(JEMALLOC_BUILD_COMMAND ${MAKE} ${MAKE_BUILD_ARGS})
-  # Paralleism for Make fails with CMake > 3.28 see #39517
-  if(${CMAKE_GENERATOR} MATCHES "Makefiles")
-    list(APPEND JEMALLOC_BUILD_COMMAND "-j1")
-  endif()
 
   if(CMAKE_OSX_SYSROOT)
     list(APPEND JEMALLOC_BUILD_COMMAND "SDKROOT=${CMAKE_OSX_SYSROOT}")

cpp/src/arrow/acero/hash_aggregate_test.cc (36 additions, 0 deletions)

@@ -1694,6 +1694,42 @@ TEST_P(GroupBy, SumMeanProductScalar) {
   }
 }
 
+TEST_P(GroupBy, MeanOverflow) {
+  BatchesWithSchema input;
+  // would overflow if intermediate sum is integer
+  input.batches = {
+      ExecBatchFromJSON({int64(), int64()}, {ArgShape::SCALAR, ArgShape::ARRAY},
+
+                        "[[9223372036854775805, 1], [9223372036854775805, 1], "
+                        "[9223372036854775805, 2], [9223372036854775805, 3]]"),
+      ExecBatchFromJSON({int64(), int64()}, {ArgShape::SCALAR, ArgShape::ARRAY},
+                        "[[null, 1], [null, 1], [null, 2], [null, 3]]"),
+      ExecBatchFromJSON({int64(), int64()},
+                        "[[9223372036854775805, 1], [9223372036854775805, 2], "
+                        "[9223372036854775805, 3]]"),
+  };
+  input.schema = schema({field("argument", int64()), field("key", int64())});
+  for (bool use_threads : {true, false}) {
+    SCOPED_TRACE(use_threads ? "parallel/merged" : "serial");
+    ASSERT_OK_AND_ASSIGN(Datum actual,
+                         RunGroupBy(input, {"key"},
+                                    {
+                                        {"hash_mean", nullptr, "argument", "hash_mean"},
+                                    },
+                                    use_threads));
+    Datum expected = ArrayFromJSON(struct_({
+                                       field("key", int64()),
+                                       field("hash_mean", float64()),
+                                   }),
+                                   R"([
+    [1, 9223372036854775805],
+    [2, 9223372036854775805],
+    [3, 9223372036854775805]
+  ])");
+    AssertDatumsApproxEqual(expected, actual, /*verbose=*/true);
+  }
+}
+
 TEST_P(GroupBy, VarianceAndStddev) {
   auto batch = RecordBatchFromJSON(
       schema({field("argument", int32()), field("key", int64())}), R"([
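
For context on what the test above exercises: four values near INT64_MAX cannot be summed in a 64-bit integer accumulator without wrapping, while a double accumulator keeps the running sum at the right magnitude. A standalone sketch of that difference (not part of this commit; plain C++ for illustration):

```cpp
#include <cstdint>
#include <iostream>

int main() {
  const int64_t v = 9223372036854775805LL;  // INT64_MAX - 2, as in the test

  // Integer accumulation wraps: 4 * (2^63 - 3) mod 2^64 == 2^64 - 12,
  // which reads back as -12 when reinterpreted as a signed 64-bit value.
  uint64_t wrapped = 0;
  for (int i = 0; i < 4; ++i) wrapped += static_cast<uint64_t>(v);
  std::cout << "integer sum: " << static_cast<int64_t>(wrapped) << "\n";  // -12

  // Double accumulation trades a few low-order bits for range, so the
  // mean comes out approximately right -- hence AssertDatumsApproxEqual.
  double sum = 0.0;
  for (int i = 0; i < 4; ++i) sum += static_cast<double>(v);
  std::cout << "double mean: " << sum / 4 << "\n";  // ~9.22337e+18
  return 0;
}
```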

cpp/src/arrow/array/array_binary.h (1 addition, 1 deletion)

@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-// Array accessor classes for Binary, LargeBinart, String, LargeString,
+// Array accessor classes for Binary, LargeBinary, String, LargeString,
 // FixedSizeBinary
 
 #pragma once

cpp/src/arrow/array/array_dict.h (2 additions, 2 deletions)

@@ -133,7 +133,7 @@ class ARROW_EXPORT DictionaryUnifier {
   static Result<std::unique_ptr<DictionaryUnifier>> Make(
       std::shared_ptr<DataType> value_type, MemoryPool* pool = default_memory_pool());
 
-  /// \brief Unify dictionaries accross array chunks
+  /// \brief Unify dictionaries across array chunks
   ///
   /// The dictionaries in the array chunks will be unified, their indices
   /// accordingly transposed.
@@ -144,7 +144,7 @@ class ARROW_EXPORT DictionaryUnifier {
       const std::shared_ptr<ChunkedArray>& array,
       MemoryPool* pool = default_memory_pool());
 
-  /// \brief Unify dictionaries accross the chunks of each table column
+  /// \brief Unify dictionaries across the chunks of each table column
   ///
   /// The dictionaries in each table column will be unified, their indices
   /// accordingly transposed.

cpp/src/arrow/array/array_dict_test.cc (1 addition, 1 deletion)

@@ -1130,7 +1130,7 @@ TEST(TestDictionary, Validate) {
   ASSERT_RAISES(Invalid, arr->ValidateFull());
 
 #if !defined(__APPLE__) && !defined(ARROW_VALGRIND)
-  // GH-35712: ASSERT_DEATH would make testing slow on MacOS.
+  // GH-35712: ASSERT_DEATH would make testing slow on macOS.
   ASSERT_DEATH(
       {
         std::shared_ptr<Array> null_dict_arr =

cpp/src/arrow/array/array_list_test.cc (1 addition, 1 deletion)

@@ -300,7 +300,7 @@ class TestListArray : public ::testing::Test {
     ASSERT_OK(result->ValidateFull());
     AssertArraysEqual(*result, *expected);
 
-    // Offets without nulls, will replace null with empty list
+    // Offsets without nulls, will replace null with empty list
     ASSERT_OK_AND_ASSIGN(result, FromArrays(*offsets_wo_nulls, *sizes_wo_nulls, *values));
     ASSERT_OK(result->ValidateFull());
     AssertArraysEqual(*result, *std::dynamic_pointer_cast<ArrayType>(

cpp/src/arrow/array/array_nested.cc (2 additions, 2 deletions)

@@ -391,7 +391,7 @@ Result<std::shared_ptr<ArrayData>> ListViewFromListImpl(
   const auto* offsets = list_data->template GetValues<offset_type>(1, 0);
   auto* sizes = sizes_buffer->mutable_data_as<offset_type>();
   // Zero the initial padding area to avoid leaking any data when buffers are
-  // sent over IPC or throught the C Data interface.
+  // sent over IPC or through the C Data interface.
   memset(sizes, 0, list_data->offset * sizeof(offset_type));
   for (int64_t i = list_data->offset; i < buffer_length; i++) {
     sizes[i] = offsets[i + 1] - offsets[i];
@@ -776,7 +776,7 @@ Result<std::shared_ptr<Array>> MapArray::FromArraysInternal(
   }
 
   if (keys->null_count() != 0) {
-    return Status::Invalid("Map can not contain NULL valued keys");
+    return Status::Invalid("Map cannot contain NULL valued keys");
   }
 
   if (keys->length() != items->length()) {

cpp/src/arrow/array/array_run_end.h (3 additions, 3 deletions)

@@ -59,7 +59,7 @@ class ARROW_EXPORT RunEndEncodedArray : public Array {
   ///
   /// The length and offset parameters refer to the dimensions of the logical
   /// array which is the array we would get after expanding all the runs into
-  /// repeated values. As such, length can be much greater than the lenght of
+  /// repeated values. As such, length can be much greater than the length of
   /// the child run_ends and values arrays.
   RunEndEncodedArray(const std::shared_ptr<DataType>& type, int64_t length,
                      const std::shared_ptr<Array>& run_ends,
@@ -69,7 +69,7 @@ class ARROW_EXPORT RunEndEncodedArray : public Array {
   ///
   /// The length and offset parameters refer to the dimensions of the logical
   /// array which is the array we would get after expanding all the runs into
-  /// repeated values. As such, length can be much greater than the lenght of
+  /// repeated values. As such, length can be much greater than the length of
   /// the child run_ends and values arrays.
   static Result<std::shared_ptr<RunEndEncodedArray>> Make(
       const std::shared_ptr<DataType>& type, int64_t logical_length,
@@ -122,7 +122,7 @@ class ARROW_EXPORT RunEndEncodedArray : public Array {
   /// run-ends) necessary to represent the logical range of values from offset
   /// to length.
   ///
-  /// Avoid calling this function if the physical length can be estabilished in
+  /// Avoid calling this function if the physical length can be established in
   /// some other way (e.g. when iterating over the runs sequentially until the
   /// end). This function uses binary-search, so it has a O(log N) cost.
   int64_t FindPhysicalLength() const;

cpp/src/arrow/array/array_struct_test.cc (4 additions, 4 deletions)

@@ -541,7 +541,7 @@ TEST_F(TestStructBuilder, TestEquality) {
   ASSERT_OK(char_vb->Reserve(list_values.size()));
   ASSERT_OK(int_vb->Reserve(int_values.size()));
 
-  // setup two equal arrays, one of which takes an unequal bitmap
+  // set up two equal arrays, one of which takes an unequal bitmap
   ASSERT_OK(builder_->AppendValues(struct_is_valid.size(), struct_is_valid.data()));
   ASSERT_OK(list_vb->AppendValues(list_offsets.data(), list_offsets.size(),
                                   list_is_valid.data()));
@@ -574,7 +574,7 @@ TEST_F(TestStructBuilder, TestEquality) {
   ASSERT_OK(char_vb->Resize(list_values.size()));
   ASSERT_OK(int_vb->Resize(int_values.size()));
 
-  // setup an unequal one with the unequal bitmap
+  // set up an unequal one with the unequal bitmap
   ASSERT_OK(builder_->AppendValues(unequal_struct_is_valid.size(),
                                    unequal_struct_is_valid.data()));
   ASSERT_OK(list_vb->AppendValues(list_offsets.data(), list_offsets.size(),
@@ -592,7 +592,7 @@ TEST_F(TestStructBuilder, TestEquality) {
   ASSERT_OK(char_vb->Resize(list_values.size()));
   ASSERT_OK(int_vb->Resize(int_values.size()));
 
-  // setup an unequal one with unequal offsets
+  // set up an unequal one with unequal offsets
   ASSERT_OK(builder_->AppendValues(struct_is_valid.size(), struct_is_valid.data()));
   ASSERT_OK(list_vb->AppendValues(unequal_list_offsets.data(),
                                   unequal_list_offsets.size(),
@@ -610,7 +610,7 @@ TEST_F(TestStructBuilder, TestEquality) {
   ASSERT_OK(char_vb->Resize(list_values.size()));
   ASSERT_OK(int_vb->Resize(int_values.size()));
 
-  // setup anunequal one with unequal values
+  // set up an unequal one with unequal values
   ASSERT_OK(builder_->AppendValues(struct_is_valid.size(), struct_is_valid.data()));
   ASSERT_OK(list_vb->AppendValues(list_offsets.data(), list_offsets.size(),
                                   list_is_valid.data()));

cpp/src/arrow/array/builder_base.h (1 addition, 1 deletion)

@@ -349,7 +349,7 @@ inline Result<std::unique_ptr<ArrayBuilder>> MakeBuilderExactIndex(
 }
 
 /// \brief Construct an empty DictionaryBuilder initialized optionally
-/// with a pre-existing dictionary
+/// with a preexisting dictionary
 /// \param[in] pool the MemoryPool to use for allocations
 /// \param[in] type the dictionary type to create the builder for
 /// \param[in] dictionary the initial dictionary, if any. May be nullptr

cpp/src/arrow/array/builder_nested.h (1 addition, 1 deletion)

@@ -677,7 +677,7 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder {
 
   /// \brief Vector append
   ///
-  /// If passed, valid_bytes wil be read and any zero byte
+  /// If passed, valid_bytes will be read and any zero byte
   /// will cause the corresponding slot to be null
   ///
   /// This function affects only the validity bitmap; the child values must be appended

cpp/src/arrow/array/data.cc (1 addition, 1 deletion)

@@ -221,7 +221,7 @@ void ArraySpan::SetMembers(const ArrayData& data) {
     this->null_count = 0;
   }
 
-  // Makes sure any other buffers are seen as null / non-existent
+  // Makes sure any other buffers are seen as null / nonexistent
   for (int i = static_cast<int>(data.buffers.size()); i < 3; ++i) {
     this->buffers[i] = {};
   }

cpp/src/arrow/compute/kernels/hash_aggregate.cc (18 additions, 6 deletions)

@@ -38,6 +38,7 @@
 #include "arrow/compute/row/grouper.h"
 #include "arrow/record_batch.h"
 #include "arrow/stl_allocator.h"
+#include "arrow/type_traits.h"
 #include "arrow/util/bit_run_reader.h"
 #include "arrow/util/bitmap_ops.h"
 #include "arrow/util/bitmap_writer.h"
@@ -441,9 +442,10 @@ struct GroupedCountImpl : public GroupedAggregator {
 // ----------------------------------------------------------------------
 // Sum/Mean/Product implementation
 
-template <typename Type, typename Impl>
+template <typename Type, typename Impl,
+          typename AccumulateType = typename FindAccumulatorType<Type>::Type>
 struct GroupedReducingAggregator : public GroupedAggregator {
-  using AccType = typename FindAccumulatorType<Type>::Type;
+  using AccType = AccumulateType;
   using CType = typename TypeTraits<AccType>::CType;
   using InputCType = typename TypeTraits<Type>::CType;
 
@@ -483,7 +485,8 @@ struct GroupedReducingAggregator : public GroupedAggregator {
 
   Status Merge(GroupedAggregator&& raw_other,
                const ArrayData& group_id_mapping) override {
-    auto other = checked_cast<GroupedReducingAggregator<Type, Impl>*>(&raw_other);
+    auto other =
+        checked_cast<GroupedReducingAggregator<Type, Impl, AccType>*>(&raw_other);
 
     CType* reduced = reduced_.mutable_data();
     int64_t* counts = counts_.mutable_data();
@@ -733,9 +736,18 @@ using GroupedProductFactory =
 // ----------------------------------------------------------------------
 // Mean implementation
 
+template <typename T>
+struct GroupedMeanAccType {
+  using Type = typename std::conditional<is_number_type<T>::value, DoubleType,
+                                         typename FindAccumulatorType<T>::Type>::type;
+};
+
 template <typename Type>
-struct GroupedMeanImpl : public GroupedReducingAggregator<Type, GroupedMeanImpl<Type>> {
-  using Base = GroupedReducingAggregator<Type, GroupedMeanImpl<Type>>;
+struct GroupedMeanImpl
+    : public GroupedReducingAggregator<Type, GroupedMeanImpl<Type>,
+                                       typename GroupedMeanAccType<Type>::Type> {
+  using Base = GroupedReducingAggregator<Type, GroupedMeanImpl<Type>,
+                                         typename GroupedMeanAccType<Type>::Type>;
   using CType = typename Base::CType;
   using InputCType = typename Base::InputCType;
   using MeanType =
@@ -746,7 +758,7 @@ struct GroupedMeanImpl : public GroupedReducingAggregator<Type, GroupedMeanImpl<
   template <typename T = Type>
   static enable_if_number<T, CType> Reduce(const DataType&, const CType u,
                                            const InputCType v) {
-    return static_cast<CType>(to_unsigned(u) + to_unsigned(static_cast<CType>(v)));
+    return static_cast<CType>(u) + static_cast<CType>(v);
  }
 
  static CType Reduce(const DataType&, const CType u, const CType v) {
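
The mechanism behind the fix above: GroupedReducingAggregator gains an AccumulateType template parameter, and GroupedMeanAccType selects DoubleType for numeric inputs (falling back to FindAccumulatorType otherwise), so hash_mean sums in floating point and the old unsigned-wraparound arithmetic in Reduce can be dropped. A minimal sketch of the same selection pattern, using simplified stand-in traits rather than Arrow's real ones:

```cpp
#include <cstdint>
#include <iostream>
#include <type_traits>

// Stand-ins for Arrow's type traits, for illustration only.
template <typename T>
struct FindAccumulatorType { using Type = T; };  // default: accumulate as the input type

template <typename T>
constexpr bool is_number_type_v = std::is_arithmetic<T>::value;

// Mirrors the shape of GroupedMeanAccType: numeric inputs accumulate as
// double, everything else keeps its usual accumulator type.
template <typename T>
struct MeanAccType {
  using Type = typename std::conditional<is_number_type_v<T>, double,
                                         typename FindAccumulatorType<T>::Type>::type;
};

// Mirrors the new Reduce: plain addition in the accumulator type; no
// wraparound tricks needed once the accumulator is wide enough.
template <typename Acc, typename Input>
Acc Reduce(Acc u, Input v) {
  return static_cast<Acc>(u) + static_cast<Acc>(v);
}

int main() {
  using Acc = MeanAccType<int64_t>::Type;  // resolves to double
  static_assert(std::is_same<Acc, double>::value, "numeric input -> double accumulator");

  const int64_t big = 9223372036854775805LL;
  Acc sum = 0;
  for (int i = 0; i < 4; ++i) sum = Reduce(sum, big);
  std::cout << "mean: " << sum / 4 << "\n";  // ~9.22337e+18, no wraparound
  return 0;
}
```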