Commit: Merge branch 'apache:main' into apacheGH-39552

janiodev authored Jan 11, 2024
2 parents 59ba0b0 + 1a622ec, commit 9ef27de

Showing 42 changed files with 704 additions and 189 deletions.

c_glib/README.md (21 additions, 1 deletion)

@@ -142,6 +142,17 @@ $ meson compile -C c_glib.build
 $ sudo meson install -C c_glib.build
 ```
 
+> [!WARNING]
+>
+> When building Arrow GLib, it typically uses the Arrow C++ installed via Homebrew. However, this can lead to build failures
+> if there are mismatches between the changes in Arrow's GLib and C++ libraries. To resolve this, you may need to
+> reference the Arrow C++ library built locally. In such cases, use the `--cmake-prefix-path` option with the `meson setup`
+> command to explicitly specify the library path.
+>
+> ```console
+> $ meson setup c_glib.build c_glib --cmake-prefix-path=${arrow_cpp_install_prefix} -Dgtk_doc=true
+> ```
+
 Others:
 ```console
@@ -231,9 +242,18 @@ Now, you can run unit tests by the followings:
 
 ```console
 $ cd c_glib.build
-$ bundle exec ../c_glib/test/run-test.sh
+$ BUNDLE_GEMFILE=../c_glib/Gemfile bundle exec ../c_glib/test/run-test.sh
 ```
 
+
+> [!NOTE]
+>
+> If debugging is necessary, you can proceed using the `DEBUGGER` option as follows:
+>
+> ```console
+> $ DEBUGGER=lldb BUNDLE_GEMFILE=../c_glib/Gemfile bundle exec ../c_glib/test/run-test.sh
+> ```
+
 ## Common build problems
 
 ### build failed - /usr/bin/ld: cannot find -larrow

c_glib/test/run-test.sh (19 additions, 4 deletions)

@@ -34,14 +34,14 @@ for module in "${modules[@]}"; do
   module_build_dir="${build_dir}/${module}"
   if [ -d "${module_build_dir}" ]; then
     LD_LIBRARY_PATH="${module_build_dir}:${LD_LIBRARY_PATH}"
+    DYLD_LIBRARY_PATH="${module_build_dir}:${DYLD_LIBRARY_PATH}"
   fi
 done
 export LD_LIBRARY_PATH
+export DYLD_LIBRARY_PATH
 
 if [ "${BUILD}" != "no" ]; then
-  if [ -f "Makefile" ]; then
-    make -j8 > /dev/null || exit $?
-  elif [ -f "build.ninja" ]; then
+  if [ -f "build.ninja" ]; then
     ninja || exit $?
   fi
 fi
@@ -59,4 +59,19 @@ for module in "${modules[@]}"; do
 done
 export GI_TYPELIB_PATH
 
-${GDB} ruby ${test_dir}/run-test.rb "$@"
+if type rbenv > /dev/null 2>&1; then
+  RUBY="$(rbenv which ruby)"
+else
+  RUBY=ruby
+fi
+DEBUGGER_ARGS=()
+case "${DEBUGGER}" in
+  "gdb")
+    DEBUGGER_ARGS+=(--args)
+    ;;
+  "lldb")
+    DEBUGGER_ARGS+=(--one-line "env DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}")
+    DEBUGGER_ARGS+=(--)
+    ;;
+esac
+${DEBUGGER} "${DEBUGGER_ARGS[@]}" "${RUBY}" ${test_dir}/run-test.rb "$@"

cpp/cmake_modules/ThirdpartyToolchain.cmake (2 additions, 12 deletions)

@@ -1005,14 +1005,8 @@ if("${MAKE}" STREQUAL "")
   endif()
 endif()
 
-# Using make -j in sub-make is fragile
-# see discussion https://github.com/apache/arrow/pull/2779
-if(${CMAKE_GENERATOR} MATCHES "Makefiles")
-  set(MAKE_BUILD_ARGS "")
-else()
-  # limit the maximum number of jobs for ninja
-  set(MAKE_BUILD_ARGS "-j${NPROC}")
-endif()
+# Args for external projects using make.
+set(MAKE_BUILD_ARGS "-j${NPROC}")
 
 include(FetchContent)
 set(FC_DECLARE_COMMON_OPTIONS)
@@ -2042,10 +2036,6 @@ macro(build_jemalloc)
   endif()
 
   set(JEMALLOC_BUILD_COMMAND ${MAKE} ${MAKE_BUILD_ARGS})
-  # Paralleism for Make fails with CMake > 3.28 see #39517
-  if(${CMAKE_GENERATOR} MATCHES "Makefiles")
-    list(APPEND JEMALLOC_BUILD_COMMAND "-j1")
-  endif()
 
   if(CMAKE_OSX_SYSROOT)
     list(APPEND JEMALLOC_BUILD_COMMAND "SDKROOT=${CMAKE_OSX_SYSROOT}")

cpp/src/arrow/acero/hash_aggregate_test.cc (36 additions, 0 deletions)

@@ -1694,6 +1694,42 @@ TEST_P(GroupBy, SumMeanProductScalar) {
   }
 }
 
+TEST_P(GroupBy, MeanOverflow) {
+  BatchesWithSchema input;
+  // would overflow if intermediate sum is integer
+  input.batches = {
+      ExecBatchFromJSON({int64(), int64()}, {ArgShape::SCALAR, ArgShape::ARRAY},
+
+                        "[[9223372036854775805, 1], [9223372036854775805, 1], "
+                        "[9223372036854775805, 2], [9223372036854775805, 3]]"),
+      ExecBatchFromJSON({int64(), int64()}, {ArgShape::SCALAR, ArgShape::ARRAY},
+                        "[[null, 1], [null, 1], [null, 2], [null, 3]]"),
+      ExecBatchFromJSON({int64(), int64()},
+                        "[[9223372036854775805, 1], [9223372036854775805, 2], "
+                        "[9223372036854775805, 3]]"),
+  };
+  input.schema = schema({field("argument", int64()), field("key", int64())});
+  for (bool use_threads : {true, false}) {
+    SCOPED_TRACE(use_threads ? "parallel/merged" : "serial");
+    ASSERT_OK_AND_ASSIGN(Datum actual,
+                         RunGroupBy(input, {"key"},
+                                    {
+                                        {"hash_mean", nullptr, "argument", "hash_mean"},
+                                    },
+                                    use_threads));
+    Datum expected = ArrayFromJSON(struct_({
+                                       field("key", int64()),
+                                       field("hash_mean", float64()),
+                                   }),
+                                   R"([
+    [1, 9223372036854775805],
+    [2, 9223372036854775805],
+    [3, 9223372036854775805]
+  ])");
+    AssertDatumsApproxEqual(expected, actual, /*verbose=*/true);
+  }
+}
+
 TEST_P(GroupBy, VarianceAndStddev) {
   auto batch = RecordBatchFromJSON(
       schema({field("argument", int32()), field("key", int64())}), R"([
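
For context on what the test above exercises: four values near INT64_MAX cannot be summed in a 64-bit integer accumulator without wrapping, while a double accumulator keeps the running sum at the right magnitude. A standalone sketch of that difference (not part of this commit; plain C++ for illustration):

```cpp
#include <cstdint>
#include <iostream>

int main() {
  const int64_t v = 9223372036854775805LL;  // INT64_MAX - 2, as in the test

  // Integer accumulation wraps: 4 * (2^63 - 3) mod 2^64 == 2^64 - 12,
  // which reads back as -12 when reinterpreted as a signed 64-bit value.
  uint64_t wrapped = 0;
  for (int i = 0; i < 4; ++i) wrapped += static_cast<uint64_t>(v);
  std::cout << "integer sum: " << static_cast<int64_t>(wrapped) << "\n";  // -12

  // Double accumulation trades a few low-order bits for range, so the
  // mean comes out approximately right -- hence AssertDatumsApproxEqual.
  double sum = 0.0;
  for (int i = 0; i < 4; ++i) sum += static_cast<double>(v);
  std::cout << "double mean: " << sum / 4 << "\n";  // ~9.22337e+18
  return 0;
}
```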

cpp/src/arrow/array/array_binary.h (1 addition, 1 deletion)

@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-// Array accessor classes for Binary, LargeBinart, String, LargeString,
+// Array accessor classes for Binary, LargeBinary, String, LargeString,
 // FixedSizeBinary
 
 #pragma once

cpp/src/arrow/array/array_dict.h (2 additions, 2 deletions)

@@ -133,7 +133,7 @@ class ARROW_EXPORT DictionaryUnifier {
   static Result<std::unique_ptr<DictionaryUnifier>> Make(
       std::shared_ptr<DataType> value_type, MemoryPool* pool = default_memory_pool());
 
-  /// \brief Unify dictionaries accross array chunks
+  /// \brief Unify dictionaries across array chunks
   ///
   /// The dictionaries in the array chunks will be unified, their indices
   /// accordingly transposed.
@@ -144,7 +144,7 @@ class ARROW_EXPORT DictionaryUnifier {
       const std::shared_ptr<ChunkedArray>& array,
       MemoryPool* pool = default_memory_pool());
 
-  /// \brief Unify dictionaries accross the chunks of each table column
+  /// \brief Unify dictionaries across the chunks of each table column
   ///
   /// The dictionaries in each table column will be unified, their indices
   /// accordingly transposed.

cpp/src/arrow/array/array_dict_test.cc (1 addition, 1 deletion)

@@ -1130,7 +1130,7 @@ TEST(TestDictionary, Validate) {
   ASSERT_RAISES(Invalid, arr->ValidateFull());
 
 #if !defined(__APPLE__) && !defined(ARROW_VALGRIND)
-  // GH-35712: ASSERT_DEATH would make testing slow on MacOS.
+  // GH-35712: ASSERT_DEATH would make testing slow on macOS.
   ASSERT_DEATH(
       {
         std::shared_ptr<Array> null_dict_arr =

cpp/src/arrow/array/array_list_test.cc (1 addition, 1 deletion)

@@ -300,7 +300,7 @@ class TestListArray : public ::testing::Test {
     ASSERT_OK(result->ValidateFull());
     AssertArraysEqual(*result, *expected);
 
-    // Offets without nulls, will replace null with empty list
+    // Offsets without nulls, will replace null with empty list
     ASSERT_OK_AND_ASSIGN(result, FromArrays(*offsets_wo_nulls, *sizes_wo_nulls, *values));
     ASSERT_OK(result->ValidateFull());
     AssertArraysEqual(*result, *std::dynamic_pointer_cast<ArrayType>(

cpp/src/arrow/array/array_nested.cc (2 additions, 2 deletions)

@@ -391,7 +391,7 @@ Result<std::shared_ptr<ArrayData>> ListViewFromListImpl(
   const auto* offsets = list_data->template GetValues<offset_type>(1, 0);
   auto* sizes = sizes_buffer->mutable_data_as<offset_type>();
   // Zero the initial padding area to avoid leaking any data when buffers are
-  // sent over IPC or throught the C Data interface.
+  // sent over IPC or through the C Data interface.
   memset(sizes, 0, list_data->offset * sizeof(offset_type));
   for (int64_t i = list_data->offset; i < buffer_length; i++) {
     sizes[i] = offsets[i + 1] - offsets[i];
@@ -776,7 +776,7 @@ Result<std::shared_ptr<Array>> MapArray::FromArraysInternal(
   }
 
   if (keys->null_count() != 0) {
-    return Status::Invalid("Map can not contain NULL valued keys");
+    return Status::Invalid("Map cannot contain NULL valued keys");
   }
 
   if (keys->length() != items->length()) {

cpp/src/arrow/array/array_run_end.h (3 additions, 3 deletions)

@@ -59,7 +59,7 @@ class ARROW_EXPORT RunEndEncodedArray : public Array {
   ///
   /// The length and offset parameters refer to the dimensions of the logical
   /// array which is the array we would get after expanding all the runs into
-  /// repeated values. As such, length can be much greater than the lenght of
+  /// repeated values. As such, length can be much greater than the length of
   /// the child run_ends and values arrays.
   RunEndEncodedArray(const std::shared_ptr<DataType>& type, int64_t length,
                      const std::shared_ptr<Array>& run_ends,
@@ -69,7 +69,7 @@ class ARROW_EXPORT RunEndEncodedArray : public Array {
   ///
   /// The length and offset parameters refer to the dimensions of the logical
   /// array which is the array we would get after expanding all the runs into
-  /// repeated values. As such, length can be much greater than the lenght of
+  /// repeated values. As such, length can be much greater than the length of
   /// the child run_ends and values arrays.
   static Result<std::shared_ptr<RunEndEncodedArray>> Make(
       const std::shared_ptr<DataType>& type, int64_t logical_length,
@@ -122,7 +122,7 @@ class ARROW_EXPORT RunEndEncodedArray : public Array {
   /// run-ends) necessary to represent the logical range of values from offset
   /// to length.
   ///
-  /// Avoid calling this function if the physical length can be estabilished in
+  /// Avoid calling this function if the physical length can be established in
   /// some other way (e.g. when iterating over the runs sequentially until the
   /// end). This function uses binary-search, so it has a O(log N) cost.
   int64_t FindPhysicalLength() const;

cpp/src/arrow/array/array_struct_test.cc (4 additions, 4 deletions)

@@ -541,7 +541,7 @@ TEST_F(TestStructBuilder, TestEquality) {
   ASSERT_OK(char_vb->Reserve(list_values.size()));
   ASSERT_OK(int_vb->Reserve(int_values.size()));
 
-  // setup two equal arrays, one of which takes an unequal bitmap
+  // set up two equal arrays, one of which takes an unequal bitmap
   ASSERT_OK(builder_->AppendValues(struct_is_valid.size(), struct_is_valid.data()));
   ASSERT_OK(list_vb->AppendValues(list_offsets.data(), list_offsets.size(),
                                   list_is_valid.data()));
@@ -574,7 +574,7 @@ TEST_F(TestStructBuilder, TestEquality) {
   ASSERT_OK(char_vb->Resize(list_values.size()));
   ASSERT_OK(int_vb->Resize(int_values.size()));
 
-  // setup an unequal one with the unequal bitmap
+  // set up an unequal one with the unequal bitmap
   ASSERT_OK(builder_->AppendValues(unequal_struct_is_valid.size(),
                                    unequal_struct_is_valid.data()));
   ASSERT_OK(list_vb->AppendValues(list_offsets.data(), list_offsets.size(),
@@ -592,7 +592,7 @@ TEST_F(TestStructBuilder, TestEquality) {
   ASSERT_OK(char_vb->Resize(list_values.size()));
   ASSERT_OK(int_vb->Resize(int_values.size()));
 
-  // setup an unequal one with unequal offsets
+  // set up an unequal one with unequal offsets
   ASSERT_OK(builder_->AppendValues(struct_is_valid.size(), struct_is_valid.data()));
   ASSERT_OK(list_vb->AppendValues(unequal_list_offsets.data(),
                                   unequal_list_offsets.size(),
@@ -610,7 +610,7 @@ TEST_F(TestStructBuilder, TestEquality) {
   ASSERT_OK(char_vb->Resize(list_values.size()));
   ASSERT_OK(int_vb->Resize(int_values.size()));
 
-  // setup anunequal one with unequal values
+  // set up an unequal one with unequal values
   ASSERT_OK(builder_->AppendValues(struct_is_valid.size(), struct_is_valid.data()));
   ASSERT_OK(list_vb->AppendValues(list_offsets.data(), list_offsets.size(),
                                   list_is_valid.data()));

cpp/src/arrow/array/builder_base.h (1 addition, 1 deletion)

@@ -349,7 +349,7 @@ inline Result<std::unique_ptr<ArrayBuilder>> MakeBuilderExactIndex(
 }
 
 /// \brief Construct an empty DictionaryBuilder initialized optionally
-/// with a pre-existing dictionary
+/// with a preexisting dictionary
 /// \param[in] pool the MemoryPool to use for allocations
 /// \param[in] type the dictionary type to create the builder for
 /// \param[in] dictionary the initial dictionary, if any. May be nullptr

cpp/src/arrow/array/builder_nested.h (1 addition, 1 deletion)

@@ -677,7 +677,7 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder {
 
   /// \brief Vector append
   ///
-  /// If passed, valid_bytes wil be read and any zero byte
+  /// If passed, valid_bytes will be read and any zero byte
   /// will cause the corresponding slot to be null
   ///
   /// This function affects only the validity bitmap; the child values must be appended

cpp/src/arrow/array/data.cc (1 addition, 1 deletion)

@@ -221,7 +221,7 @@ void ArraySpan::SetMembers(const ArrayData& data) {
     this->null_count = 0;
   }
 
-  // Makes sure any other buffers are seen as null / non-existent
+  // Makes sure any other buffers are seen as null / nonexistent
   for (int i = static_cast<int>(data.buffers.size()); i < 3; ++i) {
     this->buffers[i] = {};
   }

cpp/src/arrow/compute/kernels/hash_aggregate.cc (18 additions, 6 deletions)

@@ -38,6 +38,7 @@
 #include "arrow/compute/row/grouper.h"
 #include "arrow/record_batch.h"
 #include "arrow/stl_allocator.h"
+#include "arrow/type_traits.h"
 #include "arrow/util/bit_run_reader.h"
 #include "arrow/util/bitmap_ops.h"
 #include "arrow/util/bitmap_writer.h"
@@ -441,9 +442,10 @@ struct GroupedCountImpl : public GroupedAggregator {
 // ----------------------------------------------------------------------
 // Sum/Mean/Product implementation
 
-template <typename Type, typename Impl>
+template <typename Type, typename Impl,
+          typename AccumulateType = typename FindAccumulatorType<Type>::Type>
 struct GroupedReducingAggregator : public GroupedAggregator {
-  using AccType = typename FindAccumulatorType<Type>::Type;
+  using AccType = AccumulateType;
   using CType = typename TypeTraits<AccType>::CType;
   using InputCType = typename TypeTraits<Type>::CType;
 
@@ -483,7 +485,8 @@ struct GroupedReducingAggregator : public GroupedAggregator {
 
   Status Merge(GroupedAggregator&& raw_other,
                const ArrayData& group_id_mapping) override {
-    auto other = checked_cast<GroupedReducingAggregator<Type, Impl>*>(&raw_other);
+    auto other =
+        checked_cast<GroupedReducingAggregator<Type, Impl, AccType>*>(&raw_other);
 
     CType* reduced = reduced_.mutable_data();
     int64_t* counts = counts_.mutable_data();
@@ -733,9 +736,18 @@ using GroupedProductFactory =
 // ----------------------------------------------------------------------
 // Mean implementation
 
+template <typename T>
+struct GroupedMeanAccType {
+  using Type = typename std::conditional<is_number_type<T>::value, DoubleType,
+                                         typename FindAccumulatorType<T>::Type>::type;
+};
+
 template <typename Type>
-struct GroupedMeanImpl : public GroupedReducingAggregator<Type, GroupedMeanImpl<Type>> {
-  using Base = GroupedReducingAggregator<Type, GroupedMeanImpl<Type>>;
+struct GroupedMeanImpl
+    : public GroupedReducingAggregator<Type, GroupedMeanImpl<Type>,
+                                       typename GroupedMeanAccType<Type>::Type> {
+  using Base = GroupedReducingAggregator<Type, GroupedMeanImpl<Type>,
+                                         typename GroupedMeanAccType<Type>::Type>;
   using CType = typename Base::CType;
   using InputCType = typename Base::InputCType;
   using MeanType =
@@ -746,7 +758,7 @@ struct GroupedMeanImpl : public GroupedReducingAggregator<Type, GroupedMeanImpl<
   template <typename T = Type>
   static enable_if_number<T, CType> Reduce(const DataType&, const CType u,
                                            const InputCType v) {
-    return static_cast<CType>(to_unsigned(u) + to_unsigned(static_cast<CType>(v)));
+    return static_cast<CType>(u) + static_cast<CType>(v);
  }
 
  static CType Reduce(const DataType&, const CType u, const CType v) {
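
The mechanism behind the fix above: GroupedReducingAggregator gains an AccumulateType template parameter, and GroupedMeanAccType selects DoubleType for numeric inputs (falling back to FindAccumulatorType otherwise), so hash_mean sums in floating point and the old unsigned-wraparound arithmetic in Reduce can be dropped. A minimal sketch of the same selection pattern, using simplified stand-in traits rather than Arrow's real ones:

```cpp
#include <cstdint>
#include <iostream>
#include <type_traits>

// Stand-ins for Arrow's type traits, for illustration only.
template <typename T>
struct FindAccumulatorType { using Type = T; };  // default: accumulate as the input type

template <typename T>
constexpr bool is_number_type_v = std::is_arithmetic<T>::value;

// Mirrors the shape of GroupedMeanAccType: numeric inputs accumulate as
// double, everything else keeps its usual accumulator type.
template <typename T>
struct MeanAccType {
  using Type = typename std::conditional<is_number_type_v<T>, double,
                                         typename FindAccumulatorType<T>::Type>::type;
};

// Mirrors the new Reduce: plain addition in the accumulator type; no
// wraparound tricks needed once the accumulator is wide enough.
template <typename Acc, typename Input>
Acc Reduce(Acc u, Input v) {
  return static_cast<Acc>(u) + static_cast<Acc>(v);
}

int main() {
  using Acc = MeanAccType<int64_t>::Type;  // resolves to double
  static_assert(std::is_same<Acc, double>::value, "numeric input -> double accumulator");

  const int64_t big = 9223372036854775805LL;
  Acc sum = 0;
  for (int i = 0; i < 4; ++i) sum = Reduce(sum, big);
  std::cout << "mean: " << sum / 4 << "\n";  // ~9.22337e+18, no wraparound
  return 0;
}
```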