
Merge branch 'branch-25.02' into host_udf_reduction
ttnghia committed Dec 28, 2024
2 parents f27c9fd + 45b40c5 commit 2deeb3b
Showing 34 changed files with 194 additions and 396 deletions.
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -71,7 +71,7 @@ dependencies:
 - ptxcompiler
 - pyarrow>=14.0.0,<19.0.0a0
 - pydata-sphinx-theme!=0.14.2
-- pynvml>=11.4.1,<12.0.0a0
+- pynvml>=12.0.0,<13.0.0a0
 - pytest-benchmark
 - pytest-cases>=3.8.2
 - pytest-cov
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -69,7 +69,7 @@ dependencies:
 - pyarrow>=14.0.0,<19.0.0a0
 - pydata-sphinx-theme!=0.14.2
 - pynvjitlink>=0.0.0a0
-- pynvml>=11.4.1,<12.0.0a0
+- pynvml>=12.0.0,<13.0.0a0
 - pytest-benchmark
 - pytest-cases>=3.8.2
 - pytest-cov
2 changes: 1 addition & 1 deletion conda/recipes/dask-cudf/meta.yaml
@@ -43,7 +43,7 @@ requirements:
   run:
     - python
     - cudf ={{ version }}
-    - pynvml >=11.4.1,<12.0.0a0
+    - pynvml >=12.0.0,<13.0.0a0
     - rapids-dask-dependency ={{ minor_version }}
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
 
2 changes: 1 addition & 1 deletion dependencies.yaml
@@ -757,7 +757,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - pynvml>=11.4.1,<12.0.0a0
+          - pynvml>=12.0.0,<13.0.0a0
           - rapids-dask-dependency==25.2.*,>=0.0.0a0
   run_custreamz:
     common:
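The pin change above (repeated across both conda environments, the dask-cudf recipe, and this file) moves pynvml from the 11.x line to 12.x, where the package became a thin repackaging of nvidia-ml-py. The NVML entry points that GPU-memory discovery in the dask stack goes through keep the same names on both sides of that boundary; below is a minimal sketch of such a query, assuming an NVIDIA driver is available — illustrative, not code from this repo.

```python
# Minimal NVML device-memory query; these call names are identical in
# pynvml 11.x and 12.x, so the bump is a packaging change for callers.
import pynvml

pynvml.nvmlInit()
try:
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # first visible GPU
    mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
    print(f"total={mem.total} used={mem.used} free={mem.free}")
finally:
    pynvml.nvmlShutdown()
```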
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/CMakeLists.txt
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources column.pyx scalar.pyx strings_udf.pyx types.pyx utils.pyx)
+set(cython_sources column.pyx scalar.pyx strings_udf.pyx types.pyx)
 set(linked_libraries cudf::cudf)
 
 rapids_cython_create_modules(
Empty file removed python/cudf/cudf/_lib/__init__.pxd
7 changes: 0 additions & 7 deletions python/cudf/cudf/_lib/__init__.py
@@ -1,9 +1,2 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
-import numpy as np
-
 from . import strings_udf
-
-MAX_COLUMN_SIZE = np.iinfo(np.int32).max
-MAX_COLUMN_SIZE_STR = "INT32_MAX"
-MAX_STRING_COLUMN_BYTES = np.iinfo(np.int32).max
-MAX_STRING_COLUMN_BYTES_STR = "INT32_MAX"
10 changes: 5 additions & 5 deletions python/cudf/cudf/_lib/column.pyx
@@ -31,12 +31,12 @@ from rmm.pylibrmm.device_buffer cimport DeviceBuffer
 
 from cudf._lib.types cimport (
     dtype_from_column_view,
-    dtype_to_data_type,
+    dtype_to_pylibcudf_type,
 )
 
 from cudf._lib.types import dtype_from_pylibcudf_column
 
 from pylibcudf cimport DataType as plc_DataType
 cimport pylibcudf.libcudf.copying as cpp_copying
 cimport pylibcudf.libcudf.types as libcudf_types
 cimport pylibcudf.libcudf.unary as libcudf_unary
@@ -361,7 +361,7 @@ cdef class Column:
         col = self
         data_dtype = col.dtype
 
-        cdef libcudf_types.data_type dtype = dtype_to_data_type(data_dtype)
+        cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype)
         cdef libcudf_types.size_type offset = self.offset
         cdef vector[mutable_column_view] children
         cdef void* data
@@ -398,7 +398,7 @@
             self._data = None
 
         return mutable_column_view(
-            dtype,
+            dtype.c_obj,
             self.size,
             data,
             mask,
@@ -424,7 +424,7 @@
         col = self
         data_dtype = col.dtype
 
-        cdef libcudf_types.data_type dtype = dtype_to_data_type(data_dtype)
+        cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype)
         cdef libcudf_types.size_type offset = self.offset
         cdef vector[column_view] children
         cdef void* data
@@ -450,7 +450,7 @@
         cdef libcudf_types.size_type c_null_count = null_count
 
         return column_view(
-            dtype,
+            dtype.c_obj,
             self.size,
             data,
             mask,
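The four hunks above all make the same move: `Column` now computes its type as a Python-visible `pylibcudf.DataType` via `dtype_to_pylibcudf_type`, and only unwraps the underlying libcudf `data_type` through `.c_obj` at the `column_view`/`mutable_column_view` boundary. A hypothetical, much-reduced sketch of what such a dtype mapping involves follows (the real cudf helper also handles decimals, lists, structs, and datetime/timedelta units; `_NUMPY_TO_TYPE_ID` and `dtype_to_plc_type` are illustrative names, not the actual implementation):

```python
# Sketch: map a primitive NumPy dtype to a pylibcudf DataType.
import numpy as np
import pylibcudf as plc

_NUMPY_TO_TYPE_ID = {
    np.dtype("int32"): plc.TypeId.INT32,
    np.dtype("int64"): plc.TypeId.INT64,
    np.dtype("float64"): plc.TypeId.FLOAT64,
    np.dtype("bool"): plc.TypeId.BOOL8,
}

def dtype_to_plc_type(dtype) -> plc.DataType:
    """Return the pylibcudf DataType wrapping the matching libcudf type id."""
    return plc.DataType(_NUMPY_TO_TYPE_ID[np.dtype(dtype)])
```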
53 changes: 25 additions & 28 deletions python/cudf/cudf/_lib/scalar.pyx
@@ -10,24 +10,22 @@ from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-import pylibcudf
+import pylibcudf as plc
 
 import cudf
-from cudf._lib.types import LIBCUDF_TO_SUPPORTED_NUMPY_TYPES
 from cudf.core.dtypes import ListDtype, StructDtype
+from cudf._lib.types import PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES
+from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id
 from cudf.core.missing import NA, NaT
 
-cimport pylibcudf.libcudf.types as libcudf_types
 # We currently need this cimport because some of the implementations here
 # access the c_obj of the scalar, and because we need to be able to call
 # pylibcudf.Scalar.from_libcudf. Both of those are temporarily acceptable until
 # DeviceScalar is phased out entirely from cuDF Cython (at which point
 # cudf.Scalar will be directly backed by pylibcudf.Scalar).
-from pylibcudf cimport Scalar as plc_Scalar
+from pylibcudf cimport Scalar as plc_Scalar, type_id as plc_TypeID
 from pylibcudf.libcudf.scalar.scalar cimport list_scalar, scalar, struct_scalar
 
-from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id
-
 
 def _replace_nested(obj, check, replacement):
     if isinstance(obj, list):
@@ -62,12 +60,12 @@ def gather_metadata(dtypes):
     """
     out = []
     for name, dtype in dtypes.items():
-        v = pylibcudf.interop.ColumnMetadata(name)
+        v = plc.interop.ColumnMetadata(name)
         if isinstance(dtype, cudf.StructDtype):
             v.children_meta = gather_metadata(dtype.fields)
         elif isinstance(dtype, cudf.ListDtype):
             # Offsets column is unnamed and has no children
-            v.children_meta.append(pylibcudf.interop.ColumnMetadata(""))
+            v.children_meta.append(plc.interop.ColumnMetadata(""))
             v.children_meta.extend(
                 gather_metadata({"": dtype.element_type})
             )
@@ -81,7 +79,7 @@ cdef class DeviceScalar:
     # that from_unique_ptr is implemented is probably dereferencing this in an
    # invalid state. See what the best way to fix that is.
     def __cinit__(self, *args, **kwargs):
-        self.c_value = pylibcudf.Scalar.__new__(pylibcudf.Scalar)
+        self.c_value = plc.Scalar.__new__(plc.Scalar)
 
     def __init__(self, value, dtype):
         """
@@ -127,20 +125,20 @@
             pa_array = pa.array([pa.scalar(value, type=pa_type)])
 
             pa_table = pa.Table.from_arrays([pa_array], names=[""])
-            table = pylibcudf.interop.from_arrow(pa_table)
+            table = plc.interop.from_arrow(pa_table)
 
             column = table.columns()[0]
             if isinstance(dtype, cudf.core.dtypes.DecimalDtype):
                 if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype):
-                    column = pylibcudf.unary.cast(
-                        column, pylibcudf.DataType(pylibcudf.TypeId.DECIMAL32, -dtype.scale)
+                    column = plc.unary.cast(
+                        column, plc.DataType(plc.TypeId.DECIMAL32, -dtype.scale)
                     )
                 elif isinstance(dtype, cudf.core.dtypes.Decimal64Dtype):
-                    column = pylibcudf.unary.cast(
-                        column, pylibcudf.DataType(pylibcudf.TypeId.DECIMAL64, -dtype.scale)
+                    column = plc.unary.cast(
+                        column, plc.DataType(plc.TypeId.DECIMAL64, -dtype.scale)
                     )
 
-            self.c_value = pylibcudf.copying.get_element(column, 0)
+            self.c_value = plc.copying.get_element(column, 0)
             self._dtype = dtype
 
     def _to_host_scalar(self):
@@ -150,7 +148,7 @@
         null_type = NaT if is_datetime or is_timedelta else NA
 
         metadata = gather_metadata({"": self.dtype})[0]
-        ps = pylibcudf.interop.to_arrow(self.c_value, metadata)
+        ps = plc.interop.to_arrow(self.c_value, metadata)
         if not ps.is_valid:
             return null_type
 
@@ -225,43 +223,42 @@ cdef class DeviceScalar:
         return s
 
     cdef void _set_dtype(self, dtype=None):
-        cdef libcudf_types.data_type cdtype = self.get_raw_ptr()[0].type()
-
+        cdef plc_TypeID cdtype_id = self.c_value.type().id()
         if dtype is not None:
             self._dtype = dtype
-        elif cdtype.id() in {
-            libcudf_types.type_id.DECIMAL32,
-            libcudf_types.type_id.DECIMAL64,
-            libcudf_types.type_id.DECIMAL128,
+        elif cdtype_id in {
+            plc_TypeID.DECIMAL32,
+            plc_TypeID.DECIMAL64,
+            plc_TypeID.DECIMAL128,
         }:
             raise TypeError(
                 "Must pass a dtype when constructing from a fixed-point scalar"
             )
-        elif cdtype.id() == libcudf_types.type_id.STRUCT:
+        elif cdtype_id == plc_TypeID.STRUCT:
             struct_table_view = (<struct_scalar*>self.get_raw_ptr())[0].view()
             self._dtype = StructDtype({
                 str(i): dtype_from_column_view(struct_table_view.column(i))
                 for i in range(struct_table_view.num_columns())
             })
-        elif cdtype.id() == libcudf_types.type_id.LIST:
+        elif cdtype_id == plc_TypeID.LIST:
             if (
                 <list_scalar*>self.get_raw_ptr()
-            )[0].view().type().id() == libcudf_types.type_id.LIST:
+            )[0].view().type().id() == plc_TypeID.LIST:
                 self._dtype = dtype_from_column_view(
                     (<list_scalar*>self.get_raw_ptr())[0].view()
                 )
             else:
                 self._dtype = ListDtype(
-                    LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[
+                    PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[
                         <underlying_type_t_type_id>(
                             (<list_scalar*>self.get_raw_ptr())[0]
                             .view().type().id()
                         )
                     ]
                 )
         else:
-            self._dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[
-                <underlying_type_t_type_id>(cdtype.id())
+            self._dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[
+                <underlying_type_t_type_id>(cdtype_id)
             ]
 
 
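Taken together, the scalar.pyx hunks keep `DeviceScalar` as a shim over `pylibcudf.Scalar`, as the retained comment says: a host value travels through pyarrow into a one-row device column, `get_element` slices out row 0 as a device scalar, and `to_arrow` with a `ColumnMetadata` brings it back to the host. A hedged sketch of that round trip using the same pylibcudf calls the diff uses (dtype plumbing and the decimal re-cast omitted; the two function names are illustrative):

```python
# Sketch of the host -> device -> host scalar round trip from the diff.
import pyarrow as pa
import pylibcudf as plc

def host_value_to_plc_scalar(value) -> plc.Scalar:
    # One-row Arrow table -> device table; row 0 becomes a device scalar.
    pa_table = pa.Table.from_arrays([pa.array([value])], names=[""])
    column = plc.interop.from_arrow(pa_table).columns()[0]
    return plc.copying.get_element(column, 0)

def plc_scalar_to_host(s: plc.Scalar):
    # to_arrow requires column metadata even for an unnamed scalar.
    ps = plc.interop.to_arrow(s, plc.interop.ColumnMetadata(""))
    return ps.as_py() if ps.is_valid else None
```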
5 changes: 0 additions & 5 deletions python/cudf/cudf/_lib/types.pxd
@@ -1,16 +1,11 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport int32_t
-from libcpp cimport bool
 
-cimport pylibcudf.libcudf.types as libcudf_types
 from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view
 
 ctypedef int32_t underlying_type_t_type_id
 
 cdef dtype_from_column_view(column_view cv)
 
-cdef libcudf_types.data_type dtype_to_data_type(dtype) except *
 cpdef dtype_to_pylibcudf_type(dtype)
-cdef bool is_decimal_type_id(libcudf_types.type_id tid) except *
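The header keeps `ctypedef int32_t underlying_type_t_type_id` because scalar.pyx still casts a `type_id` enum to its underlying integer before indexing `PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES`. A hypothetical Python-level analogue of that lookup (entries abbreviated; `TYPE_ID_TO_NUMPY` is an illustrative name, not the real cudf table):

```python
# Sketch: resolve a pylibcudf type id to the NumPy dtype it maps onto.
import numpy as np
import pylibcudf as plc

TYPE_ID_TO_NUMPY = {
    plc.TypeId.INT32: np.dtype("int32"),
    plc.TypeId.FLOAT64: np.dtype("float64"),
    plc.TypeId.BOOL8: np.dtype("bool"),
}

def numpy_dtype_of(plc_type: plc.DataType) -> np.dtype:
    return TYPE_ID_TO_NUMPY[plc_type.id()]
```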
