feat: add DataFrame and LazyFrame explode method #1542

Open · wants to merge 5 commits into base: main
Changes from 3 commits
1 change: 1 addition & 0 deletions docs/api-reference/dataframe.md
@@ -11,6 +11,7 @@
- columns
- drop
- drop_nulls
- explode
- filter
- gather_every
- get_column
1 change: 1 addition & 0 deletions docs/api-reference/lazyframe.md
@@ -10,6 +10,7 @@
- columns
- drop
- drop_nulls
- explode
- filter
- gather_every
- group_by
76 changes: 76 additions & 0 deletions narwhals/_arrow/dataframe.py
@@ -19,6 +19,7 @@
from narwhals.utils import Implementation
from narwhals.utils import flatten
from narwhals.utils import generate_temporary_column_name
from narwhals.utils import import_dtypes_module
from narwhals.utils import is_sequence_but_not_str
from narwhals.utils import parse_columns_to_drop

@@ -743,3 +744,78 @@ def unpivot(
)
# TODO(Unassigned): Even with promote_options="permissive", pyarrow does not
# upcast numeric to non-numeric (e.g. string) datatypes

    def explode(self: Self, columns: str | Sequence[str], *more_columns: str) -> Self:
Member Author:
pyarrow has two paths:

  • if nulls or empty lists are not present, then it is enough to:
    1. make sure element counts are the same
    2. explode each array individually
  • if nulls or empty lists are present, then they are ignored by pc.list_parent_indices and pc.list_flatten, which is a problem. In that case this implementation falls back to Python lists both to flatten the array(s) and to build the corresponding indices.

After flattening, a new table is created by take-ing the computed indices on the non-exploded arrays and combining them with the flattened arrays. A small illustration of the two kernels' behaviour follows below.
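
To make the two paths concrete, here is a small illustration (not part of the diff; the array is made up for the example) of how the relevant pyarrow.compute kernels treat null and empty-list entries:

```python
import pyarrow as pa
import pyarrow.compute as pc

# One null row and one empty-list row alongside ordinary lists.
arr = pa.array([[1, 2], None, [], [3]], type=pa.list_(pa.int64()))

print(pc.list_value_length(arr).to_pylist())    # [2, None, 0, 1]
print(pc.list_flatten(arr).to_pylist())         # [1, 2, 3] -- the null and empty rows vanish
print(pc.list_parent_indices(arr).to_pylist())  # [0, 0, 3] -- no indices for rows 1 and 2
```

This is why the fallback path rebuilds the indices in Python and keeps exactly one (null) row for each null or empty list.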

        import pyarrow as pa
        import pyarrow.compute as pc

        from narwhals.exceptions import InvalidOperationError

        dtypes = import_dtypes_module(self._version)

        to_explode = (
            [columns, *more_columns]
            if isinstance(columns, str)
            else [*columns, *more_columns]
        )

        schema = self.collect_schema()
        for col_to_explode in to_explode:
            dtype = schema[col_to_explode]

            if dtype != dtypes.List:
                msg = f"`explode` operation not supported for dtype `{dtype}`"
                raise InvalidOperationError(msg)

        native_frame = self._native_frame
        counts = pc.list_value_length(native_frame[to_explode[0]])

        if not all(
            pc.all(pc.equal(pc.list_value_length(native_frame[col_name]), counts)).as_py()
            for col_name in to_explode[1:]
        ):
            from narwhals.exceptions import ShapeError

            msg = "exploded columns must have matching element counts"
            raise ShapeError(msg)

        original_columns = self.columns
        other_columns = [c for c in original_columns if c not in to_explode]
        fast_path = pc.all(pc.greater_equal(counts, 1)).as_py()

        if fast_path:
            indices = pc.list_parent_indices(native_frame[to_explode[0]])
            flatten_func = pc.list_flatten
        else:
            indices = pa.array(
                [
                    i
                    for i, count in enumerate(counts.to_pylist())
                    for _ in range(max(count or 1, 1))
                ]
            )

            def explode_null_array(array: pa.ChunkedArray) -> pa.ChunkedArray:
                exploded_values = []  # type: ignore[var-annotated]
                for lst_element in array.to_pylist():
                    if lst_element is None or len(lst_element) == 0:
                        exploded_values.append(None)
                    else:  # Non-empty list
                        exploded_values.extend(lst_element)
                return pa.chunked_array([exploded_values])

            flatten_func = explode_null_array

        arrays = [
            native_frame[col_name].take(indices=indices)
            if col_name in other_columns
            else flatten_func(native_frame[col_name])
            for col_name in original_columns
        ]

        return self._from_native_frame(
            pa.Table.from_arrays(
                arrays=arrays,
                names=original_columns,
            )
        )
49 changes: 49 additions & 0 deletions narwhals/_pandas_like/dataframe.py
@@ -937,3 +937,52 @@ def unpivot(
value_name=value_name if value_name is not None else "value",
)
)

    def explode(self: Self, columns: str | Sequence[str], *more_columns: str) -> Self:
Member Author:
If a single column is to be exploded, we use the native pandas method. If multiple columns are to be exploded, the strategy is to explode the first of them together with the rest of the dataframe, explode the remaining series individually, concatenate everything back, and finally restore the original column order (a minimal sketch follows below).
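
A minimal, self-contained sketch of that strategy (illustrative only, with made-up data; the real implementation is in the diff below):

```python
import pandas as pd

df = pd.DataFrame({"a": ["x", "y"], "lst1": [[1, 2], [3]], "lst2": [[4, 5], [6]]})

# Explode the first list column together with the non-list columns...
exploded_frame = df[["a", "lst1"]].explode("lst1")
# ...explode the remaining list columns on their own (same index, same element counts)...
exploded_series = [df["lst2"].explode().to_frame()]
# ...then concatenate by index and restore the original column order.
out = pd.concat([exploded_frame, *exploded_series], axis=1)[df.columns]
print(out)
#    a lst1 lst2
# 0  x    1    4
# 0  x    2    5
# 1  y    3    6
```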

        from narwhals.exceptions import InvalidOperationError

        dtypes = import_dtypes_module(self._version)

        to_explode = (
            [columns, *more_columns]
            if isinstance(columns, str)
            else [*columns, *more_columns]
        )
        schema = self.collect_schema()
        for col_to_explode in to_explode:
            dtype = schema[col_to_explode]

            if dtype != dtypes.List:
                msg = f"`explode` operation not supported for dtype `{dtype}`"
                raise InvalidOperationError(msg)

        if len(to_explode) == 1:
            return self._from_native_frame(self._native_frame.explode(to_explode[0]))
        else:
            native_frame = self._native_frame
            anchor_series = native_frame[to_explode[0]].list.len()

            if not all(
                (native_frame[col_name].list.len() == anchor_series).all()
                for col_name in to_explode[1:]
            ):
                from narwhals.exceptions import ShapeError

                msg = "exploded columns must have matching element counts"
                raise ShapeError(msg)

            original_columns = self.columns
            other_columns = [c for c in original_columns if c not in to_explode]

            exploded_frame = native_frame[[*other_columns, to_explode[0]]].explode(
                to_explode[0]
            )
            exploded_series = [
                native_frame[col_name].explode().to_frame() for col_name in to_explode[1:]
            ]

            plx = self.__native_namespace__()

            return self._from_native_frame(
                plx.concat([exploded_frame, *exploded_series], axis=1)[original_columns]
            )
134 changes: 132 additions & 2 deletions narwhals/dataframe.py
@@ -334,6 +334,14 @@ def __eq__(self, other: object) -> NoReturn:
)
raise NotImplementedError(msg)

    def explode(self: Self, columns: str | Sequence[str], *more_columns: str) -> Self:
        return self._from_compliant_dataframe(
            self._compliant_frame.explode(
                columns,
                *more_columns,
            )
        )


class DataFrame(BaseFrame[DataFrameT]):
"""Narwhals DataFrame, backed by a native dataframe.
@@ -572,8 +580,6 @@ def to_pandas(self) -> pd.DataFrame:
0 1 6.0 a
1 2 7.0 b
2 3 8.0 c


"""
return self._compliant_frame.to_pandas()

@@ -2925,6 +2931,77 @@ def unpivot(
on=on, index=index, variable_name=variable_name, value_name=value_name
)

def explode(self: Self, columns: str | Sequence[str], *more_columns: str) -> Self:
"""Explode the dataframe to long format by exploding the given columns.

Notes:
It is possible to explode multiple columns only if those columns have
matching element counts.

Arguments:
columns: Column names. The underlying columns being exploded must be of the `List` data type.
*more_columns: Additional names of columns to explode, specified as positional arguments.

Returns:
New DataFrame

Examples:
>>> import narwhals as nw
>>> from narwhals.typing import IntoDataFrameT
>>> import pandas as pd
>>> import polars as pl
>>> import pyarrow as pa
>>> data = {
... "a": ["x", "y", "z", "w"],
... "lst1": [[1, 2], None, [None], []],
... "lst2": [[3, None], None, [42], []],
... }

We define a library agnostic function:

>>> def agnostic_explode(df_native: IntoDataFrameT) -> IntoDataFrameT:
... return (
... nw.from_native(df_native)
... .with_columns(nw.col("lst1", "lst2").cast(nw.List(nw.Int32())))
... .explode("lst1", "lst2")
... .to_native()
... )

We can then pass any supported library such as pandas, Polars (eager),
or PyArrow to `agnostic_explode`:

>>> agnostic_explode(pd.DataFrame(data))
a lst1 lst2
0 x 1 3
0 x 2 <NA>
1 y <NA> <NA>
2 z <NA> 42
3 w <NA> <NA>
>>> agnostic_explode(pl.DataFrame(data))
shape: (5, 3)
β”Œβ”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”
β”‚ a ┆ lst1 ┆ lst2 β”‚
β”‚ --- ┆ --- ┆ --- β”‚
β”‚ str ┆ i32 ┆ i32 β”‚
β•žβ•β•β•β•β•β•ͺ══════β•ͺ══════║
β”‚ x ┆ 1 ┆ 3 β”‚
β”‚ x ┆ 2 ┆ null β”‚
β”‚ y ┆ null ┆ null β”‚
β”‚ z ┆ null ┆ 42 β”‚
β”‚ w ┆ null ┆ null β”‚
β””β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”˜
>>> agnostic_explode(pa.table(data))
pyarrow.Table
a: string
lst1: int64
lst2: int64
----
a: [["x","x","y","z","w"]]
lst1: [[1,2,null,null,null]]
lst2: [[3,null,null,42,null]]
"""
return super().explode(columns, *more_columns)


class LazyFrame(BaseFrame[FrameT]):
"""Narwhals DataFrame, backed by a native dataframe.
@@ -4643,3 +4720,56 @@ def unpivot(
return super().unpivot(
on=on, index=index, variable_name=variable_name, value_name=value_name
)

def explode(self: Self, columns: str | Sequence[str], *more_columns: str) -> Self:
"""Explode the dataframe to long format by exploding the given columns.

Notes:
It is possible to explode multiple columns only if those columns have
matching element counts.

Arguments:
columns: Column names. The underlying columns being exploded must be of the `List` data type.
*more_columns: Additional names of columns to explode, specified as positional arguments.

Returns:
New LazyFrame

Examples:
>>> import narwhals as nw
>>> from narwhals.typing import IntoFrameT
>>> import polars as pl
>>> data = {
... "a": ["x", "y", "z", "w"],
... "lst1": [[1, 2], None, [None], []],
... "lst2": [[3, None], None, [42], []],
... }

We define a library agnostic function:

>>> def agnostic_explode(df_native: IntoFrameT) -> IntoFrameT:
... return (
... nw.from_native(df_native)
... .with_columns(nw.col("lst1", "lst2").cast(nw.List(nw.Int32())))
... .explode("lst1", "lst2")
... .to_native()
... )

We can then pass any supported library such as pandas, Polars (eager),
or PyArrow to `agnostic_explode`:

>>> agnostic_explode(pl.LazyFrame(data)).collect()
shape: (5, 3)
β”Œβ”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”
β”‚ a ┆ lst1 ┆ lst2 β”‚
β”‚ --- ┆ --- ┆ --- β”‚
β”‚ str ┆ i32 ┆ i32 β”‚
β•žβ•β•β•β•β•β•ͺ══════β•ͺ══════║
β”‚ x ┆ 1 ┆ 3 β”‚
β”‚ x ┆ 2 ┆ null β”‚
β”‚ y ┆ null ┆ null β”‚
β”‚ z ┆ null ┆ 42 β”‚
β”‚ w ┆ null ┆ null β”‚
β””β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”˜
"""
return super().explode(columns, *more_columns)
4 changes: 4 additions & 0 deletions narwhals/exceptions.py
@@ -35,6 +35,10 @@ def from_missing_and_available_column_names(
return ColumnNotFoundError(message)


class ShapeError(Exception):
"""Exception raised when trying to perform operations on data structures with incompatible shapes."""


class InvalidOperationError(Exception):
"""Exception raised during invalid operations."""

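As a hedged usage sketch of the new exception (assuming the explode API added in this PR and a recent pandas with Arrow-backed list dtypes), mismatched element counts across exploded columns surface as ShapeError:

```python
import pandas as pd

import narwhals as nw
from narwhals.exceptions import ShapeError

data = {"a": ["x"], "lst1": [[1, 2]], "lst2": [[3]]}  # mismatched element counts
df = nw.from_native(pd.DataFrame(data)).with_columns(
    nw.col("lst1", "lst2").cast(nw.List(nw.Int32()))
)

try:
    df.explode("lst1", "lst2")
except ShapeError as exc:
    print(exc)  # exploded columns must have matching element counts
```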