From b8ec883d8a6a357d6330d0a1cedb6312f841917d Mon Sep 17 00:00:00 2001
From: mvashishtha <mahesh@ponder.io>
Date: Fri, 13 May 2022 13:25:11 -0500
Subject: [PATCH 1/2] PERF-#4445: Stop recomputing both indices for axis-wide
 applies.

Signed-off-by: mvashishtha <mahesh@ponder.io>
---
 .../dataframe/pandas/dataframe/dataframe.py   | 30 +++++++++++++++++--
 .../storage_formats/pandas/query_compiler.py  | 18 ++++++++++-
 2 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py
index 7381e2c7125..65d0cd3a955 100644
--- a/modin/core/dataframe/pandas/dataframe/dataframe.py
+++ b/modin/core/dataframe/pandas/dataframe/dataframe.py
@@ -1864,6 +1864,7 @@ def apply_full_axis(
         new_index=None,
         new_columns=None,
         dtypes=None,
+        func_may_change_complementary_index_size=True,
     ):
         """
         Perform a function across an entire axis.
@@ -1884,6 +1885,14 @@ def apply_full_axis(
             The data types of the result. This is an optimization
             because there are functions that always result in a particular data
             type, and allows us to avoid (re)computing it.
+        func_may_change_complementary_index_size : bool, default: True
+            Whether the per-axis function may change the complementary axis
+            width/length for each partition. e.g. for a function that applies
+            per column, this parameter should only be set to False if that
+            function is guaranteed not to change the number of columns. Setting
+            this parameter to False can improve performance because the
+            resulting frame doesn't have to recompute the complementary axis
+            lengths for its partitions.
 
         Returns
         -------
@@ -1894,6 +1903,13 @@ def apply_full_axis(
         -----
         The data shape may change as a result of the function.
         """
+        new_row_lengths = None
+        new_column_widths = None
+        if not func_may_change_complementary_index_size:
+            if axis == 0:
+                new_column_widths = self._column_widths
+            else:
+                new_row_lengths = self._row_lengths
         return self.broadcast_apply_full_axis(
             axis=axis,
             func=func,
@@ -1901,6 +1917,8 @@ def apply_full_axis(
             new_columns=new_columns,
             dtypes=dtypes,
             other=None,
+            new_row_lengths=new_row_lengths,
+            new_column_widths=new_column_widths,
         )
 
     @lazy_metadata_decorator(apply_axis="both")
@@ -2267,6 +2285,8 @@ def broadcast_apply_full_axis(
         apply_indices=None,
         enumerate_partitions=False,
         dtypes=None,
+        new_row_lengths=None,
+        new_column_widths=None,
     ):
         """
         Broadcast partitions of `other` Modin DataFrame and apply a function along full axis.
@@ -2294,6 +2314,10 @@ def broadcast_apply_full_axis(
             Data types of the result. This is an optimization
             because there are functions that always result in a particular data
             type, and allows us to avoid (re)computing it.
+        new_row_lengths : list, optional
+            The length of each partition in the rows.
+        new_column_widths : list, optional
+            The width of each partition in the columns.
 
         Returns
         -------
@@ -2336,9 +2360,9 @@ def broadcast_apply_full_axis(
         result = self.__constructor__(
             new_partitions,
             *new_axes,
-            None,
-            None,
-            dtypes,
+            column_widths=new_column_widths,
+            row_lengths=new_row_lengths,
+            dtypes=dtypes,
         )
         if new_index is not None:
             result.synchronize_labels(axis=0)
diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py
index 3994d4ee50b..4d3ba09043b 100644
--- a/modin/core/storage_formats/pandas/query_compiler.py
+++ b/modin/core/storage_formats/pandas/query_compiler.py
@@ -2434,6 +2434,7 @@ def _list_like_func(self, func, axis, *args, **kwargs):
             lambda df: pandas.DataFrame(df.apply(func, axis, *args, **kwargs)),
             new_index=new_index,
             new_columns=new_columns,
+            func_may_change_complementary_index_size=False,
         )
         return self.__constructor__(new_modin_frame)
 
@@ -2462,8 +2463,23 @@ def _callable_func(self, func, axis, *args, **kwargs):
         if callable(func):
             func = wrap_udf_function(func)
 
+        if axis == 0:
+            new_index = None
+            new_columns = self._modin_frame.columns
+        else:
+            new_index = self._modin_frame.index
+            new_columns = None
         new_modin_frame = self._modin_frame.apply_full_axis(
-            axis, lambda df: df.apply(func, axis=axis, *args, **kwargs)
+            axis,
+            lambda df: df.apply(
+                func,
+                axis=axis,
+                *args,
+                **kwargs,
+            ),
+            new_index=new_index,
+            new_columns=new_columns,
+            func_may_change_complementary_index_size=False,
         )
         return self.__constructor__(new_modin_frame)
 

From a1b861db3d4eb5b8bc1d9563fde15c52cd596e0d Mon Sep 17 00:00:00 2001
From: mvashishtha <mahesh@ponder.io>
Date: Fri, 13 May 2022 13:29:33 -0500
Subject: [PATCH 2/2] Add a release note.

Signed-off-by: mvashishtha <mahesh@ponder.io>
---
 docs/release_notes/release_notes-0.15.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/release_notes/release_notes-0.15.0.rst b/docs/release_notes/release_notes-0.15.0.rst
index ff448f70b56..e95dade92a0 100644
--- a/docs/release_notes/release_notes-0.15.0.rst
+++ b/docs/release_notes/release_notes-0.15.0.rst
@@ -22,6 +22,7 @@ Key Features and Updates
   * FIX-#4449: Drain the call queue before waiting on result in benchmark mode (#4472)
 * Performance enhancements
   * FEAT-#4320: Add connectorx as an alternative engine for read_sql (#4346)
+  * PERF-#4445: Stop recomputing both indices for axis-wide applies (#4460)
 * Benchmarking enhancements
   *
 * Refactor Codebase