From b8ec883d8a6a357d6330d0a1cedb6312f841917d Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Fri, 13 May 2022 13:25:11 -0500 Subject: [PATCH 1/2] PERF-#4445: Stop recomputing both indices for axis-wide applies. Signed-off-by: mvashishtha --- .../dataframe/pandas/dataframe/dataframe.py | 30 +++++++++++++++++-- .../storage_formats/pandas/query_compiler.py | 18 ++++++++++- 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 7381e2c7125..65d0cd3a955 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -1864,6 +1864,7 @@ def apply_full_axis( new_index=None, new_columns=None, dtypes=None, + func_may_change_complementary_index_size=True, ): """ Perform a function across an entire axis. @@ -1884,6 +1885,14 @@ def apply_full_axis( The data types of the result. This is an optimization because there are functions that always result in a particular data type, and allows us to avoid (re)computing it. + func_may_change_complementary_index_size : bool, optional + Whether the per-axis function may change the complementary axis + width/length for each partition. e.g. for a function that applies + per column, this parameter should only be set to false if that + function is guaranteed not to change the number of columns. Setting + this parameter to false can improve performance because the + resulting frame doesn't have to recompute the complementary axis + lengths for its partitions. Returns ------- @@ -1894,6 +1903,13 @@ def apply_full_axis( ----- The data shape may change as a result of the function. 
""" + new_row_lengths = None + new_column_widths = None + if not func_may_change_complementary_index_size: + if axis == 0: + new_column_widths = self._column_widths + else: + new_row_lengths = self._row_lengths return self.broadcast_apply_full_axis( axis=axis, func=func, @@ -1901,6 +1917,8 @@ def apply_full_axis( new_columns=new_columns, dtypes=dtypes, other=None, + new_row_lengths=new_row_lengths, + new_column_widths=new_column_widths, ) @lazy_metadata_decorator(apply_axis="both") @@ -2267,6 +2285,8 @@ def broadcast_apply_full_axis( apply_indices=None, enumerate_partitions=False, dtypes=None, + new_row_lengths=None, + new_column_widths=None, ): """ Broadcast partitions of `other` Modin DataFrame and apply a function along full axis. @@ -2294,6 +2314,10 @@ def broadcast_apply_full_axis( Data types of the result. This is an optimization because there are functions that always result in a particular data type, and allows us to avoid (re)computing it. + new_row_lengths : list, optional + The length of each partition in the rows. + new_column_widths : list, optional + The width of each partition in the columns. 
Returns ------- @@ -2336,9 +2360,9 @@ def broadcast_apply_full_axis( result = self.__constructor__( new_partitions, *new_axes, - None, - None, - dtypes, + column_widths=new_column_widths, + row_lengths=new_row_lengths, + dtypes=dtypes, ) if new_index is not None: result.synchronize_labels(axis=0) diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 3994d4ee50b..4d3ba09043b 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -2434,6 +2434,7 @@ def _list_like_func(self, func, axis, *args, **kwargs): lambda df: pandas.DataFrame(df.apply(func, axis, *args, **kwargs)), new_index=new_index, new_columns=new_columns, + func_may_change_complementary_index_size=False, ) return self.__constructor__(new_modin_frame) @@ -2462,8 +2463,23 @@ def _callable_func(self, func, axis, *args, **kwargs): if callable(func): func = wrap_udf_function(func) + if axis == 0: + new_index = None + new_columns = self._modin_frame.columns + else: + new_index = self._modin_frame.index + new_columns = None new_modin_frame = self._modin_frame.apply_full_axis( - axis, lambda df: df.apply(func, axis=axis, *args, **kwargs) + axis, + lambda df: df.apply( + func, + axis=axis, + *args, + **kwargs, + ), + new_index=new_index, + new_columns=new_columns, + func_may_change_complementary_index_size=False, ) return self.__constructor__(new_modin_frame) From a1b861db3d4eb5b8bc1d9563fde15c52cd596e0d Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Fri, 13 May 2022 13:29:33 -0500 Subject: [PATCH 2/2] Add a release note. 
Signed-off-by: mvashishtha --- docs/release_notes/release_notes-0.15.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/release_notes/release_notes-0.15.0.rst b/docs/release_notes/release_notes-0.15.0.rst index ff448f70b56..e95dade92a0 100644 --- a/docs/release_notes/release_notes-0.15.0.rst +++ b/docs/release_notes/release_notes-0.15.0.rst @@ -22,6 +22,7 @@ Key Features and Updates * FIX-#4449: Drain the call queue before waiting on result in benchmark mode (#4472) * Performance enhancements * FEAT-#4320: Add connectorx as an alternative engine for read_sql (#4346) + * PERF-#4445: Stop recomputing both indices for axis-wide applies (#4460) * Benchmarking enhancements * * Refactor Codebase