From 7efe546548c3c77472122622e6eb2ea1c56e69e5 Mon Sep 17 00:00:00 2001
From: Chris Neffshade <caneff@google.com>
Date: Fri, 8 Sep 2023 14:40:03 +0000
Subject: [PATCH 01/26] Remove level and errors arguments for Pandas 2
 compatibility. Level was always NotImplemented anyway, and errors I
 conditionally disable based on the pandas version to keep functionality.

---
 sdks/python/apache_beam/dataframe/frames.py   | 41 ++++++++-----------
 .../apache_beam/dataframe/frames_test.py      |  3 ++
 2 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/sdks/python/apache_beam/dataframe/frames.py b/sdks/python/apache_beam/dataframe/frames.py
index 3020eecbaeb5..5a24be2d9561 100644
--- a/sdks/python/apache_beam/dataframe/frames.py
+++ b/sdks/python/apache_beam/dataframe/frames.py
@@ -914,7 +914,7 @@ def sort_index(self, axis, **kwargs):
   @frame_base.args_to_kwargs(pd.DataFrame)
   @frame_base.populate_defaults(pd.DataFrame)
   @frame_base.maybe_inplace
-  def where(self, cond, other, errors, **kwargs):
+  def where(self, cond, other, **kwargs):
     """where is not parallelizable when ``errors="ignore"`` is specified."""
     requires = partitionings.Arbitrary()
     deferred_args = {}
@@ -934,16 +934,18 @@ def where(self, cond, other, errors, **kwargs):
     else:
       actual_args['other'] = other
 
-    if errors == "ignore":
-      # We need all data in order to ignore errors and propagate the original
-      # data.
-      requires = partitionings.Singleton(
-          reason=(
-              f"where(errors={errors!r}) is currently not parallelizable, "
-              "because all data must be collected on one node to determine if "
-              "the original data should be propagated instead."))
+    # For Pandas 2.0, errors was removed as an argument.
+    if PD_VERSION < (2, 0):
+      if "errors" in kwargs and kwargs['errors'] == "ignore":
+        # We need all data in order to ignore errors and propagate the original
+        # data.
+        requires = partitionings.Singleton(
+            reason=(
+                f"where(errors={kwargs['errors']!r}) is currently not parallelizable, "
+                "because all data must be collected on one node to determine if "
+                "the original data should be propagated instead."))
 
-    actual_args['errors'] = errors
+      actual_args['errors'] = kwargs['errors']
 
     def where_execution(df, *args):
       runtime_values = {
@@ -1602,12 +1604,7 @@ def mean(self, skipna, **kwargs):
   @frame_base.with_docs_from(pd.Series)
   @frame_base.args_to_kwargs(pd.Series)
   @frame_base.populate_defaults(pd.Series)
-  def var(self, axis, skipna, level, ddof, **kwargs):
-    """Per-level aggregation is not yet supported
-    (https://github.com/apache/beam/issues/21829). Only the default,
-    ``level=None``, is allowed."""
-    if level is not None:
-      raise NotImplementedError("per-level aggregation")
+  def var(self, axis, skipna, ddof, **kwargs):
     if skipna is None or skipna:
       self = self.dropna()  # pylint: disable=self-cls-assignment
 
@@ -1677,9 +1674,7 @@ def corr(self, other, method, min_periods):
   @frame_base.with_docs_from(pd.Series)
   @frame_base.args_to_kwargs(pd.Series)
   @frame_base.populate_defaults(pd.Series)
-  def skew(self, axis, skipna, level, numeric_only, **kwargs):
-    if level is not None:
-      raise NotImplementedError("per-level aggregation")
+  def skew(self, axis, skipna, numeric_only, **kwargs):
     if skipna is None or skipna:
       self = self.dropna()  # pylint: disable=self-cls-assignment
     # See the online, numerically stable formulae at
@@ -1741,9 +1736,7 @@ def combine_moments(data):
   @frame_base.with_docs_from(pd.Series)
   @frame_base.args_to_kwargs(pd.Series)
   @frame_base.populate_defaults(pd.Series)
-  def kurtosis(self, axis, skipna, level, numeric_only, **kwargs):
-    if level is not None:
-      raise NotImplementedError("per-level aggregation")
+  def kurtosis(self, axis, skipna, numeric_only, **kwargs):
     if skipna is None or skipna:
       self = self.dropna()  # pylint: disable=self-cls-assignment
 
@@ -2571,7 +2564,8 @@ def align(self, other, join, axis, copy, level, method, **kwargs):
     if kwargs:
       raise NotImplementedError('align(%s)' % ', '.join(kwargs.keys()))
 
-    if level is not None:
+    # In Pandas 2.0, all aggregations lost the level keyword.
+    if PD_VERSION < (2, 0) and level is not None:
       # Could probably get by partitioning on the used levels.
       requires_partition_by = partitionings.Singleton(reason=(
           f"align(level={level}) is not currently parallelizable. Only "
@@ -5376,6 +5370,7 @@ def func(df, *args, **kwargs):
           name,
           frame_base._elementwise_method(name, restrictions={'level': None},
                                          base=pd.Series))
+
     if hasattr(pd.DataFrame, name):
       setattr(
           DeferredDataFrame,
diff --git a/sdks/python/apache_beam/dataframe/frames_test.py b/sdks/python/apache_beam/dataframe/frames_test.py
index a2a8ef75f885..1bf3a400d578 100644
--- a/sdks/python/apache_beam/dataframe/frames_test.py
+++ b/sdks/python/apache_beam/dataframe/frames_test.py
@@ -2036,6 +2036,7 @@ def test_dataframe_agg_modes(self):
     self._run_test(lambda df: df.agg({'A': ['sum', 'mean']}), df)
     self._run_test(lambda df: df.agg({'A': ['sum', 'mean'], 'B': 'min'}), df)
 
+  @unittest.skipIf(PD_VERSION >= (2, 0), "level argument removed in Pandas 2")
   def test_series_agg_level(self):
     self._run_test(
         lambda df: df.set_index(['group', 'foo']).bar.count(level=0),
@@ -2059,6 +2060,7 @@ def test_series_agg_level(self):
         lambda df: df.set_index(['group', 'foo']).bar.median(level=1),
         GROUPBY_DF)
 
+  @unittest.skipIf(PD_VERSION >= (2, 0), "level argument removed in Pandas 2")
   def test_dataframe_agg_level(self):
     self._run_test(
         lambda df: df.set_index(['group', 'foo']).count(level=0), GROUPBY_DF)
@@ -2226,6 +2228,7 @@ def test_df_agg_method_invalid_kwarg_raises(self):
     self._run_error_test(
         lambda df: df.median(min_count=3, numeric_only=True), GROUPBY_DF)
 
+  @unittest.skipIf(PD_VERSION >= (2, 0), "level argument removed in Pandas 2")
   def test_agg_min_count(self):
     df = pd.DataFrame({
         'good': [1, 2, 3, np.nan],

From 64be5a8d29b5f3fe33d3a22829cb8a43c7725a1a Mon Sep 17 00:00:00 2001
From: Chris Neffshade <caneff@google.com>
Date: Fri, 8 Sep 2023 14:51:45 +0000
Subject: [PATCH 02/26] Fix formatting

---
 sdks/python/apache_beam/dataframe/frames.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/sdks/python/apache_beam/dataframe/frames.py b/sdks/python/apache_beam/dataframe/frames.py
index 5a24be2d9561..7c7a587c2116 100644
--- a/sdks/python/apache_beam/dataframe/frames.py
+++ b/sdks/python/apache_beam/dataframe/frames.py
@@ -941,9 +941,10 @@ def where(self, cond, other, **kwargs):
         # data.
         requires = partitionings.Singleton(
             reason=(
-                f"where(errors={kwargs['errors']!r}) is currently not parallelizable, "
-                "because all data must be collected on one node to determine if "
-                "the original data should be propagated instead."))
+                f"where(errors={kwargs['errors']!r}) is currently not "
+                 "parallelizable, because all data must be collected on one "
+                 "node to determine if the original data should be propagated "
+                 "instead."))
 
       actual_args['errors'] = kwargs['errors']
 

From 006c55b8a3d84728dcb29085c960d5147d5f7633 Mon Sep 17 00:00:00 2001
From: Chris Neffshade <caneff@google.com>
Date: Fri, 8 Sep 2023 14:56:03 +0000
Subject: [PATCH 03/26] More formatting

---
 sdks/python/apache_beam/dataframe/frames.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sdks/python/apache_beam/dataframe/frames.py b/sdks/python/apache_beam/dataframe/frames.py
index 7c7a587c2116..3fdacdb414f7 100644
--- a/sdks/python/apache_beam/dataframe/frames.py
+++ b/sdks/python/apache_beam/dataframe/frames.py
@@ -942,9 +942,9 @@ def where(self, cond, other, **kwargs):
         requires = partitionings.Singleton(
             reason=(
                 f"where(errors={kwargs['errors']!r}) is currently not "
-                 "parallelizable, because all data must be collected on one "
-                 "node to determine if the original data should be propagated "
-                 "instead."))
+                "parallelizable, because all data must be collected on one "
+                "node to determine if the original data should be propagated "
+                "instead."))
 
       actual_args['errors'] = kwargs['errors']
 

From a9bcdfdc2e7288ab7f5c338347ddf5c2b9b01f15 Mon Sep 17 00:00:00 2001
From: Chris Neffshade <caneff@google.com>
Date: Fri, 8 Sep 2023 15:26:39 +0000
Subject: [PATCH 04/26] Fix empty error case

---
 sdks/python/apache_beam/dataframe/frames.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sdks/python/apache_beam/dataframe/frames.py b/sdks/python/apache_beam/dataframe/frames.py
index 3fdacdb414f7..56cafa52a05f 100644
--- a/sdks/python/apache_beam/dataframe/frames.py
+++ b/sdks/python/apache_beam/dataframe/frames.py
@@ -946,7 +946,7 @@ def where(self, cond, other, **kwargs):
                 "node to determine if the original data should be propagated "
                 "instead."))
 
-      actual_args['errors'] = kwargs['errors']
+      actual_args['errors'] = kwargs['errors'] if 'errors' in kwargs else None
 
     def where_execution(df, *args):
       runtime_values = {

From 647b006d29d23009b53885de7a72950102314f30 Mon Sep 17 00:00:00 2001
From: Chris Neffshade <caneff@google.com>
Date: Mon, 11 Sep 2023 16:00:22 +0000
Subject: [PATCH 05/26] Remove append method for Pandas >= 2.0.

Pandas removed .append in 2.0 so this helps keep the API consistent.
---
 .../apache_beam/dataframe/frame_base.py       | 31 ++++++++++++++++---
 .../apache_beam/dataframe/frame_base_test.py  |  9 ++++++
 sdks/python/apache_beam/dataframe/frames.py   | 16 ++++++----
 .../apache_beam/dataframe/frames_test.py      |  2 ++
 4 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/sdks/python/apache_beam/dataframe/frame_base.py b/sdks/python/apache_beam/dataframe/frame_base.py
index 24497f1de069..09bb71875160 100644
--- a/sdks/python/apache_beam/dataframe/frame_base.py
+++ b/sdks/python/apache_beam/dataframe/frame_base.py
@@ -475,7 +475,7 @@ def wrapper(self, inplace=False, **kwargs):
   return wrapper
 
 
-def args_to_kwargs(base_type):
+def args_to_kwargs(base_type, ignore=False):
   """Convert all args to kwargs before calling the decorated function.
 
   When applied to a function, this decorator creates a new function
@@ -484,8 +484,15 @@ def args_to_kwargs(base_type):
   determine the name to use for arguments that are converted to keyword
   arguments.
 
-  For internal use only. No backwards compatibility guarantees."""
+  ignore used mainly in cases where a method has been removed in a later version
+  of Pandas.
+
+  For internal use only. No backwards compatibility guarantees.
+  """
   def wrap(func):
+    if ignore:
+      return func
+
     arg_names = getfullargspec(unwrap(getattr(base_type, func.__name__))).args
 
     @functools.wraps(func)
@@ -524,14 +531,21 @@ def wrapper(*args, **kwargs):
     f"**{BEAM_SPECIFIC!r}** for details.")
 
 
-def with_docs_from(base_type, name=None):
+def with_docs_from(base_type, name=None, ignore=False):
   """Decorator that updates the documentation from the wrapped function to
+
   duplicate the documentation from the identically-named method in `base_type`.
 
   Any docstring on the original function will be included in the new function
   under a "Differences from pandas" heading.
+
+  ignore used mainly in cases where a method has been removed in a later version
+  of Pandas.
   """
   def wrap(func):
+    if ignore:
+      return func
+
     fn_name = name or func.__name__
     orig_doc = getattr(base_type, fn_name).__doc__
     if orig_doc is None:
@@ -588,15 +602,22 @@ def format_section(header):
   return wrap
 
 
-def populate_defaults(base_type):
+def populate_defaults(base_type, ignore=False):
   """Populate default values for keyword arguments in decorated function.
 
   When applied to a function, this decorator creates a new function
   with default values for all keyword arguments, based on the default values
   for the identically-named method on `base_type`.
 
-  For internal use only. No backwards compatibility guarantees."""
+  ignore used mainly in cases where a method has been removed in a later version
+  of Pandas.
+
+  For internal use only. No backwards compatibility guarantees.
+  """
   def wrap(func):
+    if ignore:
+      return func
+
     base_argspec = getfullargspec(unwrap(getattr(base_type, func.__name__)))
     if not base_argspec.defaults:
       return func
diff --git a/sdks/python/apache_beam/dataframe/frame_base_test.py b/sdks/python/apache_beam/dataframe/frame_base_test.py
index 82d5b65e1a49..e36662407c3f 100644
--- a/sdks/python/apache_beam/dataframe/frame_base_test.py
+++ b/sdks/python/apache_beam/dataframe/frame_base_test.py
@@ -99,6 +99,11 @@ class Proxy(object):
       def func(self, a, c=1000, **kwargs):
         return dict(kwargs, a=a, c=c)
 
+      @frame_base.args_to_kwargs(Base, ignore=True)
+      @frame_base.populate_defaults(Base, ignore=True)
+      def func_ignore(self, a, **kwargs):
+        return dict(kwargs, a=a)
+
     proxy = Proxy()
     # pylint: disable=too-many-function-args
     self.assertEqual(proxy.func(), {'a': 1, 'c': 1000})
@@ -108,6 +113,10 @@ def func(self, a, c=1000, **kwargs):
     self.assertEqual(proxy.func(c=6, a=2), {'a': 2, 'c': 6})
     self.assertEqual(proxy.func(c=6), {'a': 1, 'c': 6})
 
+    with self.assertRaises(TypeError):  # missing 1 required positional argument
+      proxy.func_ignore()
+    self.assertEqual(proxy.func_ignore(12, c=100), {'a': 12, 'c': 100})
+
 
 if __name__ == '__main__':
   unittest.main()
diff --git a/sdks/python/apache_beam/dataframe/frames.py b/sdks/python/apache_beam/dataframe/frames.py
index 0d9b22ae3f9e..65befe52977c 100644
--- a/sdks/python/apache_beam/dataframe/frames.py
+++ b/sdks/python/apache_beam/dataframe/frames.py
@@ -1336,12 +1336,14 @@ def keys(self):
       frame_base.wont_implement_method(
           pd.Series, 'shape', reason="non-deferred-result"))
 
-  @frame_base.with_docs_from(pd.Series)
-  @frame_base.args_to_kwargs(pd.Series)
-  @frame_base.populate_defaults(pd.Series)
+  @frame_base.with_docs_from(pd.Series, ignore=PD_VERSION >= (2, 0))
+  @frame_base.args_to_kwargs(pd.Series, ignore=PD_VERSION >= (2, 0))
+  @frame_base.populate_defaults(pd.Series, ignore=PD_VERSION >= (2, 0))
   def append(self, to_append, ignore_index, verify_integrity, **kwargs):
     """``ignore_index=True`` is not supported, because it requires generating an
     order-sensitive index."""
+    if PD_VERSION >= (2, 0):
+      raise frame_base.WontImplementError('append() was removed in Pandas 2.0.')
     if not isinstance(to_append, DeferredSeries):
       raise frame_base.WontImplementError(
           "append() only accepts DeferredSeries instances, received " +
@@ -2593,12 +2595,14 @@ def align(self, other, join, axis, copy, level, method, **kwargs):
             requires_partition_by=requires_partition_by,
             preserves_partition_by=partitionings.Arbitrary()))
 
-  @frame_base.with_docs_from(pd.DataFrame)
-  @frame_base.args_to_kwargs(pd.DataFrame)
-  @frame_base.populate_defaults(pd.DataFrame)
+  @frame_base.with_docs_from(pd.DataFrame, ignore=PD_VERSION >= (2, 0))
+  @frame_base.args_to_kwargs(pd.DataFrame, ignore=PD_VERSION >= (2, 0))
+  @frame_base.populate_defaults(pd.DataFrame, ignore=PD_VERSION >= (2, 0))
   def append(self, other, ignore_index, verify_integrity, sort, **kwargs):
     """``ignore_index=True`` is not supported, because it requires generating an
     order-sensitive index."""
+    if PD_VERSION >= (2, 0):
+      raise frame_base.WontImplementError('append() was removed in Pandas 2.0.')
     if not isinstance(other, DeferredDataFrame):
       raise frame_base.WontImplementError(
           "append() only accepts DeferredDataFrame instances, received " +
diff --git a/sdks/python/apache_beam/dataframe/frames_test.py b/sdks/python/apache_beam/dataframe/frames_test.py
index fa121aa85c30..69e7f4b19d6f 100644
--- a/sdks/python/apache_beam/dataframe/frames_test.py
+++ b/sdks/python/apache_beam/dataframe/frames_test.py
@@ -797,6 +797,7 @@ def test_loc(self):
     self._run_test(lambda df: df.C.loc[df.A > 10], df)
     self._run_test(lambda df, s: df.loc[s.loc[1:3]], df, pd.Series(dates))
 
+  @unittest.skipIf(PD_VERSION >= (2, 0), 'append removed in Pandas 2.0')
   def test_append_sort(self):
     # yapf: disable
     df1 = pd.DataFrame({'int': [1, 2, 3], 'str': ['a', 'b', 'c']},
@@ -985,6 +986,7 @@ def test_series_fillna_series_as_value(self):
 
     self._run_test(lambda df, df2: df.A.fillna(df2.A), df, df2)
 
+  @unittest.skipIf(PD_VERSION >= (2, 0), 'append removed in Pandas 2.0')
   def test_append_verify_integrity(self):
     df1 = pd.DataFrame({'A': range(10), 'B': range(10)}, index=range(10))
     df2 = pd.DataFrame({'A': range(10), 'B': range(10)}, index=range(9, 19))

From cba15dd62759536251070d8a8bdae9e073ab305b Mon Sep 17 00:00:00 2001
From: Chris Neffshade <caneff@google.com>
Date: Mon, 11 Sep 2023 19:07:46 +0000
Subject: [PATCH 06/26] Add support for ignored arguments and more tests.

---
 .../apache_beam/dataframe/frame_base.py       | 60 ++++++++++++++-----
 .../apache_beam/dataframe/frame_base_test.py  | 24 ++++++--
 sdks/python/apache_beam/dataframe/frames.py   | 12 ++--
 3 files changed, 70 insertions(+), 26 deletions(-)

diff --git a/sdks/python/apache_beam/dataframe/frame_base.py b/sdks/python/apache_beam/dataframe/frame_base.py
index 09bb71875160..ba866e36262b 100644
--- a/sdks/python/apache_beam/dataframe/frame_base.py
+++ b/sdks/python/apache_beam/dataframe/frame_base.py
@@ -475,7 +475,10 @@ def wrapper(self, inplace=False, **kwargs):
   return wrapper
 
 
-def args_to_kwargs(base_type, ignore=False):
+def args_to_kwargs(
+    base_type: object,
+    removed_method: bool = False,
+    removed_args: list[str] | None = None):
   """Convert all args to kwargs before calling the decorated function.
 
   When applied to a function, this decorator creates a new function
@@ -484,25 +487,43 @@ def args_to_kwargs(base_type, ignore=False):
   determine the name to use for arguments that are converted to keyword
   arguments.
 
-  ignore used mainly in cases where a method has been removed in a later version
-  of Pandas.
+  removed_method used in cases where a method has been removed in a later version
+  of Pandas. removed_args used in cases where a method has had arguments removed
+  in a later version of Pandas.
 
   For internal use only. No backwards compatibility guarantees.
   """
   def wrap(func):
-    if ignore:
+    if removed_method:
+      # Do no processing, let Beam function itself raise the error if called.
       return func
 
-    arg_names = getfullargspec(unwrap(getattr(base_type, func.__name__))).args
+    removed_arg_names = removed_args if removed_args is not None else []
+
+    base_arg_spec = getfullargspec(unwrap(getattr(base_type, func.__name__)))
+    base_arg_names = base_arg_spec.args
+    # Some arguments are keyword only and we still want to check against those.
+    all_possible_base_arg_names = base_arg_names + base_arg_spec.kwonlyargs
+    beam_arg_names = getfullargspec(func).args
+
+    if not_found := (set(beam_arg_names) - set(all_possible_base_arg_names) -
+                     set(removed_arg_names)):
+      raise TypeError(
+          f"Beam definition of {func.__name__} has arguments that are not found in the base version of the function: {not_found}"
+      )
 
     @functools.wraps(func)
     def wrapper(*args, **kwargs):
-      for name, value in zip(arg_names, args):
+      for name, value in zip(base_arg_names, args):
         if name in kwargs:
           raise TypeError(
               "%s() got multiple values for argument '%s'" %
               (func.__name__, name))
         kwargs[name] = value
+      # Still have to populate these for the Beam function signature.
+      if removed_args:
+        for name in removed_args:
+          kwargs[name] = None
       return func(**kwargs)
 
     return wrapper
@@ -531,19 +552,23 @@ def wrapper(*args, **kwargs):
     f"**{BEAM_SPECIFIC!r}** for details.")
 
 
-def with_docs_from(base_type, name=None, ignore=False):
+def with_docs_from(
+    base_type: object,
+    name: str = None,
+    removed_method: bool = False,
+    ignored_args: list[str] | None = None):
   """Decorator that updates the documentation from the wrapped function to
-
   duplicate the documentation from the identically-named method in `base_type`.
 
   Any docstring on the original function will be included in the new function
   under a "Differences from pandas" heading.
 
-  ignore used mainly in cases where a method has been removed in a later version
+  removed_method used in cases where a method has been removed in a later version
   of Pandas.
   """
   def wrap(func):
-    if ignore:
+    if removed_method:
+      func.__doc__ = "This method has been removed in the current version of Pandas."
       return func
 
     fn_name = name or func.__name__
@@ -602,20 +627,25 @@ def format_section(header):
   return wrap
 
 
-def populate_defaults(base_type, ignore=False):
+def populate_defaults(
+    base_type: object,
+    removed_method: bool = False,
+    removed_args: list[str] | None = None,
+    ignore: bool = False):
   """Populate default values for keyword arguments in decorated function.
 
   When applied to a function, this decorator creates a new function
   with default values for all keyword arguments, based on the default values
   for the identically-named method on `base_type`.
 
-  ignore used mainly in cases where a method has been removed in a later version
-  of Pandas.
+  removed_method used in cases where a method has been removed in a later version
+  of Pandas. removed_args used in cases where a method has had arguments removed
+  in a later version of Pandas.
 
   For internal use only. No backwards compatibility guarantees.
   """
   def wrap(func):
-    if ignore:
+    if removed_method:
       return func
 
     base_argspec = getfullargspec(unwrap(getattr(base_type, func.__name__)))
@@ -634,6 +664,8 @@ def wrap(func):
     defaults_to_populate = set(
         func_argspec.args[:num_non_defaults]).intersection(
             arg_to_default.keys())
+    if removed_args:
+      defaults_to_populate -= set(removed_args)
 
     @functools.wraps(func)
     def wrapper(**kwargs):
diff --git a/sdks/python/apache_beam/dataframe/frame_base_test.py b/sdks/python/apache_beam/dataframe/frame_base_test.py
index e36662407c3f..2d16d02ba1ea 100644
--- a/sdks/python/apache_beam/dataframe/frame_base_test.py
+++ b/sdks/python/apache_beam/dataframe/frame_base_test.py
@@ -93,19 +93,27 @@ class Base(object):
       def func(self, a=1, b=2, c=3):
         pass
 
+      def func_removed_args(self, a):
+        pass
+
     class Proxy(object):
       @frame_base.args_to_kwargs(Base)
       @frame_base.populate_defaults(Base)
       def func(self, a, c=1000, **kwargs):
         return dict(kwargs, a=a, c=c)
 
-      @frame_base.args_to_kwargs(Base, ignore=True)
-      @frame_base.populate_defaults(Base, ignore=True)
-      def func_ignore(self, a, **kwargs):
+      @frame_base.args_to_kwargs(Base, removed_method=True)
+      @frame_base.populate_defaults(Base, removed_method=True)
+      def func_removed_method(self, a, **kwargs):
+        return dict(kwargs, a=a)
+
+      @frame_base.args_to_kwargs(Base, removed_args=['c'])
+      @frame_base.populate_defaults(Base, removed_args=['c'])
+      def func_removed_args(self, a, c, **kwargs):
         return dict(kwargs, a=a)
 
     proxy = Proxy()
-    # pylint: disable=too-many-function-args
+    # pylint: disable=too-many-function-args,no-value-for-parameter
     self.assertEqual(proxy.func(), {'a': 1, 'c': 1000})
     self.assertEqual(proxy.func(100), {'a': 100, 'c': 1000})
     self.assertEqual(proxy.func(2, 4, 6), {'a': 2, 'b': 4, 'c': 6})
@@ -114,8 +122,12 @@ def func_ignore(self, a, **kwargs):
     self.assertEqual(proxy.func(c=6), {'a': 1, 'c': 6})
 
     with self.assertRaises(TypeError):  # missing 1 required positional argument
-      proxy.func_ignore()
-    self.assertEqual(proxy.func_ignore(12, c=100), {'a': 12, 'c': 100})
+      proxy.func_removed_method()
+    self.assertEqual(proxy.func_removed_method(12, c=100), {'a': 12, 'c': 100})
+
+    with self.assertRaises(TypeError):  # missing 1 required positional argument
+      proxy.func_removed_args()
+    self.assertEqual(proxy.func_removed_args(12, d=100), {'a': 12, 'd': 100})
 
 
 if __name__ == '__main__':
diff --git a/sdks/python/apache_beam/dataframe/frames.py b/sdks/python/apache_beam/dataframe/frames.py
index 65befe52977c..c8f10969c236 100644
--- a/sdks/python/apache_beam/dataframe/frames.py
+++ b/sdks/python/apache_beam/dataframe/frames.py
@@ -1336,9 +1336,9 @@ def keys(self):
       frame_base.wont_implement_method(
           pd.Series, 'shape', reason="non-deferred-result"))
 
-  @frame_base.with_docs_from(pd.Series, ignore=PD_VERSION >= (2, 0))
-  @frame_base.args_to_kwargs(pd.Series, ignore=PD_VERSION >= (2, 0))
-  @frame_base.populate_defaults(pd.Series, ignore=PD_VERSION >= (2, 0))
+  @frame_base.with_docs_from(pd.Series, removed_method=PD_VERSION >= (2, 0))
+  @frame_base.args_to_kwargs(pd.Series, removed_method=PD_VERSION >= (2, 0))
+  @frame_base.populate_defaults(pd.Series, removed_method=PD_VERSION >= (2, 0))
   def append(self, to_append, ignore_index, verify_integrity, **kwargs):
     """``ignore_index=True`` is not supported, because it requires generating an
     order-sensitive index."""
@@ -2595,9 +2595,9 @@ def align(self, other, join, axis, copy, level, method, **kwargs):
             requires_partition_by=requires_partition_by,
             preserves_partition_by=partitionings.Arbitrary()))
 
-  @frame_base.with_docs_from(pd.DataFrame, ignore=PD_VERSION >= (2, 0))
-  @frame_base.args_to_kwargs(pd.DataFrame, ignore=PD_VERSION >= (2, 0))
-  @frame_base.populate_defaults(pd.DataFrame, ignore=PD_VERSION >= (2, 0))
+  @frame_base.with_docs_from(pd.DataFrame, removed_method=PD_VERSION >= (2, 0))
+  @frame_base.args_to_kwargs(pd.DataFrame, removed_method=PD_VERSION >= (2, 0))
+  @frame_base.populate_defaults(pd.DataFrame, removed_method=PD_VERSION >= (2, 0))
   def append(self, other, ignore_index, verify_integrity, sort, **kwargs):
     """``ignore_index=True`` is not supported, because it requires generating an
     order-sensitive index."""

From de643c57cfa8dab6f724da1678f8f283fbcee70d Mon Sep 17 00:00:00 2001
From: Chris Neffshade <caneff@google.com>
Date: Mon, 11 Sep 2023 19:11:20 +0000
Subject: [PATCH 07/26] Delete removed_args arg from doc part that wasn't using
 it

---
 sdks/python/apache_beam/dataframe/frame_base.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/sdks/python/apache_beam/dataframe/frame_base.py b/sdks/python/apache_beam/dataframe/frame_base.py
index ba866e36262b..6c777b9f2b28 100644
--- a/sdks/python/apache_beam/dataframe/frame_base.py
+++ b/sdks/python/apache_beam/dataframe/frame_base.py
@@ -553,10 +553,7 @@ def wrapper(*args, **kwargs):
 
 
 def with_docs_from(
-    base_type: object,
-    name: str = None,
-    removed_method: bool = False,
-    ignored_args: list[str] | None = None):
+    base_type: object, name: str = None, removed_method: bool = False):
   """Decorator that updates the documentation from the wrapped function to
   duplicate the documentation from the identically-named method in `base_type`.
 

From bfa0ae8a8a7e9d41e85e9c332d1e03414083deb8 Mon Sep 17 00:00:00 2001
From: Chris Neffshade <caneff@google.com>
Date: Mon, 11 Sep 2023 19:27:41 +0000
Subject: [PATCH 08/26] Fix up frame_base

---
 sdks/python/apache_beam/dataframe/frame_base.py | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/sdks/python/apache_beam/dataframe/frame_base.py b/sdks/python/apache_beam/dataframe/frame_base.py
index 6c777b9f2b28..92cebe4b0c6a 100644
--- a/sdks/python/apache_beam/dataframe/frame_base.py
+++ b/sdks/python/apache_beam/dataframe/frame_base.py
@@ -475,10 +475,7 @@ def wrapper(self, inplace=False, **kwargs):
   return wrapper
 
 
-def args_to_kwargs(
-    base_type: object,
-    removed_method: bool = False,
-    removed_args: list[str] | None = None):
+def args_to_kwargs(base_type, removed_method=False, removed_args=None):
   """Convert all args to kwargs before calling the decorated function.
 
   When applied to a function, this decorator creates a new function
@@ -552,8 +549,7 @@ def wrapper(*args, **kwargs):
     f"**{BEAM_SPECIFIC!r}** for details.")
 
 
-def with_docs_from(
-    base_type: object, name: str = None, removed_method: bool = False):
+def with_docs_from(base_type: object, name=None, removed_method=False):
   """Decorator that updates the documentation from the wrapped function to
   duplicate the documentation from the identically-named method in `base_type`.
 
@@ -624,11 +620,7 @@ def format_section(header):
   return wrap
 
 
-def populate_defaults(
-    base_type: object,
-    removed_method: bool = False,
-    removed_args: list[str] | None = None,
-    ignore: bool = False):
+def populate_defaults(base_type, removed_method=False, removed_args=None):
   """Populate default values for keyword arguments in decorated function.
 
   When applied to a function, this decorator creates a new function

From a8cf5d19925b00513e5049afa603972b17750dd4 Mon Sep 17 00:00:00 2001
From: Chris Neffshade <caneff@google.com>
Date: Mon, 11 Sep 2023 19:37:51 +0000
Subject: [PATCH 09/26] Fix with new removed_args argument

---
 sdks/python/apache_beam/dataframe/frames.py | 24 ++++++++++-----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/sdks/python/apache_beam/dataframe/frames.py b/sdks/python/apache_beam/dataframe/frames.py
index ea8511f83725..c7ef43c35439 100644
--- a/sdks/python/apache_beam/dataframe/frames.py
+++ b/sdks/python/apache_beam/dataframe/frames.py
@@ -914,10 +914,10 @@ def sort_index(self, axis, **kwargs):
         ))
 
   @frame_base.with_docs_from(pd.DataFrame)
-  @frame_base.args_to_kwargs(pd.DataFrame)
-  @frame_base.populate_defaults(pd.DataFrame)
+  @frame_base.args_to_kwargs(pd.DataFrame, removed_args=["errors"] if PD_VERSION >= (2,0) else None)
+  @frame_base.populate_defaults(pd.DataFrame, removed_args=["errors"] if PD_VERSION >= (2,0) else None)
   @frame_base.maybe_inplace
-  def where(self, cond, other, **kwargs):
+  def where(self, cond, other, errors, **kwargs):
     """where is not parallelizable when ``errors="ignore"`` is specified."""
     requires = partitionings.Arbitrary()
     deferred_args = {}
@@ -1606,9 +1606,9 @@ def mean(self, skipna, **kwargs):
     return self.sum(skipna=skipna, **kwargs) / size
 
   @frame_base.with_docs_from(pd.Series)
-  @frame_base.args_to_kwargs(pd.Series)
-  @frame_base.populate_defaults(pd.Series)
-  def var(self, axis, skipna, ddof, **kwargs):
+  @frame_base.args_to_kwargs(pd.Series, removed_args=["level"] if PD_VERSION >= (2,0) else None)
+  @frame_base.populate_defaults(pd.Series, removed_args=["level"] if PD_VERSION >= (2,0) else None)
+  def var(self, axis, skipna, level, ddof, **kwargs):
     if skipna is None or skipna:
       self = self.dropna()  # pylint: disable=self-cls-assignment
 
@@ -1676,9 +1676,9 @@ def corr(self, other, method, min_periods):
               requires_partition_by=partitionings.Singleton(reason=reason)))
 
   @frame_base.with_docs_from(pd.Series)
-  @frame_base.args_to_kwargs(pd.Series)
-  @frame_base.populate_defaults(pd.Series)
-  def skew(self, axis, skipna, numeric_only, **kwargs):
+  @frame_base.args_to_kwargs(pd.Series, removed_args=["level"] if PD_VERSION >= (2,0) else None)
+  @frame_base.populate_defaults(pd.Series, removed_args=["level"] if PD_VERSION >= (2,0) else None)
+  def skew(self, axis, skipna, level, numeric_only, **kwargs):
     if skipna is None or skipna:
       self = self.dropna()  # pylint: disable=self-cls-assignment
     # See the online, numerically stable formulae at
@@ -1738,9 +1738,9 @@ def combine_moments(data):
               requires_partition_by=partitionings.Singleton()))
 
   @frame_base.with_docs_from(pd.Series)
-  @frame_base.args_to_kwargs(pd.Series)
-  @frame_base.populate_defaults(pd.Series)
-  def kurtosis(self, axis, skipna, numeric_only, **kwargs):
+  @frame_base.args_to_kwargs(pd.Series, removed_args=["level"] if PD_VERSION >= (2,0) else None)
+  @frame_base.populate_defaults(pd.Series, removed_args=["level"] if PD_VERSION >= (2,0) else None)
+  def kurtosis(self, axis, skipna, level, numeric_only, **kwargs):
     if skipna is None or skipna:
       self = self.dropna()  # pylint: disable=self-cls-assignment
 

From c20205708ed48c99875cbe5f03b50da2cb7fc92f Mon Sep 17 00:00:00 2001
From: Chris Neffshade <caneff@google.com>
Date: Mon, 11 Sep 2023 19:43:17 +0000
Subject: [PATCH 10/26] Fix line lengths

---
 .../apache_beam/dataframe/frame_base.py       | 23 ++++++++++---------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/sdks/python/apache_beam/dataframe/frame_base.py b/sdks/python/apache_beam/dataframe/frame_base.py
index 92cebe4b0c6a..123fce4ff9bf 100644
--- a/sdks/python/apache_beam/dataframe/frame_base.py
+++ b/sdks/python/apache_beam/dataframe/frame_base.py
@@ -484,9 +484,9 @@ def args_to_kwargs(base_type, removed_method=False, removed_args=None):
   determine the name to use for arguments that are converted to keyword
   arguments.
 
-  removed_method used in cases where a method has been removed in a later version
-  of Pandas. removed_args used in cases where a method has had arguments removed
-  in a later version of Pandas.
+  removed_method used in cases where a method has been removed in a later
+  version of Pandas. removed_args used in cases where a method has had
+  arguments removed in a later version of Pandas.
 
   For internal use only. No backwards compatibility guarantees.
   """
@@ -506,8 +506,8 @@ def wrap(func):
     if not_found := (set(beam_arg_names) - set(all_possible_base_arg_names) -
                      set(removed_arg_names)):
       raise TypeError(
-          f"Beam definition of {func.__name__} has arguments that are not found in the base version of the function: {not_found}"
-      )
+          f"Beam definition of {func.__name__} has arguments that are not found"
+          f" in the base version of the function: {not_found}")
 
     @functools.wraps(func)
     def wrapper(*args, **kwargs):
@@ -556,12 +556,13 @@ def with_docs_from(base_type: object, name=None, removed_method=False):
   Any docstring on the original function will be included in the new function
   under a "Differences from pandas" heading.
 
-  removed_method used in cases where a method has been removed in a later version
-  of Pandas.
+  removed_method used in cases where a method has been removed in a later
+  version of Pandas.
   """
   def wrap(func):
     if removed_method:
-      func.__doc__ = "This method has been removed in the current version of Pandas."
+      func.__doc__ = (
+          "This method has been removed in the current version of Pandas.")
       return func
 
     fn_name = name or func.__name__
@@ -627,9 +628,9 @@ def populate_defaults(base_type, removed_method=False, removed_args=None):
   with default values for all keyword arguments, based on the default values
   for the identically-named method on `base_type`.
 
-  removed_method used in cases where a method has been removed in a later version
-  of Pandas. removed_args used in cases where a method has had arguments removed
-  in a later version of Pandas.
+  removed_method used in cases where a method has been removed in a later
+  version of Pandas. removed_args used in cases where a method has had
+  arguments removed in a later version of Pandas.
 
   For internal use only. No backwards compatibility guarantees.
   """

From 2a8b489e2522c4427258cfae09132df2c96362df Mon Sep 17 00:00:00 2001
From: Chris Neffshade <caneff@google.com>
Date: Mon, 11 Sep 2023 20:02:02 +0000
Subject: [PATCH 11/26] Fix formatting

---
 sdks/python/apache_beam/dataframe/frames.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sdks/python/apache_beam/dataframe/frames.py b/sdks/python/apache_beam/dataframe/frames.py
index c8f10969c236..6c82b3b258cd 100644
--- a/sdks/python/apache_beam/dataframe/frames.py
+++ b/sdks/python/apache_beam/dataframe/frames.py
@@ -2597,7 +2597,8 @@ def align(self, other, join, axis, copy, level, method, **kwargs):
 
   @frame_base.with_docs_from(pd.DataFrame, removed_method=PD_VERSION >= (2, 0))
   @frame_base.args_to_kwargs(pd.DataFrame, removed_method=PD_VERSION >= (2, 0))
-  @frame_base.populate_defaults(pd.DataFrame, removed_method=PD_VERSION >= (2, 0))
+  @frame_base.populate_defaults(pd.DataFrame,
+                                removed_method=PD_VERSION >= (2, 0))
   def append(self, other, ignore_index, verify_integrity, sort, **kwargs):
     """``ignore_index=True`` is not supported, because it requires generating an
     order-sensitive index."""

From 944770ed1fb6af831f335409a3fac20dc39e1974 Mon Sep 17 00:00:00 2001
From: Chris Neffshade <caneff@google.com>
Date: Tue, 12 Sep 2023 10:40:20 +0000
Subject: [PATCH 12/26] Add test for append deprecation

---
 sdks/python/apache_beam/dataframe/frames_test.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/sdks/python/apache_beam/dataframe/frames_test.py b/sdks/python/apache_beam/dataframe/frames_test.py
index 69e7f4b19d6f..ae6439a94001 100644
--- a/sdks/python/apache_beam/dataframe/frames_test.py
+++ b/sdks/python/apache_beam/dataframe/frames_test.py
@@ -813,6 +813,13 @@ def test_append_sort(self):
     self._run_test(lambda df1, df2: df2.append(df1, sort=True), df1, df2)
     self._run_test(lambda df1, df2: df2.append(df1, sort=False), df1, df2)
 
+  @unittest.skipIf(PD_VERSION < (2, 0), 'append removed in Pandas 2.0')
+  def test_append_raises_pandas_2(self):
+    df = pd.DataFrame({'A': [1, 1]})
+
+    with self.assertRaises(NotImplementedError):
+      df.append(df)
+
   def test_smallest_largest(self):
     df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [2, 3, 5, 7]})
     self._run_test(lambda df: df.nlargest(1, 'A', keep='all'), df)

From ee7a8587ecc8a51ad9c80081bb70daae6af99c38 Mon Sep 17 00:00:00 2001
From: Chris Neffshade <caneff@google.com>
Date: Tue, 12 Sep 2023 10:42:29 +0000
Subject: [PATCH 13/26] Only populate kwargs if not there

---
 sdks/python/apache_beam/dataframe/frame_base.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sdks/python/apache_beam/dataframe/frame_base.py b/sdks/python/apache_beam/dataframe/frame_base.py
index 123fce4ff9bf..ba06b925ac4c 100644
--- a/sdks/python/apache_beam/dataframe/frame_base.py
+++ b/sdks/python/apache_beam/dataframe/frame_base.py
@@ -520,7 +520,8 @@ def wrapper(*args, **kwargs):
       # Still have to populate these for the Beam function signature.
       if removed_args:
         for name in removed_args:
-          kwargs[name] = None
+          if not name in kwargs:
+            kwargs[name] = None
       return func(**kwargs)
 
     return wrapper

From d0a758516ac8b06213f6b7dd14f2a98f63e12086 Mon Sep 17 00:00:00 2001
From: Chris Neffshade <caneff@google.com>
Date: Tue, 12 Sep 2023 11:08:41 +0000
Subject: [PATCH 14/26] Formatting

---
 sdks/python/apache_beam/dataframe/frames.py | 24 ++++++++++++++-------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/sdks/python/apache_beam/dataframe/frames.py b/sdks/python/apache_beam/dataframe/frames.py
index cd285d50c192..e2390bda28be 100644
--- a/sdks/python/apache_beam/dataframe/frames.py
+++ b/sdks/python/apache_beam/dataframe/frames.py
@@ -914,8 +914,10 @@ def sort_index(self, axis, **kwargs):
         ))
 
   @frame_base.with_docs_from(pd.DataFrame)
-  @frame_base.args_to_kwargs(pd.DataFrame, removed_args=["errors"] if PD_VERSION >= (2,0) else None)
-  @frame_base.populate_defaults(pd.DataFrame, removed_args=["errors"] if PD_VERSION >= (2,0) else None)
+  @frame_base.args_to_kwargs(
+      pd.DataFrame, removed_args=["errors"] if PD_VERSION >= (2, 0) else None)
+  @frame_base.populate_defaults(
+      pd.DataFrame, removed_args=["errors"] if PD_VERSION >= (2, 0) else None)
   @frame_base.maybe_inplace
   def where(self, cond, other, errors, **kwargs):
     """where is not parallelizable when ``errors="ignore"`` is specified."""
@@ -1608,8 +1610,10 @@ def mean(self, skipna, **kwargs):
     return self.sum(skipna=skipna, **kwargs) / size
 
   @frame_base.with_docs_from(pd.Series)
-  @frame_base.args_to_kwargs(pd.Series, removed_args=["level"] if PD_VERSION >= (2,0) else None)
-  @frame_base.populate_defaults(pd.Series, removed_args=["level"] if PD_VERSION >= (2,0) else None)
+  @frame_base.args_to_kwargs(
+      pd.Series, removed_args=["level"] if PD_VERSION >= (2, 0) else None)
+  @frame_base.populate_defaults(
+      pd.Series, removed_args=["level"] if PD_VERSION >= (2, 0) else None)
   def var(self, axis, skipna, level, ddof, **kwargs):
     if skipna is None or skipna:
       self = self.dropna()  # pylint: disable=self-cls-assignment
@@ -1678,8 +1682,10 @@ def corr(self, other, method, min_periods):
               requires_partition_by=partitionings.Singleton(reason=reason)))
 
   @frame_base.with_docs_from(pd.Series)
-  @frame_base.args_to_kwargs(pd.Series, removed_args=["level"] if PD_VERSION >= (2,0) else None)
-  @frame_base.populate_defaults(pd.Series, removed_args=["level"] if PD_VERSION >= (2,0) else None)
+  @frame_base.args_to_kwargs(
+      pd.Series, removed_args=["level"] if PD_VERSION >= (2, 0) else None)
+  @frame_base.populate_defaults(
+      pd.Series, removed_args=["level"] if PD_VERSION >= (2, 0) else None)
   def skew(self, axis, skipna, level, numeric_only, **kwargs):
     if skipna is None or skipna:
       self = self.dropna()  # pylint: disable=self-cls-assignment
@@ -1740,8 +1746,10 @@ def combine_moments(data):
               requires_partition_by=partitionings.Singleton()))
 
   @frame_base.with_docs_from(pd.Series)
-  @frame_base.args_to_kwargs(pd.Series, removed_args=["level"] if PD_VERSION >= (2,0) else None)
-  @frame_base.populate_defaults(pd.Series, removed_args=["level"] if PD_VERSION >= (2,0) else None)
+  @frame_base.args_to_kwargs(
+      pd.Series, removed_args=["level"] if PD_VERSION >= (2, 0) else None)
+  @frame_base.populate_defaults(
+      pd.Series, removed_args=["level"] if PD_VERSION >= (2, 0) else None)
   def kurtosis(self, axis, skipna, level, numeric_only, **kwargs):
     if skipna is None or skipna:
       self = self.dropna()  # pylint: disable=self-cls-assignment

From 035fdfd2ab71456c2ff5de09fc3676e704ff841b Mon Sep 17 00:00:00 2001
From: Chris Neffshade <caneff@google.com>
Date: Tue, 12 Sep 2023 16:33:33 +0000
Subject: [PATCH 15/26] Fix frames.py to work with kw only args.

Pandas 2 has made most args of most methods kw only. This still supports
Pandas 1 while allowing for those.
---
 .../apache_beam/dataframe/frame_base.py       | 24 +++++++---
 .../apache_beam/dataframe/frame_base_test.py  | 44 ++++++++++++++++++-
 sdks/python/apache_beam/dataframe/frames.py   |  4 +-
 3 files changed, 62 insertions(+), 10 deletions(-)

diff --git a/sdks/python/apache_beam/dataframe/frame_base.py b/sdks/python/apache_beam/dataframe/frame_base.py
index ba06b925ac4c..f2356c8b75a8 100644
--- a/sdks/python/apache_beam/dataframe/frame_base.py
+++ b/sdks/python/apache_beam/dataframe/frame_base.py
@@ -497,6 +497,8 @@ def wrap(func):
 
     removed_arg_names = removed_args if removed_args is not None else []
 
+    # TODO: Better handle position only arguments if they are ever a thing
+    # in Pandas (as of 2.1 currently they aren't).
     base_arg_spec = getfullargspec(unwrap(getattr(base_type, func.__name__)))
     base_arg_names = base_arg_spec.args
     # Some arguments are keyword only and we still want to check against those.
@@ -511,6 +513,9 @@ def wrap(func):
 
     @functools.wraps(func)
     def wrapper(*args, **kwargs):
+      if len(args) > len(base_arg_names):
+        raise TypeError(f"{func.__name__} got too many positioned arguments.")
+
       for name, value in zip(base_arg_names, args):
         if name in kwargs:
           raise TypeError(
@@ -520,7 +525,7 @@ def wrapper(*args, **kwargs):
       # Still have to populate these for the Beam function signature.
       if removed_args:
         for name in removed_args:
-          if not name in kwargs:
+          if name not in kwargs:
             kwargs[name] = None
       return func(**kwargs)
 
@@ -550,7 +555,7 @@ def wrapper(*args, **kwargs):
     f"**{BEAM_SPECIFIC!r}** for details.")
 
 
-def with_docs_from(base_type: object, name=None, removed_method=False):
+def with_docs_from(base_type, name=None, removed_method=False):
   """Decorator that updates the documentation from the wrapped function to
   duplicate the documentation from the identically-named method in `base_type`.
 
@@ -640,13 +645,18 @@ def wrap(func):
       return func
 
     base_argspec = getfullargspec(unwrap(getattr(base_type, func.__name__)))
-    if not base_argspec.defaults:
+    if not base_argspec.defaults and not base_argspec.kwonlydefaults:
       return func
 
-    arg_to_default = dict(
-        zip(
-            base_argspec.args[-len(base_argspec.defaults):],
-            base_argspec.defaults))
+    arg_to_default = {}
+    if base_argspec.defaults:
+      arg_to_default.update(
+          zip(
+              base_argspec.args[-len(base_argspec.defaults):],
+              base_argspec.defaults))
+
+    if base_argspec.kwonlydefaults:
+      arg_to_default.update(base_argspec.kwonlydefaults)
 
     unwrapped_func = unwrap(func)
     # args that do not have defaults in func, but do have defaults in base
diff --git a/sdks/python/apache_beam/dataframe/frame_base_test.py b/sdks/python/apache_beam/dataframe/frame_base_test.py
index 2d16d02ba1ea..5962bb2fb72b 100644
--- a/sdks/python/apache_beam/dataframe/frame_base_test.py
+++ b/sdks/python/apache_beam/dataframe/frame_base_test.py
@@ -72,7 +72,7 @@ def add_one(frame):
 
   def test_args_to_kwargs(self):
     class Base(object):
-      def func(self, a=1, b=2, c=3):
+      def func(self, a=1, b=2, c=3, *, kw_only=4):
         pass
 
     class Proxy(object):
@@ -87,6 +87,9 @@ def func(self, **kwargs):
     self.assertEqual(proxy.func(2, 4, 6), {'a': 2, 'b': 4, 'c': 6})
     self.assertEqual(proxy.func(2, c=6), {'a': 2, 'c': 6})
     self.assertEqual(proxy.func(c=6, a=2), {'a': 2, 'c': 6})
+    self.assertEqual(proxy.func(2, kw_only=20), {'a': 2, 'kw_only': 20})
+    with self.assertRaises(TypeError):  # got too many positioned arguments
+      proxy.func(2, 4, 6, 8)
 
   def test_args_to_kwargs_populates_defaults(self):
     class Base(object):
@@ -129,6 +132,45 @@ def func_removed_args(self, a, c, **kwargs):
       proxy.func_removed_args()
     self.assertEqual(proxy.func_removed_args(12, d=100), {'a': 12, 'd': 100})
 
+  def test_args_to_kwargs_populates_default_handles_kw_only(self):
+    class Base(object):
+      def func(self, a=1, b=2, c=3, *, kw_only=4):
+        pass
+
+    class ProxyUsesKwOnly(object):
+      @frame_base.args_to_kwargs(Base)
+      @frame_base.populate_defaults(Base)
+      def func(self, a, kw_only, **kwargs):
+        return dict(kwargs, a=a, kw_only=kw_only)
+
+    proxy = ProxyUsesKwOnly()
+
+    # pylint: disable=too-many-function-args,no-value-for-parameter
+    self.assertEqual(proxy.func(), {'a': 1, 'kw_only': 4})
+    self.assertEqual(proxy.func(100), {'a': 100, 'kw_only': 4})
+    self.assertEqual(
+        proxy.func(2, 4, 6, kw_only=8), {
+            'a': 2, 'b': 4, 'c': 6, 'kw_only': 8
+        })
+    with self.assertRaises(TypeError):
+      proxy.func(2, 4, 6, 8)  # got too many positioned arguments
+
+    class ProxyDoesntUseKwOnly(object):
+      @frame_base.args_to_kwargs(Base)
+      @frame_base.populate_defaults(Base)
+      def func(self, a, **kwargs):
+        return dict(kwargs, a=a)
+
+    proxy = ProxyDoesntUseKwOnly()
+
+    # pylint: disable=too-many-function-args,no-value-for-parameter
+    self.assertEqual(proxy.func(), {'a': 1})
+    self.assertEqual(proxy.func(100), {'a': 100})
+    self.assertEqual(
+        proxy.func(2, 4, 6, kw_only=8), {
+            'a': 2, 'b': 4, 'c': 6, 'kw_only': 8
+        })
+
 
 if __name__ == '__main__':
   unittest.main()
diff --git a/sdks/python/apache_beam/dataframe/frames.py b/sdks/python/apache_beam/dataframe/frames.py
index e2390bda28be..a7a278fbf62a 100644
--- a/sdks/python/apache_beam/dataframe/frames.py
+++ b/sdks/python/apache_beam/dataframe/frames.py
@@ -907,7 +907,7 @@ def sort_index(self, axis, **kwargs):
     return frame_base.DeferredFrame.wrap(
         expressions.ComputedExpression(
             'sort_index',
-            lambda df: df.sort_index(axis, **kwargs),
+            lambda df: df.sort_index(axis=axis, **kwargs),
             [self._expr],
             requires_partition_by=partitionings.Arbitrary(),
             preserves_partition_by=partitionings.Arbitrary(),
@@ -2687,7 +2687,7 @@ def set_axis(self, labels, axis, **kwargs):
       return frame_base.DeferredFrame.wrap(
           expressions.ComputedExpression(
               'set_axis',
-              lambda df: df.set_axis(labels, axis, **kwargs),
+              lambda df: df.set_axis(labels, axis=axis, **kwargs),
               [self._expr],
               requires_partition_by=partitionings.Arbitrary(),
               preserves_partition_by=partitionings.Arbitrary()))

From 3ed2822e98e601ab10ed5a13b84df946e8ea01b3 Mon Sep 17 00:00:00 2001
From: Chris Neffshade <caneff@google.com>
Date: Tue, 12 Sep 2023 16:37:27 +0000
Subject: [PATCH 16/26] Fix arg.

---
 sdks/python/apache_beam/dataframe/frame_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sdks/python/apache_beam/dataframe/frame_base.py b/sdks/python/apache_beam/dataframe/frame_base.py
index ba06b925ac4c..88e6eb6a060b 100644
--- a/sdks/python/apache_beam/dataframe/frame_base.py
+++ b/sdks/python/apache_beam/dataframe/frame_base.py
@@ -550,7 +550,7 @@ def wrapper(*args, **kwargs):
     f"**{BEAM_SPECIFIC!r}** for details.")
 
 
-def with_docs_from(base_type: object, name=None, removed_method=False):
+def with_docs_from(base_type, name=None, removed_method=False):
   """Decorator that updates the documentation from the wrapped function to
   duplicate the documentation from the identically-named method in `base_type`.
 

From fb38399cda8d7c7bd8aac77a5d6faba51e64fa10 Mon Sep 17 00:00:00 2001
From: Chris Neffshade <caneff@google.com>
Date: Tue, 12 Sep 2023 16:37:27 +0000
Subject: [PATCH 17/26] Fix arg.

---
 sdks/python/apache_beam/dataframe/frame_base.py  | 2 +-
 sdks/python/apache_beam/dataframe/frames_test.py | 7 -------
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/sdks/python/apache_beam/dataframe/frame_base.py b/sdks/python/apache_beam/dataframe/frame_base.py
index ba06b925ac4c..88e6eb6a060b 100644
--- a/sdks/python/apache_beam/dataframe/frame_base.py
+++ b/sdks/python/apache_beam/dataframe/frame_base.py
@@ -550,7 +550,7 @@ def wrapper(*args, **kwargs):
     f"**{BEAM_SPECIFIC!r}** for details.")
 
 
-def with_docs_from(base_type: object, name=None, removed_method=False):
+def with_docs_from(base_type, name=None, removed_method=False):
   """Decorator that updates the documentation from the wrapped function to
   duplicate the documentation from the identically-named method in `base_type`.
 
diff --git a/sdks/python/apache_beam/dataframe/frames_test.py b/sdks/python/apache_beam/dataframe/frames_test.py
index ae6439a94001..69e7f4b19d6f 100644
--- a/sdks/python/apache_beam/dataframe/frames_test.py
+++ b/sdks/python/apache_beam/dataframe/frames_test.py
@@ -813,13 +813,6 @@ def test_append_sort(self):
     self._run_test(lambda df1, df2: df2.append(df1, sort=True), df1, df2)
     self._run_test(lambda df1, df2: df2.append(df1, sort=False), df1, df2)
 
-  @unittest.skipIf(PD_VERSION < (2, 0), 'append removed in Pandas 2.0')
-  def test_append_raises_pandas_2(self):
-    df = pd.DataFrame({'A': [1, 1]})
-
-    with self.assertRaises(NotImplementedError):
-      df.append(df)
-
   def test_smallest_largest(self):
     df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [2, 3, 5, 7]})
     self._run_test(lambda df: df.nlargest(1, 'A', keep='all'), df)

From d3d07bb36eea974be7bd85bdbb547bf9b3bb536d Mon Sep 17 00:00:00 2001
From: Chris Neffshade <caneff@google.com>
Date: Wed, 13 Sep 2023 09:50:43 +0000
Subject: [PATCH 18/26] Fix comments

---
 .../apache_beam/dataframe/frame_base.py       | 22 ++++++++++++-------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/sdks/python/apache_beam/dataframe/frame_base.py b/sdks/python/apache_beam/dataframe/frame_base.py
index 88e6eb6a060b..1da12ececff6 100644
--- a/sdks/python/apache_beam/dataframe/frame_base.py
+++ b/sdks/python/apache_beam/dataframe/frame_base.py
@@ -484,11 +484,14 @@ def args_to_kwargs(base_type, removed_method=False, removed_args=None):
   determine the name to use for arguments that are converted to keyword
   arguments.
 
-  removed_method used in cases where a method has been removed in a later
-  version of Pandas. removed_args used in cases where a method has had
-  arguments removed in a later version of Pandas.
-
   For internal use only. No backwards compatibility guarantees.
+
+  Args:
+      base_type: The pandas type of the method that this is trying to replicate.
+      removed_method: Whether this method has been removed in the running
+           Pandas version.
+      removed_args: If not empty, which arguments have been dropped in the
+           running Pandas version.
   """
   def wrap(func):
     if removed_method:
@@ -629,11 +632,14 @@ def populate_defaults(base_type, removed_method=False, removed_args=None):
   with default values for all keyword arguments, based on the default values
   for the identically-named method on `base_type`.
 
-  removed_method used in cases where a method has been removed in a later
-  version of Pandas. removed_args used in cases where a method has had
-  arguments removed in a later version of Pandas.
-
   For internal use only. No backwards compatibility guarantees.
+
+  Args:
+      base_type: The pandas type of the method that this is trying to replicate.
+      removed_method: Whether this method has been removed in the running
+           Pandas version.
+      removed_args: If not empty, which arguments have been dropped in the
+           running Pandas version.
   """
   def wrap(func):
     if removed_method:

From 88087749bea9e2cf73bb1c33d0eaad68e7c3923c Mon Sep 17 00:00:00 2001
From: Chris Neffshade <caneff@google.com>
Date: Fri, 15 Sep 2023 17:24:52 +0000
Subject: [PATCH 19/26] Address comments

---
 sdks/python/apache_beam/dataframe/frame_base.py      | 2 +-
 sdks/python/apache_beam/dataframe/frame_base_test.py | 9 ++++++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/sdks/python/apache_beam/dataframe/frame_base.py b/sdks/python/apache_beam/dataframe/frame_base.py
index b64e8e2d53c2..48a4c29d0589 100644
--- a/sdks/python/apache_beam/dataframe/frame_base.py
+++ b/sdks/python/apache_beam/dataframe/frame_base.py
@@ -500,7 +500,7 @@ def wrap(func):
 
     removed_arg_names = removed_args if removed_args is not None else []
 
-    # TODO: Better handle position only arguments if they are ever a thing
+    # We would need to add position only arguments if they ever become a thing
     # in Pandas (as of 2.1 currently they aren't).
     base_arg_spec = getfullargspec(unwrap(getattr(base_type, func.__name__)))
     base_arg_names = base_arg_spec.args
diff --git a/sdks/python/apache_beam/dataframe/frame_base_test.py b/sdks/python/apache_beam/dataframe/frame_base_test.py
index 5962bb2fb72b..7ce4ba022acd 100644
--- a/sdks/python/apache_beam/dataframe/frame_base_test.py
+++ b/sdks/python/apache_beam/dataframe/frame_base_test.py
@@ -134,7 +134,7 @@ def func_removed_args(self, a, c, **kwargs):
 
   def test_args_to_kwargs_populates_default_handles_kw_only(self):
     class Base(object):
-      def func(self, a=1, b=2, c=3, *, kw_only=4):
+      def func(self, a, b=2, c=3, *, kw_only=4):
         pass
 
     class ProxyUsesKwOnly(object):
@@ -146,7 +146,9 @@ def func(self, a, kw_only, **kwargs):
     proxy = ProxyUsesKwOnly()
 
     # pylint: disable=too-many-function-args,no-value-for-parameter
-    self.assertEqual(proxy.func(), {'a': 1, 'kw_only': 4})
+    with self.assertRaises(TypeError):  # missing 1 require positional argument
+      proxy.func()
+
     self.assertEqual(proxy.func(100), {'a': 100, 'kw_only': 4})
     self.assertEqual(
         proxy.func(2, 4, 6, kw_only=8), {
@@ -164,7 +166,8 @@ def func(self, a, **kwargs):
     proxy = ProxyDoesntUseKwOnly()
 
     # pylint: disable=too-many-function-args,no-value-for-parameter
-    self.assertEqual(proxy.func(), {'a': 1})
+    with self.assertRaises(TypeError):  # missing 1 required positional argument
+      proxy.func()
     self.assertEqual(proxy.func(100), {'a': 100})
     self.assertEqual(
         proxy.func(2, 4, 6, kw_only=8), {

From a5c3a0e17c12e84d33eb502a1e802dc0cb06e7d5 Mon Sep 17 00:00:00 2001
From: Chris Neffshade <caneff@google.com>
Date: Fri, 15 Sep 2023 18:08:46 +0000
Subject: [PATCH 20/26] typo

---
 sdks/python/apache_beam/dataframe/frame_base_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sdks/python/apache_beam/dataframe/frame_base_test.py b/sdks/python/apache_beam/dataframe/frame_base_test.py
index 7ce4ba022acd..b3077320720f 100644
--- a/sdks/python/apache_beam/dataframe/frame_base_test.py
+++ b/sdks/python/apache_beam/dataframe/frame_base_test.py
@@ -146,7 +146,7 @@ def func(self, a, kw_only, **kwargs):
     proxy = ProxyUsesKwOnly()
 
     # pylint: disable=too-many-function-args,no-value-for-parameter
-    with self.assertRaises(TypeError):  # missing 1 require positional argument
+    with self.assertRaises(TypeError):  # missing 1 required positional argument
       proxy.func()
 
     self.assertEqual(proxy.func(100), {'a': 100, 'kw_only': 4})

From 36e4f033f55af7bac0e6e806a464ddfdc3e98218 Mon Sep 17 00:00:00 2001
From: Chris Neffshade <caneff@google.com>
Date: Tue, 12 Sep 2023 18:27:08 +0000
Subject: [PATCH 21/26] Fix numeric_only logic in frames_test for Pandas 2

In Pandas 1, a few methods (cor, cov, and quantile being the implemented
ones for Beam) had the default behavior of numeric_only=True if not
specified.  In Pandas 2 the default is numeric_only=False for these. So for tests to
keep passing I need to explicitly pass the argument.
---
 .../apache_beam/dataframe/frames_test.py      | 20 ++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/sdks/python/apache_beam/dataframe/frames_test.py b/sdks/python/apache_beam/dataframe/frames_test.py
index 4e59d1da5de4..b429de8227ef 100644
--- a/sdks/python/apache_beam/dataframe/frames_test.py
+++ b/sdks/python/apache_beam/dataframe/frames_test.py
@@ -1685,8 +1685,15 @@ def test_groupby_series(self, agg_type):
           "https://github.com/apache/beam/issues/20967: proxy generation of "
           "DataFrameGroupBy.describe fails in pandas < 1.2")
 
+    kwargs = {}
+    # Behavior for numeric_only in these methods changed in Pandas 2 to default
+    # to False instead of True, so explicitly make it True in Pandas 2.
+    if PD_VERSION >= (2, 0) and agg_type in ('corr', 'cov', 'quantile'):
+      kwargs["numeric_only"] = True
+
     self._run_test(
-        lambda df: getattr(df[df.foo > 40].groupby(df.group), agg_type)(),
+        lambda df: getattr(df[df.foo > 40].groupby(df.group), agg_type)
+        (**kwargs),
         GROUPBY_DF,
         check_proxy=False)
 
@@ -1900,12 +1907,19 @@ def test_dataframe_groupby_series(self, agg_type):
       self.skipTest(
           "https://github.com/apache/beam/issues/20967: proxy generation of "
           "DataFrameGroupBy.describe fails in pandas < 1.2")
+
+    kwargs = {}
+    # Behavior for numeric_only in these methods changed in Pandas 2 to default
+    # to False instead of True, so explicitly make it True in Pandas 2.
+    if PD_VERSION >= (2, 0) and agg_type in ('corr', 'cov', 'quantile'):
+      kwargs["numeric_only"] = True
+
     self._run_test(
-        lambda df: df[df.foo > 40].groupby(df.group).agg(agg_type),
+        lambda df: df[df.foo > 40].groupby(df.group).agg(agg_type, **kwargs),
         GROUPBY_DF,
         check_proxy=False)
     self._run_test(
-        lambda df: df[df.foo > 40].groupby(df.foo % 3).agg(agg_type),
+        lambda df: df[df.foo > 40].groupby(df.foo % 3).agg(agg_type, **kwargs),
         GROUPBY_DF,
         check_proxy=False)
 

From bc715c62891850104ec3226e75d573519c68d130 Mon Sep 17 00:00:00 2001
From: Chris Neffshade <caneff@google.com>
Date: Thu, 14 Sep 2023 17:17:39 +0000
Subject: [PATCH 22/26] Add tests for 2.0 default case

---
 .../apache_beam/dataframe/frames_test.py       | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/sdks/python/apache_beam/dataframe/frames_test.py b/sdks/python/apache_beam/dataframe/frames_test.py
index b429de8227ef..98296a9f1926 100644
--- a/sdks/python/apache_beam/dataframe/frames_test.py
+++ b/sdks/python/apache_beam/dataframe/frames_test.py
@@ -1688,7 +1688,14 @@ def test_groupby_series(self, agg_type):
     kwargs = {}
     # Behavior for numeric_only in these methods changed in Pandas 2 to default
     # to False instead of True, so explicitly make it True in Pandas 2.
-    if PD_VERSION >= (2, 0) and agg_type in ('corr', 'cov', 'quantile'):
+    # The default will fail due to non-numeric fields in Pandas 2.
+    if PD_VERSION >= (2, 0) and agg_type in ('corr', 'cov'):
+      with self.assertRaises(ValueError):
+        self._run_test(
+            lambda df: getattr(df[df.foo > 40].groupby(df.group), agg_type)(),
+            GROUPBY_DF,
+            check_proxy=False)
+
       kwargs["numeric_only"] = True
 
     self._run_test(
@@ -1911,7 +1918,14 @@ def test_dataframe_groupby_series(self, agg_type):
     kwargs = {}
     # Behavior for numeric_only in these methods changed in Pandas 2 to default
     # to False instead of True, so explicitly make it True in Pandas 2.
-    if PD_VERSION >= (2, 0) and agg_type in ('corr', 'cov', 'quantile'):
+    # The default will fail due to non-numeric fields in Pandas 2.
+    if PD_VERSION >= (2, 0) and agg_type in ('corr', 'cov'):
+      with self.assertRaises(ValueError):
+        self._run_test(
+            lambda df: df[df.foo > 40].groupby(df.group).agg(agg_type),
+            GROUPBY_DF,
+            check_proxy=False)
+
       kwargs["numeric_only"] = True
 
     self._run_test(

From c3f8f05d02650fb10db5f09127e8314889afddf5 Mon Sep 17 00:00:00 2001
From: Chris Neffshade <caneff@google.com>
Date: Fri, 15 Sep 2023 15:19:15 +0000
Subject: [PATCH 23/26] Fix which methods to use to compare

---
 sdks/python/apache_beam/dataframe/frames_test.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/sdks/python/apache_beam/dataframe/frames_test.py b/sdks/python/apache_beam/dataframe/frames_test.py
index 98296a9f1926..ca62f3087fcf 100644
--- a/sdks/python/apache_beam/dataframe/frames_test.py
+++ b/sdks/python/apache_beam/dataframe/frames_test.py
@@ -1634,6 +1634,11 @@ def test_pivot_no_index_provided_on_multiindex(self):
 # https://github.com/pandas-dev/pandas/issues/40139
 ALL_GROUPING_AGGREGATIONS = sorted(
     set(frames.ALL_AGGREGATIONS) - set(('kurt', 'kurtosis')))
+# In Pandas 2 all these change to having default False for numeric_only.
+# The other methods already started that way.
+NUMERIC_ONLY_DEFAULT_TRUE_FOR_PANDAS_LT_2_AGGREGATIONS = set(
+    frames.ALL_AGGREGATIONS) - set(
+        ('count', 'idxmin', 'idxmax', 'mode', 'rank'))
 
 
 class GroupByTest(_AbstractFrameTest):
@@ -1689,7 +1694,10 @@ def test_groupby_series(self, agg_type):
     # Behavior for numeric_only in these methods changed in Pandas 2 to default
     # to False instead of True, so explicitly make it True in Pandas 2.
     # The default will fail due to non-numeric fields in Pandas 2.
-    if PD_VERSION >= (2, 0) and agg_type in ('corr', 'cov'):
+
+    if PD_VERSION >= (
+        2, 0
+    ) and agg_type in NUMERIC_ONLY_DEFAULT_TRUE_FOR_PANDAS_LT_2_AGGREGATIONS:
       with self.assertRaises(ValueError):
         self._run_test(
             lambda df: getattr(df[df.foo > 40].groupby(df.group), agg_type)(),
@@ -1919,7 +1927,9 @@ def test_dataframe_groupby_series(self, agg_type):
     # Behavior for numeric_only in these methods changed in Pandas 2 to default
     # to False instead of True, so explicitly make it True in Pandas 2.
     # The default will fail due to non-numeric fields in Pandas 2.
-    if PD_VERSION >= (2, 0) and agg_type in ('corr', 'cov'):
+    if PD_VERSION >= (
+        2, 0
+    ) and agg_type in NUMERIC_ONLY_DEFAULT_TRUE_FOR_PANDAS_LT_2_AGGREGATIONS:
       with self.assertRaises(ValueError):
         self._run_test(
             lambda df: df[df.foo > 40].groupby(df.group).agg(agg_type),

From f4f56c51c6fc021dbf6e96b40652daa6d53db361 Mon Sep 17 00:00:00 2001
From: Chris Neffshade <caneff@google.com>
Date: Fri, 15 Sep 2023 16:32:40 +0000
Subject: [PATCH 24/26] Fix tests

---
 .../apache_beam/dataframe/frames_test.py      | 59 +++++++++----------
 1 file changed, 29 insertions(+), 30 deletions(-)

diff --git a/sdks/python/apache_beam/dataframe/frames_test.py b/sdks/python/apache_beam/dataframe/frames_test.py
index ca62f3087fcf..1f3de5bcf80f 100644
--- a/sdks/python/apache_beam/dataframe/frames_test.py
+++ b/sdks/python/apache_beam/dataframe/frames_test.py
@@ -1635,10 +1635,10 @@ def test_pivot_no_index_provided_on_multiindex(self):
 ALL_GROUPING_AGGREGATIONS = sorted(
     set(frames.ALL_AGGREGATIONS) - set(('kurt', 'kurtosis')))
 # In Pandas 2 all these change to having default False for numeric_only.
-# The other methods already started that way.
+# The other methods already started that way or don't have that argument.
 NUMERIC_ONLY_DEFAULT_TRUE_FOR_PANDAS_LT_2_AGGREGATIONS = set(
     frames.ALL_AGGREGATIONS) - set(
-        ('count', 'idxmin', 'idxmax', 'mode', 'rank'))
+        ('count', 'idxmin', 'idxmax', 'mode', 'rank', 'all', 'any'))
 
 
 class GroupByTest(_AbstractFrameTest):
@@ -1690,27 +1690,21 @@ def test_groupby_series(self, agg_type):
           "https://github.com/apache/beam/issues/20967: proxy generation of "
           "DataFrameGroupBy.describe fails in pandas < 1.2")
 
-    kwargs = {}
+    def agg(df, **kwargs):
+      return getattr(df[df.foo > 40].groupby(df.group), agg_type)(**kwargs)
+
     # Behavior for numeric_only in these methods changed in Pandas 2 to default
     # to False instead of True, so explicitly make it True in Pandas 2.
     # The default will fail due to non-numeric fields in Pandas 2.
-
     if PD_VERSION >= (
         2, 0
     ) and agg_type in NUMERIC_ONLY_DEFAULT_TRUE_FOR_PANDAS_LT_2_AGGREGATIONS:
       with self.assertRaises(ValueError):
-        self._run_test(
-            lambda df: getattr(df[df.foo > 40].groupby(df.group), agg_type)(),
-            GROUPBY_DF,
-            check_proxy=False)
-
-      kwargs["numeric_only"] = True
-
-    self._run_test(
-        lambda df: getattr(df[df.foo > 40].groupby(df.group), agg_type)
-        (**kwargs),
-        GROUPBY_DF,
-        check_proxy=False)
+        self._run_test(agg, GROUPBY_DF, check_proxy=False)
+      self._run_test(
+          lambda df: agg(df, numeric_only=True), GROUPBY_DF, check_proxy=False)
+    else:
+      self._run_test(agg, GROUPBY_DF, check_proxy=False)
 
   def test_groupby_user_guide(self):
     # Example from https://pandas.pydata.org/docs/user_guide/groupby.html
@@ -1923,7 +1917,9 @@ def test_dataframe_groupby_series(self, agg_type):
           "https://github.com/apache/beam/issues/20967: proxy generation of "
           "DataFrameGroupBy.describe fails in pandas < 1.2")
 
-    kwargs = {}
+    def agg(df, group_by, **kwargs):
+      return df[df.foo > 40].groupby(group_by).agg(agg_type, **kwargs)
+
     # Behavior for numeric_only in these methods changed in Pandas 2 to default
     # to False instead of True, so explicitly make it True in Pandas 2.
     # The default will fail due to non-numeric fields in Pandas 2.
@@ -1932,20 +1928,23 @@ def test_dataframe_groupby_series(self, agg_type):
     ) and agg_type in NUMERIC_ONLY_DEFAULT_TRUE_FOR_PANDAS_LT_2_AGGREGATIONS:
       with self.assertRaises(ValueError):
         self._run_test(
-            lambda df: df[df.foo > 40].groupby(df.group).agg(agg_type),
-            GROUPBY_DF,
-            check_proxy=False)
-
-      kwargs["numeric_only"] = True
+            lambda df: agg(df, df.group), GROUPBY_DF, check_proxy=False)
+        self._run_test(
+            lambda df: agg(df, df.foo % 3), GROUPBY_DF, check_proxy=False)
 
-    self._run_test(
-        lambda df: df[df.foo > 40].groupby(df.group).agg(agg_type, **kwargs),
-        GROUPBY_DF,
-        check_proxy=False)
-    self._run_test(
-        lambda df: df[df.foo > 40].groupby(df.foo % 3).agg(agg_type, **kwargs),
-        GROUPBY_DF,
-        check_proxy=False)
+      self._run_test(
+          lambda df: agg(df, df.group, numeric_only=True),
+          GROUPBY_DF,
+          check_proxy=False)
+      self._run_test(
+          lambda df: agg(df, df.foo % 3, numeric_only=True),
+          GROUPBY_DF,
+          check_proxy=False)
+    else:
+      self._run_test(
+          lambda df: agg(df, df.group), GROUPBY_DF, check_proxy=False)
+      self._run_test(
+          lambda df: agg(df, df.foo % 3), GROUPBY_DF, check_proxy=False)
 
   @parameterized.expand(ALL_GROUPING_AGGREGATIONS)
   def test_series_groupby_series(self, agg_type):

From 5316fe65e6fd60eb2b22a7c30aef90b57c980c5b Mon Sep 17 00:00:00 2001
From: Christopher Neffshade <caneff@google.com>
Date: Sun, 17 Sep 2023 23:56:38 +0000
Subject: [PATCH 25/26] Add all remaining numeric_only places

---
 .../apache_beam/dataframe/frames_test.py      | 118 +++++++++---------
 1 file changed, 59 insertions(+), 59 deletions(-)

diff --git a/sdks/python/apache_beam/dataframe/frames_test.py b/sdks/python/apache_beam/dataframe/frames_test.py
index 1f3de5bcf80f..7feee9154c95 100644
--- a/sdks/python/apache_beam/dataframe/frames_test.py
+++ b/sdks/python/apache_beam/dataframe/frames_test.py
@@ -1634,11 +1634,30 @@ def test_pivot_no_index_provided_on_multiindex(self):
 # https://github.com/pandas-dev/pandas/issues/40139
 ALL_GROUPING_AGGREGATIONS = sorted(
     set(frames.ALL_AGGREGATIONS) - set(('kurt', 'kurtosis')))
-# In Pandas 2 all these change to having default False for numeric_only.
-# The other methods already started that way or don't have that argument.
-NUMERIC_ONLY_DEFAULT_TRUE_FOR_PANDAS_LT_2_AGGREGATIONS = set(
-    frames.ALL_AGGREGATIONS) - set(
-        ('count', 'idxmin', 'idxmax', 'mode', 'rank', 'all', 'any'))
+AGGREGATIONS_WHERE_NUMERIC_ONLY_DEFAULTS_TO_TRUE_IN_PANDAS_1 = set(
+    frames.ALL_AGGREGATIONS) - set((
+        'nunique',
+        'size',
+        'count',
+        'idxmin',
+        'idxmax',
+        'mode',
+        'rank',
+        'all',
+        'any',
+        'describe'))
+
+
+def numeric_only_kwargs_for_pandas_2(agg_type: str) -> dict[str, bool]:
+  """Get proper arguments for numeric_only.
+
+  Behavior for numeric_only in these methods changed in Pandas 2 to default
+  to False instead of True, so explicitly make it True in Pandas 2."""
+  if PD_VERSION >= (2, 0) and (
+      agg_type in AGGREGATIONS_WHERE_NUMERIC_ONLY_DEFAULTS_TO_TRUE_IN_PANDAS_1):
+    return {'numeric_only': True}
+  else:
+    return {}
 
 
 class GroupByTest(_AbstractFrameTest):
@@ -1655,8 +1674,9 @@ def test_groupby_agg(self, agg_type):
       self.skipTest(
           "https://github.com/apache/beam/issues/20967: proxy generation of "
           "DataFrameGroupBy.describe fails in pandas < 1.2")
+    kwargs = numeric_only_kwargs_for_pandas_2(agg_type)
     self._run_test(
-        lambda df: df.groupby('group').agg(agg_type),
+        lambda df: df.groupby('group').agg(agg_type, **kwargs),
         GROUPBY_DF,
         check_proxy=False)
 
@@ -1666,8 +1686,10 @@ def test_groupby_with_filter(self, agg_type):
       self.skipTest(
           "https://github.com/apache/beam/issues/20967: proxy generation of "
           "DataFrameGroupBy.describe fails in pandas < 1.2")
+    kwargs = numeric_only_kwargs_for_pandas_2(agg_type)
     self._run_test(
-        lambda df: getattr(df[df.foo > 30].groupby('group'), agg_type)(),
+        lambda df: getattr(df[df.foo > 30].groupby('group'), agg_type)
+        (**kwargs),
         GROUPBY_DF,
         check_proxy=False)
 
@@ -1678,8 +1700,9 @@ def test_groupby(self, agg_type):
           "https://github.com/apache/beam/issues/20967: proxy generation of "
           "DataFrameGroupBy.describe fails in pandas < 1.2")
 
+    kwargs = numeric_only_kwargs_for_pandas_2(agg_type)
     self._run_test(
-        lambda df: getattr(df.groupby('group'), agg_type)(),
+        lambda df: getattr(df.groupby('group'), agg_type)(**kwargs),
         GROUPBY_DF,
         check_proxy=False)
 
@@ -1690,21 +1713,12 @@ def test_groupby_series(self, agg_type):
           "https://github.com/apache/beam/issues/20967: proxy generation of "
           "DataFrameGroupBy.describe fails in pandas < 1.2")
 
-    def agg(df, **kwargs):
-      return getattr(df[df.foo > 40].groupby(df.group), agg_type)(**kwargs)
-
-    # Behavior for numeric_only in these methods changed in Pandas 2 to default
-    # to False instead of True, so explicitly make it True in Pandas 2.
-    # The default will fail due to non-numeric fields in Pandas 2.
-    if PD_VERSION >= (
-        2, 0
-    ) and agg_type in NUMERIC_ONLY_DEFAULT_TRUE_FOR_PANDAS_LT_2_AGGREGATIONS:
-      with self.assertRaises(ValueError):
-        self._run_test(agg, GROUPBY_DF, check_proxy=False)
-      self._run_test(
-          lambda df: agg(df, numeric_only=True), GROUPBY_DF, check_proxy=False)
-    else:
-      self._run_test(agg, GROUPBY_DF, check_proxy=False)
+    kwargs = numeric_only_kwargs_for_pandas_2(agg_type)
+    self._run_test(
+        lambda df: getattr(df[df.foo > 40].groupby(df.group), agg_type)
+        (**kwargs),
+        GROUPBY_DF,
+        check_proxy=False)
 
   def test_groupby_user_guide(self):
     # Example from https://pandas.pydata.org/docs/user_guide/groupby.html
@@ -1733,12 +1747,15 @@ def test_groupby_project_series(self, agg_type):
           "https://github.com/apache/beam/issues/20895: "
           "SeriesGroupBy.{corr, cov} do not raise the expected error.")
 
-    self._run_test(lambda df: getattr(df.groupby('group').foo, agg_type)(), df)
-    self._run_test(lambda df: getattr(df.groupby('group').bar, agg_type)(), df)
+    kwargs = numeric_only_kwargs_for_pandas_2(agg_type)
     self._run_test(
-        lambda df: getattr(df.groupby('group')['foo'], agg_type)(), df)
+        lambda df: getattr(df.groupby('group').foo, agg_type)(**kwargs), df)
     self._run_test(
-        lambda df: getattr(df.groupby('group')['bar'], agg_type)(), df)
+        lambda df: getattr(df.groupby('group').bar, agg_type)(**kwargs), df)
+    self._run_test(
+        lambda df: getattr(df.groupby('group')['foo'], agg_type)(**kwargs), df)
+    self._run_test(
+        lambda df: getattr(df.groupby('group')['bar'], agg_type)(**kwargs), df)
 
   @parameterized.expand(ALL_GROUPING_AGGREGATIONS)
   def test_groupby_project_dataframe(self, agg_type):
@@ -1746,8 +1763,10 @@ def test_groupby_project_dataframe(self, agg_type):
       self.skipTest(
           "https://github.com/apache/beam/issues/20967: proxy generation of "
           "DataFrameGroupBy.describe fails in pandas < 1.2")
+    kwargs = numeric_only_kwargs_for_pandas_2(agg_type)
     self._run_test(
-        lambda df: getattr(df.groupby('group')[['bar', 'baz']], agg_type)(),
+        lambda df: getattr(df.groupby('group')[['bar', 'baz']], agg_type)
+        (**kwargs),
         GROUPBY_DF,
         check_proxy=False)
 
@@ -1776,9 +1795,10 @@ def test_groupby_errors_non_existent_label(self):
 
   def test_groupby_callable(self):
     df = GROUPBY_DF
-
-    self._run_test(lambda df: df.groupby(lambda x: x % 2).foo.sum(), df)
-    self._run_test(lambda df: df.groupby(lambda x: x % 5).median(), df)
+    kwargs = numeric_only_kwargs_for_pandas_2('sum')
+    self._run_test(lambda df: df.groupby(lambda x: x % 2).foo.sum(**kwargs), df)
+    kwargs = numeric_only_kwargs_for_pandas_2('median')
+    self._run_test(lambda df: df.groupby(lambda x: x % 5).median(**kwargs), df)
 
   def test_groupby_apply(self):
     df = GROUPBY_DF
@@ -1833,8 +1853,9 @@ def test_groupby_transform(self):
 
   def test_groupby_pipe(self):
     df = GROUPBY_DF
-
-    self._run_test(lambda df: df.groupby('group').pipe(lambda x: x.sum()), df)
+    kwargs = numeric_only_kwargs_for_pandas_2('sum')
+    self._run_test(
+        lambda df: df.groupby('group').pipe(lambda x: x.sum(**kwargs)), df)
     self._run_test(
         lambda df: df.groupby('group')['bool'].pipe(lambda x: x.any()), df)
     self._run_test(
@@ -1917,34 +1938,13 @@ def test_dataframe_groupby_series(self, agg_type):
           "https://github.com/apache/beam/issues/20967: proxy generation of "
           "DataFrameGroupBy.describe fails in pandas < 1.2")
 
-    def agg(df, group_by, **kwargs):
+    def agg(df, group_by):
+      kwargs = numeric_only_kwargs_for_pandas_2(agg_type)
       return df[df.foo > 40].groupby(group_by).agg(agg_type, **kwargs)
 
-    # Behavior for numeric_only in these methods changed in Pandas 2 to default
-    # to False instead of True, so explicitly make it True in Pandas 2.
-    # The default will fail due to non-numeric fields in Pandas 2.
-    if PD_VERSION >= (
-        2, 0
-    ) and agg_type in NUMERIC_ONLY_DEFAULT_TRUE_FOR_PANDAS_LT_2_AGGREGATIONS:
-      with self.assertRaises(ValueError):
-        self._run_test(
-            lambda df: agg(df, df.group), GROUPBY_DF, check_proxy=False)
-        self._run_test(
-            lambda df: agg(df, df.foo % 3), GROUPBY_DF, check_proxy=False)
-
-      self._run_test(
-          lambda df: agg(df, df.group, numeric_only=True),
-          GROUPBY_DF,
-          check_proxy=False)
-      self._run_test(
-          lambda df: agg(df, df.foo % 3, numeric_only=True),
-          GROUPBY_DF,
-          check_proxy=False)
-    else:
-      self._run_test(
-          lambda df: agg(df, df.group), GROUPBY_DF, check_proxy=False)
-      self._run_test(
-          lambda df: agg(df, df.foo % 3), GROUPBY_DF, check_proxy=False)
+    self._run_test(lambda df: agg(df, df.group), GROUPBY_DF, check_proxy=False)
+    self._run_test(
+        lambda df: agg(df, df.foo % 3), GROUPBY_DF, check_proxy=False)
 
   @parameterized.expand(ALL_GROUPING_AGGREGATIONS)
   def test_series_groupby_series(self, agg_type):

From ccd88be2309b3da26e4f037c7aa4fe963892dc22 Mon Sep 17 00:00:00 2001
From: Christopher Neffshade <caneff@google.com>
Date: Mon, 18 Sep 2023 15:37:17 +0000
Subject: [PATCH 26/26] Fix typing

---
 sdks/python/apache_beam/dataframe/frames_test.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sdks/python/apache_beam/dataframe/frames_test.py b/sdks/python/apache_beam/dataframe/frames_test.py
index 7feee9154c95..b8b8cb733ef1 100644
--- a/sdks/python/apache_beam/dataframe/frames_test.py
+++ b/sdks/python/apache_beam/dataframe/frames_test.py
@@ -17,6 +17,7 @@
 import re
 import unittest
 import warnings
+from typing import Dict
 
 import numpy as np
 import pandas as pd
@@ -1648,7 +1649,7 @@ def test_pivot_no_index_provided_on_multiindex(self):
         'describe'))
 
 
-def numeric_only_kwargs_for_pandas_2(agg_type: str) -> dict[str, bool]:
+def numeric_only_kwargs_for_pandas_2(agg_type: str) -> Dict[str, bool]:
   """Get proper arguments for numeric_only.
 
   Behavior for numeric_only in these methods changed in Pandas 2 to default