Skip to content

Commit

Permalink
Fix remaining tests for pandas 2 compatibility (#28524)
Browse files Browse the repository at this point in the history
  • Loading branch information
caneff authored Sep 21, 2023
1 parent b5b69b1 commit fee9808
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 17 deletions.
4 changes: 4 additions & 0 deletions sdks/python/apache_beam/dataframe/frames_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1936,6 +1936,8 @@ def test_groupby_sum_min_count(self):

self._run_test(lambda df: df.groupby('group').sum(min_count=2), df)

@unittest.skipIf(
PD_VERSION >= (2, 0), "dtypes on groups is deprecated in Pandas 2.")
def test_groupby_dtypes(self):
self._run_test(
lambda df: df.groupby('group').dtypes, GROUPBY_DF, check_proxy=False)
Expand Down Expand Up @@ -2159,6 +2161,7 @@ def test_dataframe_agg_level(self):
level=1, numeric_only=True),
GROUPBY_DF)

@unittest.skipIf(PD_VERSION >= (2, 0), "level argument removed in Pandas 2")
def test_series_agg_multifunc_level(self):
# level= is ignored for multiple agg fns
self._run_test(
Expand All @@ -2181,6 +2184,7 @@ def test_series_mean_skipna(self):
self._run_test(lambda df: df.two.mean(skipna=True), df)
self._run_test(lambda df: df.three.mean(skipna=True), df)

@unittest.skipIf(PD_VERSION >= (2, 0), "level argument removed in Pandas 2")
def test_dataframe_agg_multifunc_level(self):
# level= is ignored for multiple agg fns
self._run_test(
Expand Down
61 changes: 45 additions & 16 deletions sdks/python/apache_beam/dataframe/pandas_doctests_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,9 @@ def test_ndframe_tests(self):
' key=lambda x: np.argsort(index_natsorted(df["time"]))\n'
')'
],
# TODO(https://github.com/apache/beam/issues/28559): Re-enable when
# bug is fixed.
'pandas.core.generic.NDFrame.xs': ['*'],
**skip_writes
})
self.assertEqual(result.failed, 0)
Expand Down Expand Up @@ -296,13 +299,19 @@ def test_dataframe_tests(self):
'pandas.core.frame.DataFrame.value_counts': [
'df.value_counts(dropna=False)'
],

'pandas.core.frame.DataFrame.to_timestamp': ['*']
},
skip={
# DataFrame construction from a dictionary and
# Series requires using the len() function, which
# is a non-deferred operation that we do not allow
# DataFrame construction from a dictionary, Series, or other
# DataFrame requires using the len() function, which is a
# non-deferred operation that we do not allow
'pandas.core.frame.DataFrame': [
'pd.DataFrame(data=d, index=[0, 1, 2, 3])',
'df = pd.DataFrame(data=ser, index=["a", "c"])',
'df',
'df2 = pd.DataFrame(data=df1, index=["a", "c"])',
'df2',
],
# s2 created with reindex
'pandas.core.frame.DataFrame.dot': [
Expand Down Expand Up @@ -361,15 +370,17 @@ def test_dataframe_tests(self):
# actually raise NotImplementedError
'pandas.core.frame.DataFrame.pivot_table': ['*'],
# Expected to raise a ValueError, but we raise NotImplementedError
# pylint: disable=line-too-long
'pandas.core.frame.DataFrame.pivot': [
"df.pivot(index='foo', columns='bar', values='baz')",
"df.pivot(index='foo', columns='bar')['baz']",
"df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])",
# pylint: disable=line-too-long
'df.pivot(index="lev1", columns=["lev2", "lev3"],values="values")',
# pylint: disable=line-too-long
'df.pivot(index=["lev1", "lev2"], columns=["lev3"],values="values")'
'df.pivot(index=["lev1", "lev2"], columns=["lev3"],values="values")',
'df.pivot(index="lev1", columns=["lev2", "lev3"], values="values")',
'df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values")',
],
# pylint: enable=line-too-long
'pandas.core.frame.DataFrame.append': [
'df',
# pylint: disable=line-too-long
Expand Down Expand Up @@ -511,6 +522,8 @@ def test_series_tests(self):
'ser.groupby(["a", "b", "a", np.nan]).mean()',
'ser.groupby(["a", "b", "a", np.nan], dropna=False).mean()',
],
'pandas.core.series.Series.to_period': ['*'],
'pandas.core.series.Series.to_timestamp': ['*'],
},
skip={
# Relies on setting values with iloc
Expand All @@ -535,6 +548,8 @@ def test_series_tests(self):
'pandas.core.series.Series.idxmin': ['s.idxmin()'],
'pandas.core.series.Series.idxmax': ['s.idxmax()'],
'pandas.core.series.Series.duplicated': ['*'],
# Relies on setting index.
'pandas.core.series.Series.rename_axis': ['*'],
'pandas.core.series.Series.set_axis': ['*'],
'pandas.core.series.Series.nonzero': ['*'],
'pandas.core.series.Series.pop': ['ser'], # testing side effect
Expand Down Expand Up @@ -710,6 +725,7 @@ def test_groupby_tests(self):
'pandas.core.groupby.groupby.GroupBy.nth': ['*'],
'pandas.core.groupby.groupby.GroupBy.cumcount': ['*'],
'pandas.core.groupby.groupby.GroupBy.resample': ['*'],
'pandas.core.groupby.groupby.GroupBy.rolling': ['*'],
},
not_implemented_ok={
'pandas.core.groupby.groupby.GroupBy.first': ['*'],
Expand Down Expand Up @@ -764,16 +780,21 @@ def test_groupby_tests(self):
'df.fillna(method=\'ffill\')',
'df.fillna(method="ffill")',
'df.fillna(value=values, limit=1)',
'df.groupby("key").fillna(method="ffill")',
'df.groupby("key").fillna(method="bfill")',
'df.groupby("key").fillna(method="ffill", limit=1)',
],
'pandas.core.groupby.generic.SeriesGroupBy.fillna': [
'df.fillna(method=\'ffill\')',
'df.fillna(method="ffill")',
'df.fillna(value=values, limit=1)',
],
'pandas.core.groupby.groupby.GroupBy.tail': ['*'],
},
not_implemented_ok={
'pandas.core.groupby.generic.DataFrameGroupBy.idxmax': ['*'],
'pandas.core.groupby.generic.DataFrameGroupBy.idxmin': ['*'],
'pandas.core.groupby.generic.DataFrameGroupBy.transform': ['*'],
'pandas.core.groupby.generic.SeriesGroupBy.transform': ['*'],
'pandas.core.groupby.generic.SeriesGroupBy.idxmax': ['*'],
'pandas.core.groupby.generic.SeriesGroupBy.idxmin': ['*'],
Expand All @@ -794,14 +815,6 @@ def test_groupby_tests(self):
# These examples rely on grouping by a list
'pandas.core.groupby.generic.SeriesGroupBy.aggregate': ['*'],
'pandas.core.groupby.generic.DataFrameGroupBy.aggregate': ['*'],
'pandas.core.groupby.generic.SeriesGroupBy.transform': [
# Dropping invalid columns during a transform is unsupported.
'grouped.transform(lambda x: (x - x.mean()) / x.std())'
],
'pandas.core.groupby.generic.DataFrameGroupBy.transform': [
# Dropping invalid columns during a transform is unsupported.
'grouped.transform(lambda x: (x - x.mean()) / x.std())'
],
# Skipped idxmax/idxmin due to an issue with the test framework
'pandas.core.groupby.generic.SeriesGroupBy.idxmin': ['s.idxmin()'],
'pandas.core.groupby.generic.SeriesGroupBy.idxmax': ['s.idxmax()'],
Expand All @@ -811,7 +824,24 @@ def test_groupby_tests(self):
# pylint: disable=line-too-long
"df.groupby('gender', as_index=False).value_counts(normalize=True)",
],
})
# These examples rely on grouping by a list
'pandas.core.groupby.generic.SeriesGroupBy.fillna': ['*'],
# These examples rely on grouping by a list
'pandas.core.groupby.generic.DataFrameGroupBy.fillna': ['*'],
# These examples rely on grouping by a list
'pandas.core.groupby.generic.SeriesGroupBy.take': ['*'],
# These examples rely on grouping by a list
'pandas.core.groupby.generic.DataFrameGroupBy.take': ['*'],
# Named aggregation not supported yet.
'pandas.core.groupby.generic.NamedAgg': [
'df.groupby("key").agg(result_a=agg_a, result_1=agg_1)'
],
# These examples rely on grouping by a list
'pandas.core.groupby.generic.DataFrameGroupBy.transform': ['*'],
# These examples rely on grouping by a list
'pandas.core.groupby.generic.SeriesGroupBy.transform': ['*'],
},
)
self.assertEqual(result.failed, 0)

def test_top_level(self):
Expand Down Expand Up @@ -843,7 +873,6 @@ def test_top_level(self):
'pivot_table': ['*'],
'qcut': ['*'],
'reset_option': ['*'],
'set_eng_float_format': ['*'],
'set_option': ['*'],
'to_numeric': ['*'],
'to_timedelta': ['*'],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ def concat(
period_range = _defer_to_pandas('period_range')
pivot = _call_on_first_arg('pivot')
pivot_table = _call_on_first_arg('pivot_table')
set_eng_float_format = _defer_to_pandas('set_eng_float_format')
show_versions = _defer_to_pandas('show_versions')
test = frame_base.wont_implement_method(
pd,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ def test_dataframes_with_grouped_index(self):
Record('c', 18, 150)
]

aggregate = lambda df: df.groupby('height').mean()
aggregate = lambda df: df.groupby('height').mean(numeric_only=True)

deferred_df = aggregate(to_dataframe(p | beam.Create(data)))
df_expected = aggregate(pd.DataFrame(data))
Expand Down

0 comments on commit fee9808

Please sign in to comment.