Skip to content

Commit

Permalink
Only value_counts.drop_na for Pandas 2 (#28500)
Browse files: browse the repository at this point in the history
  • Loading branch information
caneff authored Sep 19, 2023
1 parent f2a0c76 commit 0f2f3b1
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 3 deletions.
17 changes: 14 additions & 3 deletions sdks/python/apache_beam/dataframe/frames.py
Original file line number Diff line number Diff line change
Expand Up @@ -2348,8 +2348,13 @@ def value_counts(

result = column.groupby(column, dropna=dropna).size()

# groupby.size() names the index, which we don't need
result.index.name = None
# Pandas 2 introduces new naming for the results.
if PD_VERSION >= (2, 0):
result.index.name = getattr(self, "name", None)
result.name = "proportion" if normalize else "count"
else:
# groupby.size() names the index, which we don't need
result.index.name = None

if normalize:
return result / column.length()
Expand Down Expand Up @@ -4007,12 +4012,18 @@ def value_counts(self, subset=None, sort=False, normalize=False,
columns = subset or list(self.columns)

if dropna:
dropped = self.dropna()
# Must include subset here; otherwise we would spuriously drop rows whose
# only NA values are in columns outside our subset.
dropped = self.dropna(subset=subset)
else:
dropped = self

result = dropped.groupby(columns, dropna=dropna).size()

# Pandas 2 introduces new naming for the results.
if PD_VERSION >= (2,0):
result.name = "proportion" if normalize else "count"

if normalize:
return result/dropped.length()
else:
Expand Down
2 changes: 2 additions & 0 deletions sdks/python/apache_beam/dataframe/frames_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -694,6 +694,8 @@ def test_value_counts_with_nans(self):

self._run_test(lambda df: df.value_counts(), df)
self._run_test(lambda df: df.value_counts(normalize=True), df)
# Ensure we don't drop rows due to nan values in unused columns.
self._run_test(lambda df: df.value_counts('num_wings'), df)

if PD_VERSION >= (1, 3):
# dropna=False is new in pandas 1.3
Expand Down

0 comments on commit 0f2f3b1

Please sign in to comment.