Skip to content

Commit

Permalink
Upgrade Pandas dependency to 2.1 (#31185)
Browse files Browse the repository at this point in the history
* Upgrade to Pandas 2.1

* Pandas 2.1: Disable interchange protocol tests.

* Exclude attrs tests as it is not supported.

* Exclude new doctests that exercise unsupported order-sensitive ops.

* Iteration over deferred DFs is not supported

* Skip 'mul' op when index is used as an axis

* Exclude new tests that use index.

* Exclude shift test as order-sensitive.

* Exclude known failure modes.

* Exclude failures that existed on Pandas 1.

* Allow bulk-exclusion of an example in all tests.

* Exclude examples that use to_timedelta.

* Exclude the test that evaluates an inferred .tz value.

* Exclude more tz and timedelta tests.

* Exclude a test exercising PeriodProperties.end_time

* Exclude tests exercising unsupported GroupBy operations.

* Expand the list of elementwise string methods.

* Exclude known WontImpl ops

* Fix test output normalization.

* Exclude remaining new tests that didn't work

* Remove test that uses values, an unsupported non-deferred op.

* lint
  • Loading branch information
tvalentyn authored May 7, 2024
1 parent bb51380 commit 2ca9af8
Show file tree
Hide file tree
Showing 5 changed files with 190 additions and 15 deletions.
16 changes: 11 additions & 5 deletions sdks/python/apache_beam/dataframe/doctests.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,8 @@ def concat(values):

def fix(self, want, got):
if 'DeferredBase' in got:
# When we have a tuple of Dataframes, pandas prints each from a new line.
got = re.sub(r'DeferredBase\[(\d+)\],', '\\g<0>\n', got)
try:
to_compute = {
m.group(0): self._env._all_frames[int(m.group(1))]
Expand Down Expand Up @@ -381,20 +383,23 @@ def to_callable(cond):
self._skipped_set = set()

def _is_wont_implement_ok(self, example, test):
always_wont_implement = self._wont_implement_ok.get('*', [])
return any(
wont_implement(example)
for wont_implement in self._wont_implement_ok.get(test.name, []))
wont_implement(example) for wont_implement in (
self._wont_implement_ok.get(test.name, []) + always_wont_implement))

def _is_not_implemented_ok(self, example, test):
always_not_impl = self._not_implemented_ok.get('*', [])
return any(
not_implemented(example)
for not_implemented in self._not_implemented_ok.get(test.name, []))
not_implemented(example) for not_implemented in (
self._not_implemented_ok.get(test.name, []) + always_not_impl))

def run(self, test, **kwargs):
self._checker.reset()
always_skip = self._skip.get('*', [])
for example in test.examples:
if any(should_skip(example)
for should_skip in self._skip.get(test.name, [])):
for should_skip in self._skip.get(test.name, []) + always_skip):
self._skipped_set.add(example)
example.source = 'pass'
example.want = ''
Expand Down Expand Up @@ -726,6 +731,7 @@ def wrapper(fn):
verify the examples, else use PartitioningSession to simulate
distributed execution.
skip (Dict[str,str]): A set of examples to skip entirely.
If a key is '*', an example will be skipped in all test scenarios.
wont_implement_ok (Dict[str,str]): A set of examples that are allowed to
raise WontImplementError.
not_implemented_ok (Dict[str,str]): A set of examples that are allowed to
Expand Down
19 changes: 17 additions & 2 deletions sdks/python/apache_beam/dataframe/frames.py
Original file line number Diff line number Diff line change
Expand Up @@ -1181,8 +1181,11 @@ def _set_index(self, value):
pd.DataFrame, 'hist', reason="plotting-tools")

attrs = property(
frame_base.wont_implement_method(
pd.DataFrame, 'attrs', reason='experimental'))
fget=frame_base.wont_implement_method(
pd.DataFrame, 'attrs', reason='experimental'),
fset=frame_base.wont_implement_method(
pd.DataFrame, 'attrs', reason='experimental'),
)

reorder_levels = frame_base._proxy_method(
'reorder_levels',
Expand Down Expand Up @@ -5124,13 +5127,18 @@ def rsplit(self, **kwargs):
ELEMENTWISE_STRING_METHODS = [
'capitalize',
'casefold',
'center',
'contains',
'count',
'decode',
'encode',
'endswith',
'extract',
'find',
'findall',
'fullmatch',
'get',
'index',
'isalnum',
'isalpha',
'isdecimal',
Expand All @@ -5142,22 +5150,29 @@ def rsplit(self, **kwargs):
'isupper',
'join',
'len',
'lfind',
'ljust',
'lower',
'lstrip',
'match',
'normalize',
'pad',
'partition',
'removeprefix',
'removesuffix',
'replace',
'rpartition',
'rfind',
'rindex',
'rjust',
'rstrip',
'slice',
'slice_replace',
'startswith',
'strip',
'swapcase',
'title',
'translate',
'upper',
'wrap',
'zfill',
Expand Down
14 changes: 14 additions & 0 deletions sdks/python/apache_beam/dataframe/frames_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from apache_beam.dataframe import frame_base
from apache_beam.dataframe import frames
from apache_beam.dataframe.convert import to_dataframe
from apache_beam.dataframe.doctests import teststring
from apache_beam.runners.interactive import interactive_beam as ib
from apache_beam.runners.interactive import interactive_environment as ie
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
Expand Down Expand Up @@ -363,6 +364,19 @@ def new_column(df):
})
self._run_inplace_test(new_column, df)

def test_tz_with_utc_zone_set_explicitly(self):
test = """
>>> s = pd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+03:00"])
>>> s = pd.to_datetime(s, utc=True)
>>> s
0 2020-01-01 10:00:00+00:00
1 2020-02-01 08:00:00+00:00
dtype: datetime64[ns, UTC]
>>> s.dt.tz
datetime.timezone.utc
"""
teststring(test)

def test_tz_localize_ambiguous_series(self):
# This replicates a tz_localize doctest:
# s.tz_localize('CET', ambiguous=np.array([True, True, False]))
Expand Down
Loading

0 comments on commit 2ca9af8

Please sign in to comment.