Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Set check_exact to true if dtype is int #55934

Merged
merged 9 commits into from
Nov 27, 2023
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor

Other API changes
^^^^^^^^^^^^^^^^^
-
- ``check_exact`` now only takes effect for floating-point dtypes in :func:`testing.assert_frame_equal` and :func:`testing.assert_series_equal`. In particular, integer dtypes are always checked exactly (:issue:`55882`)
-

.. ---------------------------------------------------------------------------
Expand Down
18 changes: 12 additions & 6 deletions pandas/_testing/asserters.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from pandas.core.dtypes.common import (
is_bool,
is_float_dtype,
is_integer_dtype,
is_number,
is_numeric_dtype,
Expand Down Expand Up @@ -713,7 +714,7 @@ def assert_extension_array_equal(
index_values : Index | numpy.ndarray, default None
Optional index (shared by both left and right), used in output.
check_exact : bool, default False
Whether to compare number exactly.
Whether to compare number exactly. Only takes effect for float dtypes.
rtol : float, default 1e-5
Relative tolerance. Only used when check_exact is False.
atol : float, default 1e-8
Expand Down Expand Up @@ -782,7 +783,10 @@ def assert_extension_array_equal(

left_valid = left[~left_na].to_numpy(dtype=object)
right_valid = right[~right_na].to_numpy(dtype=object)
if check_exact:
if check_exact or (
(is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype))
or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype))
):
assert_numpy_array_equal(
left_valid, right_valid, obj=obj, index_values=index_values
)
Expand Down Expand Up @@ -836,7 +840,7 @@ def assert_series_equal(
check_names : bool, default True
Whether to check the Series and Index names attribute.
check_exact : bool, default False
Whether to compare number exactly.
Whether to compare number exactly. Only takes effect for float dtypes.
check_datetimelike_compat : bool, default False
Compare datetime-like which is comparable ignoring dtype.
check_categorical : bool, default True
Expand Down Expand Up @@ -929,8 +933,10 @@ def assert_series_equal(
pass
else:
assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}")

if check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype(right.dtype):
if check_exact or (
(is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype))
or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype))
):
left_values = left._values
right_values = right._values
# Only check exact if dtype is numeric
Expand Down Expand Up @@ -1093,7 +1099,7 @@ def assert_frame_equal(
Specify how to compare internal data. If False, compare by columns.
If True, compare by blocks.
check_exact : bool, default False
Whether to compare number exactly.
Whether to compare number exactly. Only takes effect for float dtypes.
check_datetimelike_compat : bool, default False
Compare datetime-like which is comparable ignoring dtype.
check_categorical : bool, default True
Expand Down
15 changes: 10 additions & 5 deletions pandas/tests/extension/base/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from pandas._typing import Dtype

from pandas.core.dtypes.common import is_bool_dtype
from pandas.core.dtypes.dtypes import NumpyEADtype
from pandas.core.dtypes.missing import na_value_for_dtype

import pandas as pd
Expand Down Expand Up @@ -331,7 +332,7 @@ def test_fillna_length_mismatch(self, data_missing):
data_missing.fillna(data_missing.take([1]))

# Subclasses can override if we expect e.g. Sparse[bool], boolean, pyarrow[bool]
_combine_le_expected_dtype: Dtype = np.dtype(bool)
_combine_le_expected_dtype: Dtype = NumpyEADtype("bool")

def test_combine_le(self, data_repeated):
# GH 20825
Expand All @@ -341,16 +342,20 @@ def test_combine_le(self, data_repeated):
s2 = pd.Series(orig_data2)
result = s1.combine(s2, lambda x1, x2: x1 <= x2)
expected = pd.Series(
[a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))],
dtype=self._combine_le_expected_dtype,
pd.array(
[a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))],
dtype=self._combine_le_expected_dtype,
)
)
tm.assert_series_equal(result, expected)

val = s1.iloc[0]
result = s1.combine(val, lambda x1, x2: x1 <= x2)
expected = pd.Series(
[a <= val for a in list(orig_data1)],
dtype=self._combine_le_expected_dtype,
pd.array(
[a <= val for a in list(orig_data1)],
dtype=self._combine_le_expected_dtype,
)
)
tm.assert_series_equal(result, expected)

Expand Down
3 changes: 3 additions & 0 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,9 @@ def test_dtype_backend_pyarrow(all_parsers, request):
tm.assert_frame_equal(result, expected)


# pyarrow engine failing:
# https://github.com/pandas-dev/pandas/issues/56136
@pytest.mark.usefixtures("pyarrow_xfail")
def test_ea_int_avoid_overflow(all_parsers):
# GH#32134
parser = all_parsers
Expand Down
10 changes: 8 additions & 2 deletions pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -572,7 +572,10 @@ def test_constructor_maskedarray(self):
data[1] = 1
result = Series(data, index=index)
expected = Series([0, 1, 2], index=index, dtype=int)
tm.assert_series_equal(result, expected)
with pytest.raises(AssertionError, match="Series classes are different"):
# TODO should this be raising at all?
# https://github.com/pandas-dev/pandas/issues/56131
tm.assert_series_equal(result, expected)
Comment on lines 572 to +578
Copy link
Member

@jorisvandenbossche jorisvandenbossche Dec 5, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a different issue from the one reported in #56131 (there is also no check_dtype=False here)

This doesn't involve different data types (it's all numpy dtypes), and the two objects created here seemingly are the same:

data = np.ma.masked_all((3,), dtype=int)
data[0] = 0
data[1] = 1
data[2] = 2
index = ["a", "b", "c"]
result = Series(data, index=index)
expected = Series([0, 1, 2], index=index, dtype=int)
In [29]: result
Out[29]: 
a    0
b    1
c    2
dtype: int64

In [30]: expected
Out[30]: 
a    0
b    1
c    2
dtype: int64

In [31]: result.dtype == expected.dtype
Out[31]: True

But apparently there is a bug in the Series constructor, which preserves the masked array as the underlying values if it has no masked elements:

In [32]: result.values
Out[32]: 
masked_array(data=[0, 1, 2],
             mask=[False, False, False],
       fill_value=999999)

In [33]: expected.values
Out[33]: array([0, 1, 2])

That seems like a separate, actual bug that we should fix regardless of the behaviour of check_dtype in assert_series_equal (although being stricter here is what actually uncovered this bug).


data = ma.masked_all((3,), dtype=bool)
result = Series(data)
Expand All @@ -589,7 +592,10 @@ def test_constructor_maskedarray(self):
data[1] = True
result = Series(data, index=index)
expected = Series([True, True, False], index=index, dtype=bool)
tm.assert_series_equal(result, expected)
with pytest.raises(AssertionError, match="Series classes are different"):
# TODO should this be raising at all?
# https://github.com/pandas-dev/pandas/issues/56131
tm.assert_series_equal(result, expected)

data = ma.masked_all((3,), dtype="M8[ns]")
result = Series(data)
Expand Down
8 changes: 7 additions & 1 deletion pandas/tests/tools/test_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -2116,7 +2116,13 @@ def test_float_to_datetime_raise_near_bounds(self):
expected = (should_succeed * oneday_in_ns).astype(np.int64)
for error_mode in ["raise", "coerce", "ignore"]:
result1 = to_datetime(should_succeed, unit="D", errors=error_mode)
tm.assert_almost_equal(result1.astype(np.int64), expected, rtol=1e-10)
# Cast to `np.float64` so that `rtol` and inexact checking kick in
# (integer dtypes are always compared exactly, so `rtol` would otherwise be ignored)
tm.assert_almost_equal(
result1.astype(np.int64).astype(np.float64),
expected.astype(np.float64),
rtol=1e-10,
)
# just out of bounds
should_fail1 = Series([0, tsmax_in_days + 0.005], dtype=float)
should_fail2 = Series([0, -tsmax_in_days - 0.005], dtype=float)
Expand Down
18 changes: 14 additions & 4 deletions pandas/tests/util/test_assert_frame_equal.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,10 @@ def test_assert_frame_equal_extension_dtype_mismatch():
"\\[right\\]: int[32|64]"
)

tm.assert_frame_equal(left, right, check_dtype=False)
# TODO: this shouldn't raise (or should raise a better error message)
# https://github.com/pandas-dev/pandas/issues/56131
with pytest.raises(AssertionError, match="classes are different"):
tm.assert_frame_equal(left, right, check_dtype=False)

with pytest.raises(AssertionError, match=msg):
tm.assert_frame_equal(left, right, check_dtype=True)
Expand All @@ -228,11 +231,18 @@ def test_assert_frame_equal_interval_dtype_mismatch():
tm.assert_frame_equal(left, right, check_dtype=True)


@pytest.mark.parametrize("right_dtype", ["Int32", "int64"])
def test_assert_frame_equal_ignore_extension_dtype_mismatch(right_dtype):
def test_assert_frame_equal_ignore_extension_dtype_mismatch():
# https://github.com/pandas-dev/pandas/issues/35715
left = DataFrame({"a": [1, 2, 3]}, dtype="Int64")
right = DataFrame({"a": [1, 2, 3]}, dtype="Int32")
tm.assert_frame_equal(left, right, check_dtype=False)


@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/56131")
def test_assert_frame_equal_ignore_extension_dtype_mismatch_cross_class():
# https://github.com/pandas-dev/pandas/issues/35715
left = DataFrame({"a": [1, 2, 3]}, dtype="Int64")
right = DataFrame({"a": [1, 2, 3]}, dtype=right_dtype)
right = DataFrame({"a": [1, 2, 3]}, dtype="int64")
tm.assert_frame_equal(left, right, check_dtype=False)


Expand Down
27 changes: 23 additions & 4 deletions pandas/tests/util/test_assert_series_equal.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,10 @@ def test_assert_series_equal_extension_dtype_mismatch():
\\[left\\]: Int64
\\[right\\]: int[32|64]"""

tm.assert_series_equal(left, right, check_dtype=False)
# TODO: this shouldn't raise (or should raise a better error message)
# https://github.com/pandas-dev/pandas/issues/56131
with pytest.raises(AssertionError, match="Series classes are different"):
tm.assert_series_equal(left, right, check_dtype=False)

with pytest.raises(AssertionError, match=msg):
tm.assert_series_equal(left, right, check_dtype=True)
Expand Down Expand Up @@ -348,11 +351,18 @@ def test_series_equal_exact_for_nonnumeric():
tm.assert_series_equal(s3, s1, check_exact=True)


@pytest.mark.parametrize("right_dtype", ["Int32", "int64"])
def test_assert_series_equal_ignore_extension_dtype_mismatch(right_dtype):
def test_assert_series_equal_ignore_extension_dtype_mismatch():
# https://github.com/pandas-dev/pandas/issues/35715
left = Series([1, 2, 3], dtype="Int64")
right = Series([1, 2, 3], dtype=right_dtype)
right = Series([1, 2, 3], dtype="Int32")
tm.assert_series_equal(left, right, check_dtype=False)


@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/56131")
def test_assert_series_equal_ignore_extension_dtype_mismatch_cross_class():
# https://github.com/pandas-dev/pandas/issues/35715
left = Series([1, 2, 3], dtype="Int64")
right = Series([1, 2, 3], dtype="int64")
tm.assert_series_equal(left, right, check_dtype=False)


Expand Down Expand Up @@ -423,3 +433,12 @@ def test_check_dtype_false_different_reso(dtype):

with pytest.raises(AssertionError, match="Series are different"):
tm.assert_series_equal(ser_s, ser_ms, check_dtype=False)


@pytest.mark.parametrize("dtype", ["Int64", "int64"])
def test_large_unequal_ints(dtype):
# https://github.com/pandas-dev/pandas/issues/55882
left = Series([1577840521123000], dtype=dtype)
right = Series([1577840521123543], dtype=dtype)
with pytest.raises(AssertionError, match="Series are different"):
tm.assert_series_equal(left, right)
Loading