From bc2247c36b643fe61ac93b5341e8e37fda8150dd Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 31 Oct 2023 14:30:58 -0700 Subject: [PATCH] BUG: to_datetime with mixed-string-and-numeric --- doc/source/whatsnew/v2.2.0.rst | 3 +++ pandas/_libs/tslib.pyx | 12 +++++------ pandas/core/arrays/datetimelike.py | 4 +++- pandas/tests/dtypes/test_missing.py | 20 +++++++++++++------ pandas/tests/frame/test_constructors.py | 4 ++-- .../indexes/datetimes/test_constructors.py | 15 ++++++++++++-- pandas/tests/tools/test_to_datetime.py | 14 +++++++++++++ 7 files changed, 55 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 99b6310a80f83..5fdb937c49548 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -332,8 +332,11 @@ Datetimelike - Bug in addition or subtraction of very large :class:`Tick` objects with :class:`Timestamp` or :class:`Timedelta` objects raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond :class:`DatetimeTZDtype` and inputs that would be out of bounds with nanosecond resolution incorrectly raising ``OutOfBoundsDatetime`` (:issue:`54620`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`) +- Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing a list of mixed-string-and-numeric types incorrectly raising (:issue:`??`) +- Bug in :class:`DatetimeIndex` when passing an object-dtype ndarray of float objects and a ``tz`` incorrectly localizing the result (:issue:`??`) - + Timedelta ^^^^^^^^^ - Bug in :class:`Timedelta` construction raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 94a984c9db594..3c694ab26d912 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -700,15 +700,15 @@ def array_to_datetime_with_tz(ndarray values, tzinfo tz, NPY_DATETIMEUNIT creso) ival = NPY_NAT else: - ts = Timestamp(item) + if PyDateTime_Check(item) and item.tzinfo is not None: + # We can't call Timestamp constructor with a tz arg, have to + # do 2-step + ts = Timestamp(item).tz_convert(tz) + else: + ts = Timestamp(item, tz=tz) if ts is NaT: ival = NPY_NAT else: - if ts.tzinfo is not None: - ts = ts.tz_convert(tz) - else: - # datetime64, tznaive pydatetime, int, float - ts = ts.tz_localize(tz) ts = (<_Timestamp>ts)._as_creso(creso) ival = ts._value diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 8091543df8e79..33b2f65340a3b 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -81,6 +81,7 @@ ) from pandas.util._exceptions import find_stack_level +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( is_all_strings, is_integer_dtype, @@ -2358,7 +2359,8 @@ def ensure_arraylike_for_datetimelike(data, copy: bool, cls_name: str): if not isinstance(data, (list, tuple)) and np.ndim(data) == 0: # i.e. generator data = list(data) - data = np.asarray(data) + + data = construct_1d_object_array_from_listlike(data) copy = False elif isinstance(data, ABCMultiIndex): raise TypeError(f"Cannot create a {cls_name} from a MultiIndex.") diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 451ac2afd1d91..b995dc591c749 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -418,12 +418,10 @@ def test_array_equivalent(dtype_equal): assert not array_equivalent( Index([0, np.nan]), Index([1, np.nan]), dtype_equal=dtype_equal ) - assert array_equivalent( - DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan]), dtype_equal=dtype_equal - ) - assert not array_equivalent( - DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan]), dtype_equal=dtype_equal - ) + + +@pytest.mark.parametrize("dtype_equal", [True, False]) +def test_array_equivalent_tdi(dtype_equal): assert array_equivalent( TimedeltaIndex([0, np.nan]), TimedeltaIndex([0, np.nan]), @@ -435,6 +433,16 @@ def test_array_equivalent(dtype_equal): dtype_equal=dtype_equal, ) + +@pytest.mark.parametrize("dtype_equal", [True, False]) +def test_array_equivalent_dti(dtype_equal): + assert array_equivalent( + DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan]), dtype_equal=dtype_equal + ) + assert not array_equivalent( + DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan]), dtype_equal=dtype_equal + ) + dti1 = DatetimeIndex([0, np.nan], tz="US/Eastern") dti2 = DatetimeIndex([0, np.nan], tz="CET") dti3 = DatetimeIndex([1, np.nan], tz="US/Eastern") diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 5530fa336971c..ff4fb85fa615a 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -3154,9 +3154,9 @@ def test_from_scalar_datetimelike_mismatched(self, constructor, cls): dtype = {np.datetime64: "m8[ns]", np.timedelta64: "M8[ns]"}[cls] if cls is np.datetime64: - msg1 = r"dtype datetime64\[ns\] cannot be converted to timedelta64\[ns\]" + msg1 = "Invalid type for timedelta scalar: " else: - msg1 = r"dtype timedelta64\[ns\] cannot be converted to datetime64\[ns\]" + msg1 = " is not convertible to datetime" msg = "|".join(["Cannot cast", msg1]) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 90ddc9b5f618a..6c23382fa4866 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -1024,8 +1024,11 @@ def test_dti_constructor_with_non_nano_dtype(self, tz): # to 2 microseconds vals = [ts, "2999-01-02 03:04:05.678910", 2500] result = DatetimeIndex(vals, dtype=dtype) - exp_arr = np.array([ts.asm8, vals[1], 2], dtype="M8[us]") - expected = DatetimeIndex(exp_arr, dtype="M8[us]").tz_localize(tz) + exp_vals = [Timestamp(x, tz=tz).as_unit("us").asm8 for x in vals] + exp_arr = np.array(exp_vals, dtype="M8[us]") + expected = DatetimeIndex(exp_arr, dtype="M8[us]") + if tz is not None: + expected = expected.tz_localize("UTC").tz_convert(tz) tm.assert_index_equal(result, expected) result2 = DatetimeIndex(np.array(vals, dtype=object), dtype=dtype) @@ -1050,6 +1053,14 @@ def test_dti_constructor_with_non_nano_now_today(self): assert diff1 >= pd.Timedelta(0) assert diff1 < tolerance + def test_dti_constructor_object_float_matches_float_dtype(self): + arr = np.array([0, np.nan], dtype=np.float64) + arr2 = arr.astype(object) + + dti1 = DatetimeIndex(arr, tz="CET") + dti2 = DatetimeIndex(arr2, tz="CET") + tm.assert_index_equal(dti1, dti2) + class TestTimeSeries: def test_dti_constructor_preserve_dti_freq(self): diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index ac58d312619fe..295fb6ea05ab6 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -603,6 +603,20 @@ def test_to_datetime_mixed_datetime_and_string(self): expected = to_datetime([d1, d2]).tz_convert(timezone(timedelta(minutes=-60))) tm.assert_index_equal(res, expected) + def test_to_datetime_mixed_string_and_numeric(self): + # np.array(vals) would incorrectly cast the number to str + vals = ["2016-01-01", 0] + expected = DatetimeIndex([Timestamp(x) for x in vals]) + result = to_datetime(vals, format="mixed") + result2 = to_datetime(vals[::-1], format="mixed")[::-1] + result3 = DatetimeIndex(vals) + result4 = DatetimeIndex(vals[::-1])[::-1] + + tm.assert_index_equal(result, expected) + tm.assert_index_equal(result2, expected) + tm.assert_index_equal(result3, expected) + tm.assert_index_equal(result4, expected) + @pytest.mark.parametrize( "format", ["%Y-%m-%d", "%Y-%d-%m"], ids=["ISO8601", "non-ISO8601"] )